diff --git a/CMakeLists.txt b/CMakeLists.txt
index e037af3bc..eaee5d5e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 project(Eigen)

-cmake_minimum_required(VERSION 2.8.4)
+cmake_minimum_required(VERSION 2.8.5)

 # guard against in-source builds

@@ -55,6 +55,7 @@ endif(EIGEN_HG_CHANGESET)

 include(CheckCXXCompilerFlag)
+include(GNUInstallDirs)

 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

@@ -118,11 +119,7 @@ endmacro(ei_add_cxx_compiler_flag)
 if(NOT MSVC)
   # We assume that other compilers are partly compatible with GNUCC

-#   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
-  set(CMAKE_CXX_FLAGS_DEBUG "-g3")
-  set(CMAKE_CXX_FLAGS_RELEASE "-g0 -O2")
-
+
   # clang outputs some warnings for unknwon flags that are not caught by check_cxx_compiler_flag
   # adding -Werror turns such warnings into errors
   check_cxx_compiler_flag("-Werror" COMPILER_SUPPORT_WERROR)
@@ -341,24 +338,29 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

-# the user modifiable install path for header files
-set(EIGEN_INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} CACHE PATH "The directory where we install the header files (optional)")
-
-# set the internal install path for header files which depends on wether the user modifiable
-# EIGEN_INCLUDE_INSTALL_DIR has been set by the user or not.
+# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
-  set(INCLUDE_INSTALL_DIR
-    ${EIGEN_INCLUDE_INSTALL_DIR}
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+  message(WARNING "EIGEN_INCLUDE_INSTALL_DIR is deprecated. Use INCLUDE_INSTALL_DIR instead.")
+endif()
+
+if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
+  set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
   set(INCLUDE_INSTALL_DIR
-    "${CMAKE_INSTALL_PREFIX}/include/eigen3"
-    CACHE INTERNAL
-    "The directory where we install the header files (internal)"
-  )
+      "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
+      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      )
 endif()
+set(CMAKEPACKAGE_INSTALL_DIR
+    "${CMAKE_INSTALL_LIBDIR}/cmake/eigen3"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    )
+set(PKGCONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/pkgconfig"
+    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    )

 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)
@@ -377,21 +379,9 @@ install(FILES
   )

 if(EIGEN_BUILD_PKGCONFIG)
-    SET(path_separator ":")
-    STRING(REPLACE ${path_separator} ";" pkg_config_libdir_search "$ENV{PKG_CONFIG_LIBDIR}")
-    message(STATUS "searching for 'pkgconfig' directory in PKG_CONFIG_LIBDIR ( $ENV{PKG_CONFIG_LIBDIR} ), ${CMAKE_INSTALL_PREFIX}/share, and ${CMAKE_INSTALL_PREFIX}/lib")
-    FIND_PATH(pkg_config_libdir pkgconfig ${pkg_config_libdir_search} ${CMAKE_INSTALL_PREFIX}/share ${CMAKE_INSTALL_PREFIX}/lib ${pkg_config_libdir_search})
-    if(pkg_config_libdir)
-        SET(pkg_config_install_dir ${pkg_config_libdir})
-        message(STATUS "found ${pkg_config_libdir}/pkgconfig" )
-    else(pkg_config_libdir)
-        SET(pkg_config_install_dir ${CMAKE_INSTALL_PREFIX}/share)
-        message(STATUS "pkgconfig not found; installing in ${pkg_config_install_dir}" )
-    endif(pkg_config_libdir)
-
-    configure_file(eigen3.pc.in eigen3.pc)
+    configure_file(eigen3.pc.in eigen3.pc @ONLY)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/eigen3.pc
-            DESTINATION ${pkg_config_install_dir}/pkgconfig
+            DESTINATION ${PKGCONFIG_INSTALL_DIR}
             )
 endif(EIGEN_BUILD_PKGCONFIG)

@@ -454,12 +444,15 @@ if(cmake_generator_tolower MATCHES "makefile")
   message(STATUS "--------------+--------------------------------------------------------------")
   message(STATUS "Command       |   Description")
   message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS "make install  | Install to ${CMAKE_INSTALL_PREFIX}. To change that:")
-  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourpath")
-  message(STATUS "              |   Eigen headers will then be installed to:")
-  message(STATUS "              |     ${INCLUDE_INSTALL_DIR}")
-  message(STATUS "              |   To install Eigen headers to a separate location, do:")
-  message(STATUS "              |     cmake . -DEIGEN_INCLUDE_INSTALL_DIR=yourpath")
+  message(STATUS "make install  | Install Eigen. Headers will be installed to:")
+  message(STATUS "              |     <CMAKE_INSTALL_PREFIX>/<INCLUDE_INSTALL_DIR>")
+  message(STATUS "              |   Using the following values:")
+  message(STATUS "              |     CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "              |     INCLUDE_INSTALL_DIR:  ${INCLUDE_INSTALL_DIR}")
+  message(STATUS "              |   Change the install location of Eigen headers using:")
+  message(STATUS "              |     cmake . -DCMAKE_INSTALL_PREFIX=yourprefix")
+  message(STATUS "              |   Or:")
+  message(STATUS "              |     cmake . -DINCLUDE_INSTALL_DIR=yourdir")
   message(STATUS "make doc     | Generate the API documentation, requires Doxygen & LaTeX")
   message(STATUS "make check   | Build and run the unit-tests. Read this page:")
   message(STATUS "             |   http://eigen.tuxfamily.org/index.php?title=Tests")
@@ -473,21 +466,13 @@ endif()

 message(STATUS "")

-set ( EIGEN_CONFIG_CMAKE_PATH
-      lib${LIB_SUFFIX}/cmake/eigen3
-      CACHE PATH "The directory where the CMake files are installed"
-    )
-if ( NOT IS_ABSOLUTE EIGEN_CONFIG_CMAKE_PATH )
-  set ( EIGEN_CONFIG_CMAKE_PATH ${CMAKE_INSTALL_PREFIX}/${EIGEN_CONFIG_CMAKE_PATH} )
-endif ()
-set ( EIGEN_USE_FILE ${EIGEN_CONFIG_CMAKE_PATH}/UseEigen3.cmake )
 set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} )
 set ( EIGEN_VERSION_MAJOR  ${EIGEN_WORLD_VERSION} )
 set ( EIGEN_VERSION_MINOR  ${EIGEN_MAJOR_VERSION} )
 set ( EIGEN_VERSION_PATCH  ${EIGEN_MINOR_VERSION} )
 set ( EIGEN_DEFINITIONS "")
-set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
+set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" )
 set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )

@@ -498,7 +483,7 @@ configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
 install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
                 ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
-          DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
+          DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}
           )

 # Add uninstall target
diff --git a/Eigen/Cholesky b/Eigen/Cholesky
index dd0ca911c..705a04cc4 100644
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H

diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport
index 687cd9777..83e2c1da4 100644
--- a/Eigen/CholmodSupport
+++ b/Eigen/CholmodSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H

diff --git a/Eigen/Core b/Eigen/Core
index 713d18a6d..63602f4c3 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -300,6 +300,7 @@ using std::ptrdiff_t;

 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
+#include "src/Core/SpecialFunctions.h"
 #include "src/Core/GenericPacketMath.h"

 #if defined EIGEN_VECTORIZE_AVX
@@ -382,8 +383,6 @@ using std::ptrdiff_t;
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
@@ -393,6 +392,9 @@ using std::ptrdiff_t;
 #include "src/Core/GeneralProduct.h"
 #include "src/Core/Solve.h"
 #include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues
index 53c5a73a2..ea93eb303 100644
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H

diff --git a/Eigen/Geometry b/Eigen/Geometry
index 11aea8025..06b736e3f 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H

diff --git a/Eigen/Householder b/Eigen/Householder
index 6e348db5c..89cd81b1a 100644
--- a/Eigen/Householder
+++ b/Eigen/Householder
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H

diff --git a/Eigen/IterativeLinearSolvers b/Eigen/IterativeLinearSolvers
index f5fdcd9e5..957d5750b 100644
--- a/Eigen/IterativeLinearSolvers
+++ b/Eigen/IterativeLinearSolvers
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 #define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H

@@ -34,6 +41,7 @@
 #include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"

 #include "src/Core/util/ReenableStupidWarnings.h"
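Not part of the patch: the new IncompleteCholesky include makes that preconditioner available directly from Eigen/IterativeLinearSolvers. A minimal sketch of how a user might pair it with ConjugateGradient against the 3.3-era API; the tridiagonal test matrix and sizes are illustrative only:

```cpp
#include <Eigen/SparseCore>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>
#include <vector>

int main()
{
  using SpMat = Eigen::SparseMatrix<double>;
  const int n = 100;

  // Assemble a small SPD tridiagonal matrix from triplets.
  std::vector<Eigen::Triplet<double>> coeffs;
  for (int i = 0; i < n; ++i) {
    coeffs.emplace_back(i, i, 2.0);
    if (i > 0)     coeffs.emplace_back(i, i - 1, -1.0);
    if (i < n - 1) coeffs.emplace_back(i, i + 1, -1.0);
  }
  SpMat A(n, n);
  A.setFromTriplets(coeffs.begin(), coeffs.end());
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  // Conjugate gradient preconditioned with the incomplete Cholesky factorization.
  Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper,
                           Eigen::IncompleteCholesky<double>> cg;
  cg.compute(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations()
            << ", estimated error: " << cg.error() << "\n";
}
```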
diff --git a/Eigen/Jacobi b/Eigen/Jacobi
index ba8a4dc36..17c1d785a 100644
--- a/Eigen/Jacobi
+++ b/Eigen/Jacobi
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H

diff --git a/Eigen/LU b/Eigen/LU
index 132ecc42c..2d70c92de 100644
--- a/Eigen/LU
+++ b/Eigen/LU
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H

diff --git a/Eigen/MetisSupport b/Eigen/MetisSupport
index 6a113f7a8..85c41bf34 100644
--- a/Eigen/MetisSupport
+++ b/Eigen/MetisSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_METISSUPPORT_MODULE_H
 #define EIGEN_METISSUPPORT_MODULE_H

diff --git a/Eigen/OrderingMethods b/Eigen/OrderingMethods
index 7c0f1ffff..d8ea36193 100644
--- a/Eigen/OrderingMethods
+++ b/Eigen/OrderingMethods
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H

diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport
index e7d275f97..3411dface 100644
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H

diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport
old mode 100644
new mode 100755
index 99330ce7a..340edf51f
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H

@@ -7,8 +14,6 @@

 #include

-#include
-
 /** \ingroup Support_modules
   * \defgroup PardisoSupport_Module PardisoSupport module
   *
diff --git a/Eigen/QR b/Eigen/QR
index 230cb079a..f74f365f1 100644
--- a/Eigen/QR
+++ b/Eigen/QR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H

diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc
index 46f7d83b7..4044d5ac5 100644
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@@ -1,3 +1,9 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H

diff --git a/Eigen/SPQRSupport b/Eigen/SPQRSupport
index e3f49bb5a..f9489dcd8 100644
--- a/Eigen/SPQRSupport
+++ b/Eigen/SPQRSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H

diff --git a/Eigen/SVD b/Eigen/SVD
index dbd37b17a..b353f3f54 100644
--- a/Eigen/SVD
+++ b/Eigen/SVD
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SVD_MODULE_H
 #define EIGEN_SVD_MODULE_H

diff --git a/Eigen/Sparse b/Eigen/Sparse
index a540f0eec..a2ef7a665 100644
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSE_MODULE_H
 #define EIGEN_SPARSE_MODULE_H

diff --git a/Eigen/SparseCore b/Eigen/SparseCore
index 48ed967b8..76966c4c4 100644
--- a/Eigen/SparseCore
+++ b/Eigen/SparseCore
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H

@@ -14,7 +21,7 @@
 /**
   * \defgroup SparseCore_Module SparseCore module
   *
- * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+ * This module provides a sparse matrix representation, and basic associated matrix manipulations
  * and operations.
  *
 * See the \ref TutorialSparse "Sparse tutorial"
diff --git a/Eigen/SparseQR b/Eigen/SparseQR
index efb2695ba..a6f3b7f7d 100644
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H

diff --git a/Eigen/SuperLUSupport b/Eigen/SuperLUSupport
index d1eac9464..113f58ee5 100644
--- a/Eigen/SuperLUSupport
+++ b/Eigen/SuperLUSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H

@@ -36,6 +43,8 @@ namespace Eigen { struct SluMatrix; }
  * - class SuperLU: a supernodal sequential LU factorization.
  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
  *
+ * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+ *
  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
  *
  * \code
diff --git a/Eigen/UmfPackSupport b/Eigen/UmfPackSupport
index 0efad5dee..4a9f46a1e 100644
--- a/Eigen/UmfPackSupport
+++ b/Eigen/UmfPackSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H

diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index dc73304e8..1f0091f3c 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@@ -285,7 +285,7 @@ template struct llt_inplace
         return k;
       mat.coeffRef(k,k) = x = sqrt(x);
       if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
     }
     return -1;
   }
diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h
index d2b0fb282..06421d5ed 100644
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -78,7 +78,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_StorageIndex,UF_long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -170,6 +170,10 @@ class CholmodBase : public SparseSolverBase
     typedef typename MatrixType::RealScalar RealScalar;
     typedef MatrixType CholMatrixType;
     typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };

   public:

@@ -350,6 +354,8 @@ class CholmodBase : public SparseSolverBase
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+ * \implsparsesolverconcept
+ *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
@@ -397,6 +403,8 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+ * \implsparsesolverconcept
+ *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
@@ -442,6 +450,8 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+ * \implsparsesolverconcept
+ *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
  * \sa \ref TutorialSparseDirectSolvers
@@ -489,6 +499,8 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
  *               or Upper. Default is Lower.
  *
+ * \implsparsesolverconcept
+ *
  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
  *
  * \sa \ref TutorialSparseDirectSolvers
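Not part of the patch: the \implsparsesolverconcept tag documents that these Cholmod wrappers follow Eigen's common sparse-solver API (compute()/solve()/info()), so they are drop-in replacements for one another. A minimal sketch of that surface, assuming SuiteSparse/CHOLMOD is installed and linked; the helper name solveSpd is ours:

```cpp
#include <Eigen/SparseCore>
#include <Eigen/CholmodSupport>
#include <stdexcept>

// Solve A x = b with CHOLMOD's supernodal Cholesky factorization.
// Any class tagged \implsparsesolverconcept exposes this same interface,
// so CholmodSimplicialLDLT, SimplicialLLT, etc. could be swapped in.
Eigen::VectorXd solveSpd(const Eigen::SparseMatrix<double>& A,
                         const Eigen::VectorXd& b)
{
  Eigen::CholmodSupernodalLLT<Eigen::SparseMatrix<double>> solver;
  solver.compute(A);                       // analyze pattern + factorize
  if (solver.info() != Eigen::Success)
    throw std::runtime_error("factorization failed");
  return solver.solve(b);                  // forward/backward substitution
}
```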
diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h
index 151c05526..b4c24a27a 100644
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@@ -46,15 +46,14 @@ template class ArrayBase

     typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;

-    using internal::special_scalar_op_base::Scalar,
-                  typename NumTraits::Scalar>::Real>::operator*;
-
     typedef typename internal::traits::StorageKind StorageKind;
     typedef typename internal::traits::Scalar Scalar;
     typedef typename internal::packet_traits::type PacketScalar;
     typedef typename NumTraits::Real RealScalar;

     typedef DenseBase Base;
+    using Base::operator*;
+    using Base::operator/;
     using Base::RowsAtCompileTime;
     using Base::ColsAtCompileTime;
     using Base::SizeAtCompileTime;
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
old mode 100644
new mode 100755
index c4ba60d6d..9dfffbcc4
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -54,6 +54,7 @@ private:
     InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
               : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
               : int(Dst::MaxRowsAtCompileTime),
+    OuterStride = int(outer_stride_at_compile_time::ret),
     MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
     PacketSize = unpacket_traits::size
   };
@@ -65,7 +66,9 @@ private:
     MightVectorize = StorageOrdersAgree
                   && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
                   && (functor_traits::PacketAccess),
-    MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+    MayInnerVectorize = MightVectorize
+                     && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                     && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0
                      && int(JointAlignment)>=int(RequiredAlignment),
     MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
     MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
@@ -95,10 +98,8 @@ private:
   enum {
     UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
     MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
                        && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
     MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(SrcEvaluator::CoeffReadCost) != Dynamic
                        && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
   };
@@ -125,8 +126,8 @@ public:
     std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
     std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(DstFlags)
-    EIGEN_DEBUG_VAR(SrcFlags)
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(DstAlignment)
     EIGEN_DEBUG_VAR(SrcAlignment)
@@ -141,11 +142,11 @@ public:
     EIGEN_DEBUG_VAR(MayInnerVectorize)
     EIGEN_DEBUG_VAR(MayLinearVectorize)
     EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
     std::cerr << std::endl;
   }
#endif
@@ -288,7 +289,7 @@ struct dense_assignment_loop;
template
struct dense_assignment_loop
{
-  EIGEN_DEVICE_FUNC static void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel)
  {
    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
@@ -311,7 +312,6 @@ struct dense_assignment_loop
template
struct dense_assignment_loop
{
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -392,7 +392,6 @@ struct dense_assignment_loop
template
struct dense_assignment_loop
{
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -414,7 +413,7 @@
template
struct dense_assignment_loop
{
  typedef typename Kernel::PacketType PacketType;
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
@@ -438,7 +437,6 @@ struct dense_assignment_loop
struct dense_assignment_loop
{
-  typedef typename Kernel::StorageIndex StorageIndex;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
@@ -455,7 +453,7 @@ struct dense_assignment_loop
template
struct dense_assignment_loop
{
-  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
    for(Index i = 0; i < size; ++i)
@@ -545,7 +543,6 @@ public:
  typedef DstEvaluatorTypeT DstEvaluatorType;
  typedef SrcEvaluatorTypeT SrcEvaluatorType;
  typedef typename DstEvaluatorType::Scalar Scalar;
-  typedef typename DstEvaluatorType::StorageIndex StorageIndex;
  typedef copy_using_evaluator_traits AssignmentTraits;
  typedef typename AssignmentTraits::PacketType PacketType;
@@ -565,26 +562,23 @@ public:
  EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); }
  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }

-  // TODO get rid of this one:
-  EIGEN_DEVICE_FUNC DstXprType& dstExpression() const { return m_dstExpr; }
-
  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }

  /// Assign src(row,col) to dst(row,col) through the assignment functor.
-  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
  {
    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
  }

  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeff(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
  {
    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
  }

  /// \sa assignCoeff(Index,Index)
-  EIGEN_DEVICE_FUNC void assignCoeffByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
@@ -593,26 +587,26 @@ public:

  template
-  EIGEN_DEVICE_FUNC void assignPacket(Index row, Index col)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
  {
    m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col));
  }

  template
-  EIGEN_DEVICE_FUNC void assignPacket(Index index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
  {
    m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index));
  }

  template
-  EIGEN_DEVICE_FUNC void assignPacketByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
    assignPacket(row, col);
  }

-  EIGEN_DEVICE_FUNC static Index rowIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::RowsAtCompileTime) == 1 ? 0
@@ -621,7 +615,7 @@ public:
           : inner;
  }

-  EIGEN_DEVICE_FUNC static Index colIndexByOuterInner(Index outer, Index inner)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::ColsAtCompileTime) == 1 ? 0
@@ -719,14 +713,8 @@ EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func)
}

// by-pass AssumeAliasing
-// FIXME the const version should probably not be needed
// When there is no aliasing, we require that 'dst' has been properly resized
template class StorageBase, typename Src, typename Func>
-EIGEN_DEVICE_FUNC void call_assignment(const NoAlias& dst, const Src& src, const Func& func)
-{
-  call_assignment_no_alias(dst.expression(), src, func);
-}
-template class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC void call_assignment(NoAlias& dst, const Src& src, const Func& func)
{
  call_assignment_no_alias(dst.expression(), src, func);
@@ -737,11 +725,9 @@ template
EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
{
  enum {
-    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
-                        |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                           // revert to || as soon as not needed anymore.
-                           (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1))
-                      && int(Dst::SizeAtCompileTime) != 1
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
  };

  Index dstRows = NeedToTranspose ? src.cols() : src.rows();
@@ -756,11 +742,7 @@ EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
-
-  // TODO this line is commented to allow matrix = permutation
-  // Actually, the "Scalar" type for a permutation matrix does not really make sense,
-  // perhaps it could be void, and EIGEN_CHECK_BINARY_COMPATIBILIY could allow micing void with anything...?
-//  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);

  Assignment::run(actualDst, src, func);
}
diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h
index ba45cf5c3..8409d8749 100644
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@@ -83,8 +83,6 @@ inline bool DenseBase::all() const
  typedef internal::evaluator Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
-          && NumTraits::AddCost != Dynamic
          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
@@ -109,8 +107,6 @@ inline bool DenseBase::any() const
  typedef internal::evaluator Evaluator;
  enum {
    unroll = SizeAtCompileTime != Dynamic
-          && Evaluator::CoeffReadCost != Dynamic
-          && NumTraits::AddCost != Dynamic
          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT
  };
  Evaluator evaluator(derived());
@@ -142,7 +138,11 @@ inline Eigen::Index DenseBase::count() const
template
inline bool DenseBase::hasNaN() const
{
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
  return !((derived().array()==derived().array()).all());
+#endif
}

/** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
@@ -152,7 +152,11 @@ inline bool DenseBase::hasNaN() const
template
inline bool DenseBase::allFinite() const
{
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
  return !((derived()-derived()).hasNaN());
+#endif
}

} // end namespace Eigen
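Not part of the patch: the BooleanRedux hunks switch hasNaN()/allFinite() to the isNaN()/isFinite() array predicates on MSVC and under fast-math, where the classic x == x trick can be constant-folded away. A small sketch of the user-facing behavior:

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main()
{
  Eigen::Matrix2d m;
  m << 1.0, std::numeric_limits<double>::quiet_NaN(),
       3.0, std::numeric_limits<double>::infinity();

  std::cout << std::boolalpha;
  std::cout << m.hasNaN() << "\n";                    // true: one coefficient is NaN
  std::cout << m.allFinite() << "\n";                 // false: NaN and Inf present
  std::cout << m.array().isFinite().count() << "\n";  // 2 finite coefficients
}
```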
diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h
index dc772277d..89bcd750c 100644
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@@ -106,7 +106,7 @@ struct CommaInitializer
  EIGEN_DEVICE_FUNC
  inline ~CommaInitializer()
#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
-  throw(Eigen::eigen_assert_exception)
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
#endif
  {
    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 2cbb6cd44..f97dc33de 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -29,6 +29,7 @@ struct storage_kind_to_evaluator_kind {
template struct storage_kind_to_shape;

template<> struct storage_kind_to_shape { typedef DenseShape Shape; };
+template<> struct storage_kind_to_shape { typedef SolverShape Shape; };
template<> struct storage_kind_to_shape { typedef PermutationShape Shape; };
template<> struct storage_kind_to_shape { typedef TranspositionsShape Shape; };
@@ -98,9 +99,6 @@ struct evaluator
template
struct evaluator_base : public noncopyable
{
-  // FIXME is it really usefull?
-  typedef typename traits::StorageIndex StorageIndex;
-
  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
  typedef traits ExpressionTraits;
@@ -140,11 +138,15 @@ struct evaluator >
      m_outerStride(IsVectorAtCompileTime ? 0
                                          : int(IsRowMajor) ? ColsAtCompileTime
                                          : RowsAtCompileTime)
-  {}
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
  {
@@ -324,13 +326,15 @@ struct evaluator >
                     & (  HereditaryBits
                       | (functor_has_linear_access::ret ? LinearAccessBit : 0)
                       | (functor_traits::PacketAccess ? PacketAccessBit : 0)))
-          | (functor_traits::IsRepeatable ? 0 : EvalBeforeNestingBit), // FIXME EvalBeforeNestingBit should be needed anymore
-    Alignment = 0 // FIXME alignment should not matter here, perhaps we could set it to AlignMax??
+          | (functor_traits::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
  };

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
    : m_functor(n.functor())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -379,7 +383,10 @@ struct unary_evaluator, IndexBased >
  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
    : m_functor(op.functor()),
      m_argImpl(op.nestedExpression())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -452,7 +459,10 @@ struct binary_evaluator, IndexBased, IndexBase
    : m_functor(xpr.functor()),
      m_lhsImpl(xpr.lhs()),
      m_rhsImpl(xpr.rhs())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -505,7 +515,10 @@ struct unary_evaluator, IndexBased>
  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
    : m_unaryOp(op.functor()),
      m_argImpl(op.nestedExpression())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -562,6 +575,7 @@ struct mapbase_evaluator : evaluator_base
  {
    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator::Flags&PacketAccessBit, internal::inner_stride_at_compile_time::ret==1),
                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@@ -636,17 +650,9 @@ struct evaluator >
    HasNoStride = HasNoInnerStride && HasNoOuterStride,
    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,

-    PacketAlignment = unpacket_traits::alignment,
-
-    KeepsPacketAccess = bool(HasNoInnerStride)
-                     && ( bool(IsDynamicSize)
-                       || HasNoOuterStride
-                       || ( OuterStrideAtCompileTime!=Dynamic
-                         && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime) % PacketAlignment)==0 ) ),
-    Flags0 = evaluator::Flags,
-    Flags1 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags0) : int(Flags0 & ~LinearAccessBit),
-    Flags = KeepsPacketAccess ? int(Flags1) : (int(Flags1) & ~PacketAccessBit),
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int( evaluator::Flags) & (LinearAccessMask&PacketAccessMask),

    Alignment = int(MapOptions)&int(AlignedMask)
  };
@@ -724,7 +730,10 @@ struct evaluator >
    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0)
  };
  typedef block_evaluator block_evaluator_type;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
};

// no direct-access => dispatch to a unary evaluator
@@ -825,14 +834,14 @@ struct block_evaluator(block)
  {
-    // FIXME this should be an internal assertion
+    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned");
  }
};

// -------------------- Select --------------------
-// TODO shall we introduce a ternary_evaluator?
+// NOTE shall we introduce a ternary_evaluator?

// TODO enable vectorization for Select
template
struct evaluator >
{
  typedef Select XprType;
  enum {
    CoeffReadCost = evaluator::CoeffReadCost
-                  + EIGEN_SIZE_MAX(evaluator::CoeffReadCost,
-                                   evaluator::CoeffReadCost),
+                  + EIGEN_PLAIN_ENUM_MAX(evaluator::CoeffReadCost,
+                                         evaluator::CoeffReadCost),

    Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits,
@@ -854,7 +863,9 @@
    : m_conditionImpl(select.conditionMatrix()),
      m_thenImpl(select.thenMatrix()),
      m_elseImpl(select.elseMatrix())
-  { }
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -897,8 +908,8 @@ struct unary_evaluator >
  enum {
    CoeffReadCost = evaluator::CoeffReadCost,
-
-    Flags = (evaluator::Flags & HereditaryBits & ~RowMajorBit) | (traits::Flags & RowMajorBit),
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits::Flags & RowMajorBit),

    Alignment = evaluator::Alignment
  };
@@ -957,7 +968,7 @@
  }

protected:
-  const ArgTypeNested m_arg; // FIXME is it OK to store both the argument and its evaluator?? (we have the same situation in evaluator_product)
+  const ArgTypeNested m_arg;
  evaluator m_argImpl;
  const variable_if_dynamic m_rows;
  const variable_if_dynamic m_cols;
@@ -965,48 +976,57 @@ protected:

// -------------------- PartialReduxExpr --------------------
-//
-// This is a wrapper around the expression object.
-// TODO: Find out how to write a proper evaluator without duplicating
-//       the row() and col() member functions.

template< typename ArgType, typename MemberOp, int Direction>
struct evaluator >
  : evaluator_base >
{
  typedef PartialReduxExpr XprType;
-  typedef typename XprType::Scalar InputScalar;
+  typedef typename internal::nested_eval::type ArgTypeNested;
+  typedef typename internal::remove_all::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
  enum {
-    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(XprType::ColsAtCompileTime)
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
  };
  typedef typename MemberOp::template Cost CostOpType;
  enum {
-    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                  : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value),

-    Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&HereditaryBits),
+    Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))),

-    Alignment = 0 // FIXME this could be improved
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
  };

-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType expr)
-    : m_expr(expr)
-  {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;

-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_expr.coeff(row, col);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(j));
+    else
+      return m_functor(m_arg.row(i));
  }

-  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
-  {
-    return m_expr.coeff(index);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(index));
+    else
+      return m_functor(m_arg.row(index));
  }

protected:
-  const XprType m_expr;
+  const ArgTypeNested m_arg;
+  const MemberOp m_functor;
};
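Not part of the patch: the rewritten PartialReduxExpr evaluator no longer wraps the whole expression; each coefficient is produced on demand by applying the member functor to a single column (or row) of the nested argument. A small sketch of what that evaluates:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A(2, 3);
  A << 1, 2, 3,
       4, 5, 6;

  // colwise().sum() is a PartialReduxExpr. With the evaluator above, the
  // j-th coefficient of the result is computed as sum(A.col(j)) when read.
  Eigen::RowVector3d s = A.colwise().sum();
  std::cout << s << "\n";                 // 5 7 9

  // Same mechanism along the other direction (one functor call per row):
  std::cout << A.rowwise().sum() << "\n"; // 6 and 15
}
```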
@@ -1130,6 +1150,7 @@ struct unary_evaluator >
    // FIXME enable DirectAccess with negative strides?
    Flags0 = evaluator::Flags,
    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                  || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1))
                 ? LinearAccessBit : 0,

    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
@@ -1139,8 +1160,8 @@

  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
    : m_argImpl(reverse.nestedExpression()),
-      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 0),
-      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 0)
+      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
+      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
  { }

  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@@ -1214,8 +1235,9 @@ protected:
  evaluator m_argImpl;

  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
-  const variable_if_dynamic m_rows;
-  const variable_if_dynamic m_cols;
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic m_rows;
+  const variable_if_dynamic m_cols;
@@ -1331,20 +1353,16 @@ struct evaluator >
  typedef evaluator Base;

  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
-    : m_result(xpr.rows(), xpr.cols())
+    : m_result(xpr.arg())
  {
    ::new (static_cast(this)) Base(m_result);
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, xpr.arg());
  }

  // This constructor is used when nesting an EvalTo evaluator in another evaluator
  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
-    : m_result(arg.rows(), arg.cols())
+    : m_result(arg)
  {
    ::new (static_cast(this)) Base(m_result);
-    // TODO we should simply do m_result(xpr.arg());
-    call_dense_assignment_loop(m_result, arg);
  }

protected:
diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h
index 488f15061..e181dafaf 100644
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@@ -40,18 +40,14 @@ static inline void check_DenseIndex_is_signed() {
  */
template class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base::Scalar,
-                                             typename NumTraits::Scalar>::Real>
+  : public internal::special_scalar_op_base::Scalar,
+                                             typename NumTraits::Scalar>::Real,
+                                             DenseCoeffsBase >
#else
  : public DenseCoeffsBase
#endif // not EIGEN_PARSED_BY_DOXYGEN
{
  public:
-    using internal::special_scalar_op_base::Scalar,
-                typename NumTraits::Scalar>::Real>::operator*;
-    using internal::special_scalar_op_base::Scalar,
-                typename NumTraits::Scalar>::Real>::operator/;

    /** Inner iterator type to iterate over the coefficients of a row or column.
      * \sa class InnerIterator
      */
@@ -77,9 +73,10 @@ template class DenseBase
    typedef Scalar value_type;

    typedef typename NumTraits::Real RealScalar;
+    typedef internal::special_scalar_op_base > Base;

-    typedef internal::special_scalar_op_base::Scalar,
-                 typename NumTraits::Scalar>::Real> Base;
+    using Base::operator*;
+    using Base::operator/;
    using Base::derived;
    using Base::const_cast_derived;
    using Base::rows;
diff --git a/Eigen/src/Core/DenseCoeffsBase.h b/Eigen/src/Core/DenseCoeffsBase.h
index 9581757f3..820a90e6f 100644
--- a/Eigen/src/Core/DenseCoeffsBase.h
+++ b/Eigen/src/Core/DenseCoeffsBase.h
@@ -138,6 +138,8 @@ class DenseCoeffsBase : public EigenBase
    EIGEN_STRONG_INLINE CoeffReturnType
    coeff(Index index) const
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator(derived()).coeff(index);
    }
@@ -243,6 +245,8 @@ class DenseCoeffsBase : public EigenBase
    template
    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      typedef typename internal::packet_traits::type DefaultPacketType;
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator(derived()).template packet(index);
@@ -370,6 +374,8 @@ class DenseCoeffsBase : public DenseCoeffsBase
    EIGEN_STRONG_INLINE Scalar&
    coeffRef(Index index)
    {
+      EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
      eigen_internal_assert(index >= 0 && index < size());
      return internal::evaluator(derived()).coeffRef(index);
    }
@@ -617,7 +623,7 @@ static inline Index first_default_aligned(const DenseBase& m)
{
  typedef typename Derived::Scalar Scalar;
  typedef typename packet_traits::type DefaultPacketType;
-  return first_aligned::alignment>(m);
+  return internal::first_aligned::alignment),Derived>(m);
}

template::ret>
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 94b058466..003450f1a 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -178,9 +178,11 @@ struct lpNorm_selector

} // end namespace internal

-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  * of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  * norm, that is the maximum of the absolute values of the coefficients of *this.
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  * of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  * norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * \note For matrices, this function does not compute the operator-norm. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
  *
  * \sa norm()
  */
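Not part of the patch: the amended lpNorm() documentation distinguishes the coefficient-wise norm from operator norms. A short sketch of both, using the partial reductions the new note points to:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector3d v(1.0, -2.0, 3.0);
  std::cout << v.lpNorm<1>() << "\n";               // 6: sum of |coefficients|
  std::cout << v.lpNorm<Eigen::Infinity>() << "\n"; // 3: max |coefficient|

  // For a matrix, lpNorm<> flattens the coefficients into one vector.
  // The 1-norm and infinity-norm *operator* norms come from partial reductions:
  Eigen::Matrix2d A;
  A << 1, -2,
       3,  4;
  double opNorm1   = A.cwiseAbs().colwise().sum().maxCoeff(); // max column sum: 6
  double opNormInf = A.cwiseAbs().rowwise().sum().maxCoeff(); // max row sum: 7
  std::cout << opNorm1 << " " << opNormInf << "\n";
}
```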
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 475d6f4aa..fe8204ac3 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -160,7 +160,7 @@ template<> struct product_type_selector { enum

namespace internal {

template
-struct gemv_dense_sense_selector;
+struct gemv_dense_selector;

} // end namespace internal
@@ -204,19 +204,19 @@ struct gemv_static_vector_if

// The vector is on the left => transposition
template
-struct gemv_dense_sense_selector
+struct gemv_dense_selector
{
  template
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
    Transpose destT(dest);
    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_dense_sense_selector
+    gemv_dense_selector
      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
  }
};

-template<> struct gemv_dense_sense_selector
+template<> struct gemv_dense_selector
{
  template
  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
@@ -292,7 +292,7 @@ template<> struct gemv_dense_sense_selector
  }
};

-template<> struct gemv_dense_sense_selector
+template<> struct gemv_dense_selector
{
  template
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
@@ -345,27 +345,28 @@ template<> struct gemv_dense_sense_selector
  }
};

-template<> struct gemv_dense_sense_selector
+template<> struct gemv_dense_selector
{
  template
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval::type actual_rhs(rhs);
    const Index size = rhs.rows();
    for(Index k=0; k
-template<> struct gemv_dense_sense_selector
+template<> struct gemv_dense_selector
{
  template
  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
  {
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
+    typename nested_eval::type actual_rhs(rhs);
    const Index rows = dest.rows();
    for(Index i=0; i
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }

+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
/***************************************************************************
* The following functions might not have to be overwritten for vectorized types
***************************************************************************/
diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h
index aaa076701..62fec7008 100644
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@@ -49,6 +49,9 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
@@ -64,6 +67,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op)

  template
  inline const Eigen::CwiseUnaryOp, const Derived>
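Not part of the patch: with plgamma/perf/perfc and the matching scalar_*_op declarations above, the special functions become available as coefficient-wise array operations. A sketch assuming the tree as patched here, where src/Core/SpecialFunctions.h ships in Core (in released 3.3 these functions moved to the unsupported SpecialFunctions module):

```cpp
#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::ArrayXd x = Eigen::ArrayXd::LinSpaced(4, 0.5, 2.0);

  // Coefficient-wise special functions added by this patch:
  std::cout << Eigen::lgamma(x) << "\n"; // ln|Gamma(x_i)| per coefficient
  std::cout << Eigen::erf(x)    << "\n";
  std::cout << Eigen::erfc(x)   << "\n";
}
```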
diff --git a/Eigen/src/Core/Inverse.h b/Eigen/src/Core/Inverse.h
index b359e1287..f3ec84990 100644
--- a/Eigen/src/Core/Inverse.h
+++ b/Eigen/src/Core/Inverse.h
@@ -12,8 +12,6 @@

namespace Eigen {

-// TODO move the general declaration in Core, and rename this file DenseInverseImpl.h, or something like this...
-
template class InverseImpl;

namespace internal {
@@ -49,11 +47,13 @@ public:
  typedef typename XprType::PlainObject                       PlainObject;
  typedef typename internal::ref_selector::type      XprTypeNested;
  typedef typename internal::remove_all::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector::type Nested;
+  typedef typename internal::remove_all::type NestedExpression;

  explicit Inverse(const XprType &xpr)
    : m_xpr(xpr)
  {}
-
+
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
@@ -63,25 +63,16 @@ protected:
  XprTypeNested m_xpr;
};

-/** \internal
-  * Specialization of the Inverse expression for dense expressions.
-  * Direct access to the coefficients are discared.
-  * FIXME this intermediate class is probably not needed anymore.
-  */
-template
-class InverseImpl
-  : public MatrixBase >
+// Generic API dispatcher
+template
+class InverseImpl
+  : public internal::generic_xpr_base >::type
{
-  typedef Inverse Derived;
-
public:
-
-  typedef MatrixBase Base;
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-  typedef typename internal::remove_all::type NestedExpression;
-
+  typedef typename internal::generic_xpr_base >::type Base;
+  typedef typename XprType::Scalar Scalar;
private:
-
+
  Scalar coeff(Index row, Index col) const;
  Scalar coeff(Index i) const;
};
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index ae28d4db6..75a80daaa 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -155,6 +155,10 @@ template class MapBase
      checkSanity();
    }

+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
  protected:

    EIGEN_DEVICE_FUNC
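Not part of the patch: EIGEN_MAPBASE_PLUGIN mirrors Eigen's existing plugin macros (e.g. EIGEN_MATRIXBASE_PLUGIN) and splices a user-supplied header into the MapBase class body. A sketch; the file name my_mapbase_addons.h and the rawData() member are hypothetical:

```cpp
// --- my_mapbase_addons.h (hypothetical plugin file) ---------------------
// This text is pasted verbatim inside MapBase, so it may use its typedefs,
// e.g. Scalar, and its members, e.g. data().
// inline const Scalar* rawData() const { return this->data(); }

// --- main.cpp ------------------------------------------------------------
#define EIGEN_MAPBASE_PLUGIN "my_mapbase_addons.h"
#include <Eigen/Dense>
#include <iostream>

int main()
{
  double buf[4] = {1, 2, 3, 4};
  Eigen::Map<Eigen::Vector4d> v(buf);
  std::cout << *v.rawData() << "\n"; // 1: the injected member is now available
}
```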
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isnan_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isinf_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isfinite_impl(const T&) { return true; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isfinite_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isinf_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isnan_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
+{
+  return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF;
+}
+
+//MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x); }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x); }
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_msvc_helper(x); }
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5,0)
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+  // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issues (duplicate symbol),
+  // while the latter prevents too aggressive optimizations in fast-math mode:
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
+#endif
+
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overloads are defined at the end of this file
+template<typename T> bool isfinite_impl(const std::complex<T>& x);
+template<typename T> bool isnan_impl(const std::complex<T>& x);
+template<typename T> bool isinf_impl(const std::complex<T>& x);
+
 }
// end namespace internal /**************************************************************************** @@ -810,59 +927,9 @@ inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y) return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y); } -template -EIGEN_DEVICE_FUNC -bool (isfinite)(const T& x) -{ - #if EIGEN_HAS_CXX11_MATH - using std::isfinite; - return isfinite EIGEN_NOT_A_MACRO (x); - #else - return x::highest() && x>NumTraits::lowest(); - #endif -} - -template -EIGEN_DEVICE_FUNC -bool (isnan)(const T& x) -{ - #if EIGEN_HAS_CXX11_MATH - using std::isnan; - return isnan EIGEN_NOT_A_MACRO (x); - #else - return x != x; - #endif -} - -template -EIGEN_DEVICE_FUNC -bool (isinf)(const T& x) -{ - #if EIGEN_HAS_CXX11_MATH - using std::isinf; - return isinf EIGEN_NOT_A_MACRO (x); - #else - return x>NumTraits::highest() || x::lowest(); - #endif -} - -template -bool (isfinite)(const std::complex& x) -{ - return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x)); -} - -template -bool (isnan)(const std::complex& x) -{ - return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x)); -} - -template -bool (isinf)(const std::complex& x) -{ - return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x)); -} +template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } +template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } +template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } template EIGEN_DEVICE_FUNC @@ -906,6 +973,24 @@ inline int log2(int x) namespace internal { +template +bool isfinite_impl(const std::complex& x) +{ + return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x)); +} + +template +bool isnan_impl(const std::complex& x) +{ + return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x)); +} + +template +bool isinf_impl(const std::complex& x) +{ + return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x)); +} + /**************************************************************************** * Implementation of fuzzy comparisons * ****************************************************************************/ @@ -928,9 +1013,8 @@ struct scalar_fuzzy_default_impl EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(min); EIGEN_USING_STD_MATH(abs); - return abs(x - y) <= (min)(abs(x), abs(y)) * prec; + return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec; } EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec) @@ -971,8 +1055,7 @@ struct scalar_fuzzy_default_impl } static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) { - EIGEN_USING_STD_MATH(min); - return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec; + return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec; } }; diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 7c66572d1..9d612c852 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -328,23 +328,26 @@ template class MatrixBase /////////// LU module /////////// - EIGEN_DEVICE_FUNC const FullPivLU fullPivLu() const; - EIGEN_DEVICE_FUNC const PartialPivLU partialPivLu() const; - - const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC + 
inline const FullPivLU fullPivLu() const; + EIGEN_DEVICE_FUNC + inline const PartialPivLU partialPivLu() const; EIGEN_DEVICE_FUNC - const Inverse inverse() const; + inline const PartialPivLU lu() const; + + EIGEN_DEVICE_FUNC + inline const Inverse inverse() const; template - void computeInverseAndDetWithCheck( + inline void computeInverseAndDetWithCheck( ResultType& inverse, typename ResultType::Scalar& determinant, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; template - void computeInverseWithCheck( + inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() @@ -353,22 +356,24 @@ template class MatrixBase /////////// Cholesky module /////////// - const LLT llt() const; - const LDLT ldlt() const; + inline const LLT llt() const; + inline const LDLT ldlt() const; /////////// QR module /////////// - const HouseholderQR householderQr() const; - const ColPivHouseholderQR colPivHouseholderQr() const; - const FullPivHouseholderQR fullPivHouseholderQr() const; + inline const HouseholderQR householderQr() const; + inline const ColPivHouseholderQR colPivHouseholderQr() const; + inline const FullPivHouseholderQR fullPivHouseholderQr() const; - EigenvaluesReturnType eigenvalues() const; - RealScalar operatorNorm() const; +/////////// Eigenvalues module /////////// + + inline EigenvaluesReturnType eigenvalues() const; + inline RealScalar operatorNorm() const; /////////// SVD module /////////// - JacobiSVD jacobiSvd(unsigned int computationOptions = 0) const; - BDCSVD bdcSvd(unsigned int computationOptions = 0) const; + inline JacobiSVD jacobiSvd(unsigned int computationOptions = 0) const; + inline BDCSVD bdcSvd(unsigned int computationOptions = 0) const; /////////// Geometry module /////////// @@ -381,24 +386,24 @@ template class MatrixBase #endif // EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC - typename cross_product_return_type::type + inline typename cross_product_return_type::type cross(const MatrixBase& other) const; template EIGEN_DEVICE_FUNC - PlainObject cross3(const MatrixBase& other) const; + inline PlainObject cross3(const MatrixBase& other) const; EIGEN_DEVICE_FUNC - PlainObject unitOrthogonal(void) const; + inline PlainObject unitOrthogonal(void) const; - Matrix eulerAngles(Index a0, Index a1, Index a2) const; + inline Matrix eulerAngles(Index a0, Index a1, Index a2) const; - ScalarMultipleReturnType operator*(const UniformScaling& s) const; + inline ScalarMultipleReturnType operator*(const UniformScaling& s) const; // put this as separate enum value to work around possible GCC 4.3 bug (?) enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical) : ColsAtCompileTime==1 ? Vertical : Horizontal }; typedef Homogeneous HomogeneousReturnType; - HomogeneousReturnType homogeneous() const; + inline HomogeneousReturnType homogeneous() const; enum { SizeMinusOne = SizeAtCompileTime==Dynamic ? 
Dynamic : SizeAtCompileTime-1
@@ -409,7 +414,7 @@ template class MatrixBase
     typedef CwiseUnaryOp::Scalar>, const ConstStartMinusOne > HNormalizedReturnType;
-    const HNormalizedReturnType hnormalized() const;
+    inline const HNormalizedReturnType hnormalized() const;
     ////////// Householder module ///////////
@@ -433,6 +438,15 @@ template class MatrixBase
     template
     void applyOnTheRight(Index p, Index q, const JacobiRotation& j);
+///////// SparseCore module /////////
+
+    template
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase::template CwiseProductDenseReturnType::Type
+    cwiseProduct(const SparseMatrixBase &other) const
+    {
+      return other.cwiseProduct(derived());
+    }
+
     ///////// MatrixFunctions module /////////
     typedef typename internal::stem_function::type StemFunction;
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index 61ec2f533..1d85dec72 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -157,9 +157,9 @@ struct NumTraits >
     IsInteger = NumTraits::IsInteger,
     IsSigned = NumTraits::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits::ReadCost,
-    AddCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits::AddCost,
-    MulCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost,
+    AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost,
+    MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost
   };
   static inline RealScalar epsilon() { return NumTraits::epsilon(); }
diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h
index bfe6f899a..90e1df233 100644
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob
-// Copyright (C) 2009-2011 Gael Guennebaud
+// Copyright (C) 2009-2015 Gael Guennebaud
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,9 +13,6 @@ namespace Eigen {
-// TODO: this does not seems to be needed at all:
-// template class PermutedImpl;
-
 /** \class PermutationBase
   * \ingroup Core_Module
   *
@@ -67,8 +64,10 @@ class PermutationBase : public EigenBase
     DenseMatrixType;
     typedef PermutationMatrix PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
     using Base::derived;
-    typedef Transpose TransposeReturnType;
+    typedef Inverse InverseReturnType;
+    typedef void Scalar;
 #endif
     /** Copies the other permutation into *this */
@@ -195,14 +194,14 @@ class PermutationBase : public EigenBase
     *
     * \note \note_try_to_help_rvo */
-    inline TransposeReturnType inverse() const
-    { return TransposeReturnType(derived()); }
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
     /** \returns the transpose permutation matrix.
* * \note \note_try_to_help_rvo */ - inline TransposeReturnType transpose() const - { return TransposeReturnType(derived()); } + inline InverseReturnType transpose() const + { return InverseReturnType(derived()); } /**** multiplication helpers to hopefully get RVO ****/ @@ -237,7 +236,7 @@ class PermutationBase : public EigenBase * \note \note_try_to_help_rvo */ template - inline PlainPermutationType operator*(const Transpose >& other) const + inline PlainPermutationType operator*(const InverseImpl& other) const { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); } /** \returns the product of an inverse permutation with another permutation. @@ -245,7 +244,7 @@ class PermutationBase : public EigenBase * \note \note_try_to_help_rvo */ template friend - inline PlainPermutationType operator*(const Transpose >& other, const PermutationBase& perm) + inline PlainPermutationType operator*(const InverseImpl& other, const PermutationBase& perm) { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); } /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation. @@ -303,6 +302,7 @@ struct traits IndicesType; typedef _StorageIndex StorageIndex; + typedef void Scalar; }; } @@ -396,13 +396,13 @@ class PermutationMatrix : public PermutationBase - PermutationMatrix(const Transpose >& other) - : m_indices(other.nestedExpression().size()) + PermutationMatrix(const InverseImpl& other) + : m_indices(other.derived().nestedExpression().size()) { eigen_internal_assert(m_indices.size() <= NumTraits::highest()); StorageIndex end = StorageIndex(m_indices.size()); for (StorageIndex i=0; i PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs) @@ -426,6 +426,7 @@ struct traits, _PacketAccess> IndicesType; typedef _StorageIndex StorageIndex; + typedef void Scalar; }; } @@ -499,7 +500,7 @@ template struct traits > { typedef PermutationStorage StorageKind; - typedef typename _IndicesType::Scalar Scalar; + typedef void Scalar; typedef typename _IndicesType::Scalar StorageIndex; typedef _IndicesType IndicesType; enum { @@ -561,84 +562,61 @@ operator*(const PermutationBase &permutation, (permutation.derived(), matrix.derived()); } -namespace internal { -/* Template partial specialization for transposed/inverse permutations */ - -template -struct traits > > - : traits -{}; - -} // end namespace internal - -// TODO: the specificties should be handled by the evaluator, -// at the very least we should only specialize TransposeImpl -template -class Transpose > - : public EigenBase > > +template +class InverseImpl + : public EigenBase > { - typedef Derived PermutationType; - typedef typename PermutationType::IndicesType IndicesType; typedef typename PermutationType::PlainPermutationType PlainPermutationType; + typedef internal::traits PermTraits; + protected: + InverseImpl() {} public: + typedef Inverse InverseType; + using EigenBase >::derived; #ifndef EIGEN_PARSED_BY_DOXYGEN - typedef internal::traits Traits; - typedef typename Derived::DenseMatrixType DenseMatrixType; + typedef typename PermutationType::DenseMatrixType DenseMatrixType; enum { - Flags = Traits::Flags, - RowsAtCompileTime = Traits::RowsAtCompileTime, - ColsAtCompileTime = Traits::ColsAtCompileTime, - MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime, - MaxColsAtCompileTime = Traits::MaxColsAtCompileTime + RowsAtCompileTime = PermTraits::RowsAtCompileTime, + ColsAtCompileTime = PermTraits::ColsAtCompileTime, + 
MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::StorageIndex StorageIndex;
 #endif
-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline Index rows() const { return m_permutation.rows(); }
-    inline Index cols() const { return m_permutation.cols(); }
-
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template
     void evalTo(MatrixBase& other) const
     {
       other.setZero();
-      for (Index i=0; i
 friend
-    const Product
-    operator*(const MatrixBase& matrix, const Transpose& trPerm)
+    const Product
+    operator*(const MatrixBase& matrix, const InverseType& trPerm)
     {
-      return Product(matrix.derived(), trPerm.derived());
+      return Product(matrix.derived(), trPerm.derived());
     }
     /** \returns the matrix with the inverse permutation applied to the rows. */
     template
-    const Product
+    const Product
     operator*(const MatrixBase& matrix) const
     {
-      return Product(*this, matrix.derived());
+      return Product(derived(), matrix.derived());
     }
-
-    const PermutationType& nestedExpression() const { return m_permutation; }
-
-  protected:
-    const PermutationType& m_permutation;
 };
 template
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 48e29ebdc..1225e85b4 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -263,7 +263,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type
       m_storage.resize(size, rows, cols);
       if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
 #else
-      internal::check_rows_cols_for_overflow::run(rows, cols);
       m_storage.resize(rows*cols, rows, cols);
 #endif
     }
@@ -450,6 +449,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type
       return Base::operator=(func);
     }
+    // Prevent users from trying to instantiate PlainObjectBase objects
+    // by making all its constructors protected. See bug 1074.
+  protected:
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
     {
@@ -496,17 +499,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type
       // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
-    /** \copydoc MatrixBase::operator=(const EigenBase&)
-      */
-    template
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other)
-    {
-      _resize_to_match(other);
-      Base::operator=(other.derived());
-      return this->derived();
-    }
-
     /** \sa PlainObjectBase::operator=(const EigenBase&) */
     template
     EIGEN_DEVICE_FUNC
@@ -520,7 +512,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type
     /** \sa PlainObjectBase::operator=(const EigenBase&) */
     template
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase &other)
       : m_storage()
     {
@@ -539,6 +531,19 @@ class PlainObjectBase : public internal::dense_xpr_base::type
       other.evalTo(this->derived());
     }
+  public:
+
+    /** \copydoc MatrixBase::operator=(const EigenBase&)
+      */
+    template
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other)
+    {
+      _resize_to_match(other);
+      Base::operator=(other.derived());
+      return this->derived();
+    }
+
     /** \name Map
       * These are convenience functions returning Map objects.
The Map() static functions return unaligned Map objects, * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index b79236f15..fdd2fed3f 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -217,29 +217,6 @@ class ProductImpl }; -/*************************************************************************** -* Implementation of matrix base methods -***************************************************************************/ - - -/** \internal used to test the evaluator only - */ -template -const Product -prod(const Lhs& lhs, const Rhs& rhs) -{ - return Product(lhs,rhs); -} - -/** \internal used to test the evaluator only - */ -template -const Product -lazyprod(const Lhs& lhs, const Rhs& rhs) -{ - return Product(lhs,rhs); -} - } // end namespace Eigen #endif // EIGEN_PRODUCT_H diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 04e5e5e37..794038a2a 100755 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -38,6 +38,12 @@ struct evaluator > // Catch scalar * ( A * B ) and transform it to (A*scalar) * B // TODO we should apply that rule only if that's really helpful template +struct evaluator_traits, const Product > > + : evaluator_traits_base, const Product > > +{ + enum { AssumeAliasing = 1 }; +}; +template struct evaluator, const Product > > : public evaluator,const Lhs>, Rhs, DefaultProduct> > { @@ -91,8 +97,7 @@ struct evaluator_traits > // This is the default evaluator implementation for products: // It creates a temporary and call generic_product_impl template -struct product_evaluator, ProductTag, LhsShape, RhsShape, typename traits::Scalar, typename traits::Scalar, - EnableIf<(Options==DefaultProduct || Options==AliasFreeProduct)> > +struct product_evaluator, ProductTag, LhsShape, RhsShape> : public evaluator::PlainObject> { typedef Product XprType; @@ -177,11 +182,41 @@ struct Assignment > SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { - // TODO use operator* instead of prod() once we have made enough progress - call_assignment(dst.noalias(), prod(src.functor().m_other * src.nestedExpression().lhs(), src.nestedExpression().rhs()), func); + call_assignment_no_alias(dst, (src.functor().m_other * src.nestedExpression().lhs())*src.nestedExpression().rhs(), func); } }; +//---------------------------------------- +// Catch "Dense ?= xpr + Product<>" expression to save one temporary +// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct + +template +struct assignment_from_xpr_plus_product +{ + typedef CwiseBinaryOp, const OtherXpr, const ProductType> SrcXprType; + static void run(DstXprType &dst, const SrcXprType &src, const Func1& func) + { + call_assignment_no_alias(dst, src.lhs(), func); + call_assignment_no_alias(dst, src.rhs(), Func2()); + } +}; + +template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar> +struct Assignment, const OtherXpr, + const Product >, internal::assign_op, Dense2Dense> + : assignment_from_xpr_plus_product, Scalar, internal::assign_op, internal::add_assign_op > +{}; +template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar> +struct Assignment, const OtherXpr, + const Product >, internal::add_assign_op, Dense2Dense> + : assignment_from_xpr_plus_product, Scalar, 
internal::add_assign_op, internal::add_assign_op > +{}; +template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar> +struct Assignment, const OtherXpr, + const Product >, internal::sub_assign_op, Dense2Dense> + : assignment_from_xpr_plus_product, Scalar, internal::sub_assign_op, internal::sub_assign_op > +{}; +//---------------------------------------- template struct generic_product_impl @@ -213,12 +248,12 @@ template EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - // FIXME make sure lhs is sequentially stored + typename nested_eval::type actual_lhs(lhs); + // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - // FIXME make sure rhs is sequentially stored + typename nested_eval::type actual_rhs(rhs); + // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i @@ -314,7 +349,7 @@ struct generic_product_impl template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { - internal::gemv_dense_sense_selector::HasUsableDirectAccess) >::run(lhs, rhs, dst, alpha); @@ -329,28 +364,28 @@ struct generic_product_impl template static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - // TODO: use the following instead of calling call_assignment, same for the other methods - // dst = lazyprod(lhs,rhs); - call_assignment(dst, lazyprod(lhs,rhs), internal::assign_op()); + // Same as: dst.noalias() = lhs.lazyProduct(rhs); + // but easier on the compiler side + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } template static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - // dst += lazyprod(lhs,rhs); - call_assignment(dst, lazyprod(lhs,rhs), internal::add_assign_op()); + // dst.noalias() += lhs.lazyProduct(rhs); + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } template static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - // dst -= lazyprod(lhs,rhs); - call_assignment(dst, lazyprod(lhs,rhs), internal::sub_assign_op()); + // dst.noalias() -= lhs.lazyProduct(rhs); + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } // template // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) -// { dst += alpha * lazyprod(lhs,rhs); } +// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } }; // This specialization enforces the use of a coefficient-based evaluation strategy @@ -371,7 +406,7 @@ template -struct product_evaluator, ProductTag, DenseShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar > +struct product_evaluator, ProductTag, DenseShape, DenseShape> : evaluator_base > { typedef Product XprType; @@ -387,7 +422,11 @@ struct product_evaluator, ProductTag, DenseShape, m_rhsImpl(m_rhs), // Moreover, they are only useful for the packet 
path, so we could completely disable them when not needed, // or perhaps declare them on the fly on the packet method... We have experiment to check what's best. m_innerDim(xpr.lhs().cols()) - { } + { + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::AddCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } // Everything below here is taken from CoeffBasedProduct.h @@ -408,15 +447,15 @@ struct product_evaluator, ProductTag, DenseShape, MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime, PacketSize = packet_traits::size, - + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits::ReadCost - : (InnerSize == Dynamic || LhsCoeffReadCost==Dynamic || RhsCoeffReadCost==Dynamic || NumTraits::AddCost==Dynamic || NumTraits::MulCost==Dynamic) ? Dynamic + : InnerSize == Dynamic ? HugeCost : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + (InnerSize - 1) * NumTraits::AddCost, - Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, @@ -424,19 +463,16 @@ struct product_evaluator, ProductTag, DenseShape, LhsAlignment = LhsEtorType::Alignment, RhsAlignment = RhsEtorType::Alignment, - LhsIsAligned = int(LhsAlignment) >= int(unpacket_traits::alignment), - RhsIsAligned = int(RhsAlignment) >= int(unpacket_traits::alignment), - LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, SameType = is_same::value, CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic || ( (ColsAtCompileTime % PacketSize) == 0 && RhsIsAligned ) ), + && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ), CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic || ( (RowsAtCompileTime % PacketSize) == 0 && LhsIsAligned ) ), + && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ), EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 @@ -445,12 +481,16 @@ struct product_evaluator, ProductTag, DenseShape, Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) | (EvalToRowMajor ? RowMajorBit : 0) // TODO enable vectorization for mixed types - | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0), + | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) + | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0), - Alignment = CanVectorizeLhs ? LhsAlignment - : CanVectorizeRhs ? RhsAlignment + LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), + RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), + + Alignment = CanVectorizeLhs ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment) + : CanVectorizeRhs ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment) : 0, - + /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside * of Product. 
If the Product itself is not a packet-access expression, there is still a chance that the inner * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect @@ -460,13 +500,11 @@ struct product_evaluator, ProductTag, DenseShape, && LhsRowMajor && (!RhsRowMajor) && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (LhsIsAligned && RhsIsAligned) && (InnerSize % packet_traits::size == 0) }; - EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const { - // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); } @@ -478,7 +516,6 @@ struct product_evaluator, ProductTag, DenseShape, { const Index row = RowsAtCompileTime == 1 ? 0 : index; const Index col = RowsAtCompileTime == 1 ? index : 0; - // TODO check performance regression wrt to Eigen 3.2 which has special handling of this function return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); } @@ -486,14 +523,21 @@ struct product_evaluator, ProductTag, DenseShape, const PacketType packet(Index row, Index col) const { PacketType res; - typedef etor_product_packet_impl PacketImpl; - PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res); return res; } + template + const PacketType packet(Index index) const + { + const Index row = RowsAtCompileTime == 1 ? 0 : index; + const Index col = RowsAtCompileTime == 1 ? index : 0; + return packet(row,col); + } + protected: const LhsNested m_lhs; const RhsNested m_rhs; @@ -506,12 +550,12 @@ protected: }; template -struct product_evaluator, LazyCoeffBasedProductMode, DenseShape, DenseShape, typename traits::Scalar, typename traits::Scalar > - : product_evaluator, CoeffBasedProductMode, DenseShape, DenseShape, typename traits::Scalar, typename traits::Scalar > +struct product_evaluator, LazyCoeffBasedProductMode, DenseShape, DenseShape> + : product_evaluator, CoeffBasedProductMode, DenseShape, DenseShape> { typedef Product XprType; typedef Product BaseProduct; - typedef product_evaluator Base; + typedef product_evaluator Base; enum { Flags = Base::Flags | EvalBeforeNestingBit }; @@ -703,6 +747,8 @@ public: diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) : m_diagImpl(diag), m_matImpl(mat) { + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const @@ -735,7 +781,7 @@ protected: // diagonal * dense template -struct product_evaluator, ProductTag, DiagonalShape, DenseShape, typename Lhs::Scalar, typename Rhs::Scalar> +struct product_evaluator, ProductTag, DiagonalShape, DenseShape> : diagonal_product_evaluator_base, OnTheLeft> { typedef diagonal_product_evaluator_base, OnTheLeft> Base; @@ -781,7 +827,7 @@ struct product_evaluator, ProductTag, DiagonalSha // dense * diagonal template -struct product_evaluator, ProductTag, DenseShape, DiagonalShape, typename Lhs::Scalar, typename Rhs::Scalar> +struct product_evaluator, ProductTag, DenseShape, DiagonalShape> : diagonal_product_evaluator_base, OnTheRight> { typedef diagonal_product_evaluator_base, OnTheRight> Base; @@ -911,20 +957,20 @@ struct generic_product_impl }; template -struct generic_product_impl, Rhs, PermutationShape, MatrixShape, ProductTag> +struct generic_product_impl, Rhs, PermutationShape, 
MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) + static void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs.nestedExpression(), rhs); } }; template -struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> +struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) + static void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) { permutation_matrix_product::run(dst, rhs.nestedExpression(), lhs); } diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index c427a4d58..d170cae29 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -50,20 +50,14 @@ public: public: enum { - Cost = ( Derived::SizeAtCompileTime == Dynamic - || Derived::CoeffReadCost == Dynamic - || (Derived::SizeAtCompileTime!=1 && functor_traits::Cost == Dynamic) - ) ? Dynamic - : Derived::SizeAtCompileTime * Derived::CoeffReadCost - + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost + : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) }; public: enum { - Unrolling = Cost != Dynamic && Cost <= UnrollingLimit - ? CompleteUnrolling - : NoUnrolling + Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling }; #ifdef EIGEN_DEBUG_ASSIGN @@ -269,8 +263,9 @@ struct redux_impl } }; -template -struct redux_impl +// NOTE: for SliceVectorizedTraversal we simply bypass unrolling +template +struct redux_impl { typedef typename Derived::Scalar Scalar; typedef typename packet_traits::type PacketType; @@ -414,17 +409,7 @@ typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - - // FIXME, eval_nest should be handled by redux_evaluator, however: - // - it is currently difficult to provide the right Flags since they are still handled by the expressions - // - handling it here might reduce the number of template instantiations -// typedef typename internal::nested_eval::type ThisNested; -// typedef typename internal::remove_all::type ThisNestedCleaned; -// typedef typename internal::redux_evaluator ThisEvaluator; -// -// ThisNested thisNested(derived()); -// ThisEvaluator thisEval(thisNested); - + typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h index 4857a7c42..ba2ee53b8 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h @@ -34,12 +34,11 @@ template struct s template struct solve_traits { - typedef typename Decomposition::MatrixType MatrixType; typedef Matrix PlainObject; }; @@ -52,7 +51,7 @@ struct traits > typedef traits BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, - CoeffReadCost = Dynamic + CoeffReadCost = HugeCost }; }; @@ -118,6 +117,8 @@ struct evaluator > typedef Solve SolveType; typedef typename SolveType::PlainObject PlainObject; typedef evaluator Base; + + enum { Flags = Base::Flags | EvalBeforeNestingBit }; EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) @@ -143,6 +144,28 @@ struct Assignment, internal::assign_op +struct Assignment,RhsType>, internal::assign_op, 
Dense2Dense, Scalar>
+{
+  typedef Solve,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &)
+  {
+    src.dec().nestedExpression().template _solve_impl_transposed(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template
+struct Assignment, const Transpose >,RhsType>, internal::assign_op, Dense2Dense, Scalar>
+{
+  typedef Solve, const Transpose >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &)
+  {
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed(src.rhs(), dst);
+  }
+};
+
 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index ded42e0e8..5a2010449 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -107,32 +107,32 @@ struct triangular_solver_selector
 * meta-unrolling implementation
 ***************************************************************************/
-template
+template
 struct triangular_solver_unroller;
-template
-struct triangular_solver_unroller {
+template
+struct triangular_solver_unroller {
   enum {
     IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0 : I+1
+    DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0 : DiagIndex+1
   };
   static void run(const Lhs& lhs, Rhs& rhs)
   {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment(S).transpose()
-                         .cwiseProduct(rhs.template segment(S)).sum();
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment(StartIndex).transpose()
+                                 .cwiseProduct(rhs.template segment(StartIndex)).sum();
     if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
-    triangular_solver_unroller::run(lhs,rhs);
+    triangular_solver_unroller::run(lhs,rhs);
   }
 };
-template
-struct triangular_solver_unroller {
+template
+struct triangular_solver_unroller {
   static void run(const Lhs&, Rhs&) {}
 };
@@ -161,13 +161,6 @@ struct triangular_solver_selector {
 * TriangularView methods
 ***************************************************************************/
-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
 template
 template
 void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const
@@ -188,27 +181,6 @@ void TriangularViewImpl::solveInPlace(const MatrixBase
 template
 const internal::triangular_solve_retval,Other>
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
new file mode 100644
index 000000000..8a4adc229
--- /dev/null
+++ b/Eigen/src/Core/SolverBase.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
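The Assignment specializations above, together with the SolverBase class introduced in the new file below, are what make expressions like dec.transpose().solve(rhs) and dec.adjoint().solve(rhs) dispatch to _solve_impl_transposed() instead of materializing a transposed decomposition. As an illustration only, a minimal usage sketch, assuming an Eigen build that contains this patch (the matrix values are arbitrary):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Matrix3d A = Eigen::Matrix3d::Random();
  Eigen::Vector3d b = Eigen::Vector3d::Random();

  Eigen::PartialPivLU<Eigen::Matrix3d> lu(A); // factor A once

  Eigen::Vector3d x  = lu.solve(b);             // solves A   * x = b
  Eigen::Vector3d xt = lu.transpose().solve(b); // solves A^T * x = b, reusing the factorization
  Eigen::Vector3d xa = lu.adjoint().solve(b);   // same as transpose() for real scalars

  std::cout << "residual: " << (A * x - b).norm() << std::endl;
}

Both the transposed and the adjoint solve reuse the existing LU factors, so no second factorization and no explicit transpose of A is ever formed.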
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+
+} // end namespace internal
+
+/** \class SolverBase
+  * \brief A base class for matrix decompositions and solvers
+  *
+  * \tparam Derived the actual type of the decomposition/solver.
+  *
+  * Any matrix decomposition inheriting this base class provides the following API:
+  *
+  * \code
+  * MatrixType A, b, x;
+  * DecompositionType dec(A);
+  * x = dec.solve(b);             // solve A   * x = b
+  * x = dec.transpose().solve(b); // solve A^T * x = b
+  * x = dec.adjoint().solve(b);   // solve A'  * x = b
+  * \endcode
+  *
+  * \warning Currently, any other usage of transpose() and adjoint() is not supported and will produce compilation errors.
+  *
+  * \sa class PartialPivLU, class FullPivLU
+  */
+template<typename Derived>
+class SolverBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef EigenBase<Derived> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Scalar CoeffReturnType;
+
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                          internal::traits<Derived>::ColsAtCompileTime>::ret),
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
+    };
+
+    /** Default constructor */
+    SolverBase()
+    {}
+
+    ~SolverBase()
+    {}
+
+    using Base::derived;
+
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+
+    /** \internal the return type of transpose() */
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    /** \returns an expression of the transpose of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code x = dec.transpose().solve(b); \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    inline ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(derived());
+    }
+
+    /** \internal the return type of adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code x = dec.adjoint().solve(b); \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+ * + * \sa transpose(), solve() + */ + inline AdjointReturnType adjoint() const + { + return AdjointReturnType(derived().transpose()); + } + + protected: +}; + +namespace internal { + +template +struct generic_xpr_base +{ + typedef SolverBase type; + +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SOLVERBASE_H diff --git a/Eigen/src/Core/SpecialFunctions.h b/Eigen/src/Core/SpecialFunctions.h new file mode 100644 index 000000000..d43cf23a1 --- /dev/null +++ b/Eigen/src/Core/SpecialFunctions.h @@ -0,0 +1,160 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +/**************************************************************************** + * Implementation of lgamma * + ****************************************************************************/ + +template +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) + { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct lgamma_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template<> +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const float& x) { return ::lgammaf(x); } +}; + +template<> +struct lgamma_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of erf * + ****************************************************************************/ + +template +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) + { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct erf_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template<> +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); } +}; + +template<> +struct erf_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc * +****************************************************************************/ + +template +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar&) + { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct erfc_retval +{ + typedef Scalar type; +}; + +#ifdef EIGEN_HAS_C99_MATH +template<> +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template<> +struct erfc_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + + +namespace numext { + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(lgamma, 
Scalar) lgamma(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +} // end namespace numext + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 2152405d5..5b66eb5e1 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -39,7 +39,7 @@ struct traits > : public traits MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, - Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit), + Flags0 = traits::Flags & ~(LvalueBit | NestByRefBit), Flags1 = Flags0 | FlagsLvalueBit, Flags = Flags1 ^ RowMajorBit, InnerStrideAtCompileTime = inner_stride_at_compile_time::ret, diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 63a1af8c1..099a02ec3 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -222,18 +222,23 @@ template class TriangularView TriangularView& operator=(const TriangularView &other) { return Base::operator=(other); } + /** \copydoc EigenBase::rows() */ EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } + /** \copydoc EigenBase::cols() */ EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); } + /** \returns a const reference to the nested expression */ EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; } + + /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC NestedExpression& nestedExpression() { return *const_cast(&m_matrix); } - /** \sa MatrixBase::conjugate() const */ typedef TriangularView ConjugateReturnType; + /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } @@ -279,19 +284,28 @@ template class TriangularView using Base::solve; #endif - EIGEN_DEVICE_FUNC - const SelfAdjointView selfadjointView() const - { - EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR); - return SelfAdjointView(m_matrix); - } + /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower. 
+  *
+  * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+  * \sa MatrixBase::selfadjointView() */
     EIGEN_DEVICE_FUNC
     SelfAdjointView selfadjointView()
     {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
       return SelfAdjointView(m_matrix);
     }
+    /** This is the const version of selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    const SelfAdjointView selfadjointView() const
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView(m_matrix);
+    }
+
+
+    /** \returns the determinant of the triangular matrix
+      * \sa MatrixBase::determinant() */
     EIGEN_DEVICE_FUNC
     Scalar determinant() const
     {
@@ -341,12 +355,16 @@ template class TriangularViewImpl<_Mat
       Flags = internal::traits::Flags
     };
+    /** \returns the outer-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::outerStride() */
     EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    /** \returns the inner-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::innerStride() */
     EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    /** \sa MatrixBase::operator+=() */
+    /** \sa MatrixBase::operator+=() */
     template
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator+=(const DenseBase& other) {
@@ -364,7 +382,7 @@ template class TriangularViewImpl<_Mat
     /** \sa MatrixBase::operator*=() */
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator*=(const typename internal::traits::Scalar& other) { return *this = derived().nestedExpression() * other; }
-    /** \sa MatrixBase::operator/=() */
+    /** \sa DenseBase::operator/=() */
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator/=(const typename internal::traits::Scalar& other) { return *this = derived().nestedExpression() / other; }
@@ -408,21 +426,26 @@ template class TriangularViewImpl<_Mat
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator=(const TriangularBase& other);
+    /** Shortcut for \code *this = other.triangularView<(*this)::Mode>() \endcode */
     template
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator=(const MatrixBase& other);
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator=(const TriangularViewImpl& other)
     { return *this = other.derived().nestedExpression(); }
+    /** \deprecated */
     template
     EIGEN_DEVICE_FUNC
     void lazyAssign(const TriangularBase& other);
+    /** \deprecated */
     template
     EIGEN_DEVICE_FUNC
-    void lazyAssign(const MatrixBase& other);
+    void lazyAssign(const MatrixBase& other);
+#endif
     /** Efficient triangular matrix times vector/matrix product */
     template
@@ -442,11 +465,39 @@ template class TriangularViewImpl<_Mat
       return Product(lhs.derived(),rhs.derived());
     }
+    /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+      *
+      * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+      * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
+      * \a Side==OnTheRight.
+      *
+      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+      * diagonal must be non-zero). It works as a backward (resp. forward) substitution if \c *this
+      * is an upper (resp. lower) triangular matrix.
+      *
+      * Example: \include Triangular_solve.cpp
+      * Output: \verbinclude Triangular_solve.out
+      *
+      * This function returns an expression of the inverse-multiply and can work in-place if it is assigned
+      * to the same matrix or vector \a other.
+      *
+      * For users coming from BLAS, this function (and more specifically solveInPlace()) offers
+      * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+      *
+      * \sa TriangularView::solveInPlace()
+      */
     template
     EIGEN_DEVICE_FUNC
     inline const internal::triangular_solve_retval solve(const MatrixBase& other) const;
+    /** "in-place" version of TriangularView::solve() where the result is written in \a other
+      *
+      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+      * This function will const_cast it, so constness isn't honored here.
+      *
+      * See TriangularView::solve() for the details.
+      */
     template
     EIGEN_DEVICE_FUNC
     void solveInPlace(const MatrixBase& other) const;
@@ -456,18 +507,26 @@ template class TriangularViewImpl<_Mat
     void solveInPlace(const MatrixBase& other) const
     { return solveInPlace(other); }
+    /** Swaps the coefficients of the common triangular parts of two matrices */
     template
     EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    void swap(TriangularBase &other)
+#else
     void swap(TriangularBase const & other)
+#endif
     {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
       call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op());
     }
-    // TODO: this overload is ambiguous and it should be deprecated (Gael)
+    /** \deprecated
+      * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
     template
     EIGEN_DEVICE_FUNC
     void swap(MatrixBase const & other)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
       call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op());
     }
@@ -503,7 +562,7 @@ template
 template
 void TriangularViewImpl::lazyAssign(const MatrixBase& other)
 {
-  internal::call_assignment(derived().noalias(), other.template triangularView());
+  internal::call_assignment_no_alias(derived(), other.template triangularView());
 }
@@ -523,7 +582,7 @@ template
 void TriangularViewImpl::lazyAssign(const TriangularBase& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
-  internal::call_assignment(derived().noalias(), other.derived());
+  internal::call_assignment_no_alias(derived(), other.derived());
 }
 /***************************************************************************
@@ -745,7 +804,7 @@ EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, co
   enum {
     unroll = DstXprType::SizeAtCompileTime != Dynamic
-          && SrcEvaluatorType::CoeffReadCost != Dynamic
+          && SrcEvaluatorType::CoeffReadCost < HugeCost
           && DstXprType::SizeAtCompileTime * SrcEvaluatorType::CoeffReadCost / 2 <= EIGEN_UNROLLING_LIMIT
   };
diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h
old mode 100644
new mode 100755
index 37171aaa0..483f71909
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@@ -41,8 +41,6 @@ struct traits >
   typedef typename traits::StorageKind StorageKind;
   typedef typename traits::XprKind XprKind;
   typedef typename MatrixType::Scalar InputScalar;
-  typedef typename ref_selector::type MatrixTypeNested;
-  typedef typename remove_all::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = Direction==Horizontal ?
1 : MatrixType::ColsAtCompileTime, @@ -62,8 +60,6 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr::type Base; EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr) - typedef typename internal::traits::MatrixTypeNested MatrixTypeNested; - typedef typename internal::traits::_MatrixTypeNested _MatrixTypeNested; EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp()) @@ -74,24 +70,14 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); +template +struct member_lpnorm { + typedef ResultType result_type; + template struct Cost + { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; + EIGEN_DEVICE_FUNC member_lpnorm() {} + template + EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const + { return mat.template lpNorm
<p>
(); } +}; template struct member_redux { @@ -290,6 +286,10 @@ template class VectorwiseOp typedef typename ReturnType::Type ProdReturnType; typedef Reverse ReverseReturnType; + template struct LpNormReturnType { + typedef PartialReduxExpr,Direction> Type; + }; + /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * @@ -340,6 +340,19 @@ template class VectorwiseOp const NormReturnType norm() const { return NormReturnType(_expression()); } + /** \returns a row (or column) vector expression of the norm + * of each column (or row) of the referenced expression. + * This is a vector with real entries, even if the original matrix has complex entries. + * + * Example: \include PartialRedux_norm.cpp + * Output: \verbinclude PartialRedux_norm.out + * + * \sa DenseBase::norm() */ + template + EIGEN_DEVICE_FUNC + const typename LpNormReturnType
<p>
::Type lpNorm() const + { return typename LpNormReturnType
<p>
::Type(_expression()); } + /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression, using diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index a4e2cebab..7aac0b6e1 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -109,14 +109,11 @@ void DenseBase::visit(Visitor& visitor) const typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - enum { unroll = SizeAtCompileTime != Dynamic - && ThisEvaluator::CoeffReadCost != Dynamic - && (SizeAtCompileTime == 1 || internal::functor_traits::Cost != Dynamic) - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost - <= EIGEN_UNROLLING_LIMIT }; - return internal::visitor_impl::run(thisEval, visitor); + enum { + unroll = SizeAtCompileTime != Dynamic + && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + }; + return internal::visitor_impl::run(thisEval, visitor); } namespace internal { diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h index 06cd56684..7baf57eca 100644 --- a/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,11 +10,6 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -// For some reason, this function didn't make it into the avxintirn.h -// used by the compiler, so we'll just wrap it. -#define _mm256_setr_m128(lo, hi) \ - _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) - /* The sin, cos, exp, and log functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -38,10 +33,10 @@ psin(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); + _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. Packet8f z = pmul(x, p8f_one_over_pi); @@ -55,15 +50,15 @@ psin(const Packet8f& _x) { // is odd. Packet8i shift_ints = _mm256_cvtps_epi32(shift); Packet8i shift_isodd = - (__m256i)_mm256_and_ps((__m256)shift_ints, (__m256)p8i_one); + _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); #ifdef EIGEN_VECTORIZE_AVX2 Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31); #else __m128i lo = - _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 0), 31); + _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31); __m128i hi = - _mm_slli_epi32(_mm256_extractf128_si256((__m256i)shift_isodd, 1), 31); - Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi); + _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31); + Packet8i sign_flip_mask = _mm256_set_m128(hi, lo); #endif // Create a mask for which interpolant to use, i.e. 
if z > 1, then the mask @@ -72,9 +67,9 @@ psin(const Packet8f& _x) { // Evaluate the polynomial for the interval [1,3] in z. _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); Packet8f z_minus_two = psub(z, p8f_two); Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); @@ -82,10 +77,10 @@ psin(const Packet8f& _x) { right = pmadd(right, z_minus_two2, p8f_coeff_right_0); // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); Packet8f z2 = pmul(z, z); Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); left = pmadd(left, z2, p8f_coeff_left_3); @@ -98,7 +93,7 @@ psin(const Packet8f& _x) { Packet8f res = _mm256_or_ps(left, right); // Flip the sign on the odd intervals and return the result. - res = _mm256_xor_ps(res, (__m256)sign_flip_mask); + res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); return res; } @@ -145,11 +140,11 @@ plog(const Packet8f& _x) { // Extract the shifted exponents (No bitwise shifting in regular AVX, so // convert to SSE and do it there). #ifdef EIGEN_VECTORIZE_AVX2 - Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32((__m256i)x, 23)); + Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(x), 23)); #else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 0), 23); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 1), 23); - Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi)); + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 0), 23); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 1), 23); + Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_set_m128(hi,lo)); #endif Packet8f e = _mm256_sub_ps(emm0, p8f_126f); @@ -264,7 +259,7 @@ pexp(const Packet8f& _x) { #else __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23); __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23); - emm0 = _mm256_setr_m128(lo, hi); + emm0 = _mm256_set_m128(hi,lo); #endif // Return 2^m * exp(r). @@ -348,7 +343,7 @@ pexp(const Packet4d& _x) { // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. - return pmax(pmul(x, Packet4d(e)), _x); + return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); } // Functions for sqrt. 
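The hunks above replace C-style bit casts between __m256 and __m256i with the dedicated cast intrinsics, and drop the local _mm256_setr_m128 macro in favor of the standard _mm256_set_m128. A minimal sketch of both idioms, assuming an AVX-enabled compiler that provides _mm256_set_m128; the helper names below are hypothetical and not part of Eigen or of this patch:

#include <immintrin.h>

// Reinterpret integer lanes as float lanes, bit for bit. The cast intrinsics
// generate no instructions, and unlike C-style casts such as (__m256)v (a GNU
// extension) they are also accepted by MSVC and ICC.
static inline __m256 int_bits_as_float(__m256i v) {
  return _mm256_castsi256_ps(v);
}

// Build a 256-bit vector from two 128-bit halves. _mm256_set_m128 takes the
// high half first, so the removed setr-style macro with arguments (lo, hi)
// becomes _mm256_set_m128(hi, lo).
static inline __m256 join_halves(__m128 lo, __m128 hi) {
  return _mm256_set_m128(hi, lo);
}
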
@@ -393,7 +388,7 @@ Packet4d psqrt(const Packet4d& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 32c121ab6..717ae67c5 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -43,7 +43,7 @@ template<> struct is_arithmetic<__m256d> { enum { value = true }; }; const Packet4d p4d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ - const Packet8f p8f_##NAME = (__m256)pset1(X) + const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ const Packet8i p8i_##NAME = pset1(X) @@ -66,7 +66,10 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 + HasBlend = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -83,7 +86,10 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 + HasBlend = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1 }; }; @@ -176,6 +182,15 @@ template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } + +template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h index 3bea88bea..ecd5c444e 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -66,6 +66,43 @@ double2 prsqrt(const double2& a) return make_double2(rsqrt(a.x), rsqrt(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma(const double2& a) +{ + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf(const float4& a) +{ + return make_float4(erf(a.x), erf(a.y), erf(a.z), erf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf(const double2& a) +{ + return 
make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc(const float4& a) +{ + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc(const double2& a) +{ + return make_double2(erfc(a.x), erfc(a.y)); +} + + #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index a2d803c06..cb1b547e0 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -39,6 +39,9 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasErf = 1, + HasErfc = 1, HasBlend = 0, }; @@ -59,6 +62,9 @@ template<> struct packet_traits : default_packet_traits HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasLGamma = 1, + HasErf = 1, + HasErfc = 1, HasBlend = 0, }; @@ -177,7 +183,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to to[1] = from.y; } -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { return __ldg((const float4*)from); diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index d2322b307..d2d467936 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -73,7 +73,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con // Get the real values of a | a1_re | a1_re | a2_re | a2_re | v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0)); - // Get the real values of a | a1_im | a1_im | a2_im | a2_im | + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1)); // Multiply the real a with b v1 = vmulq_f32(v1, b.v); @@ -325,8 +325,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); - // Get the real values of a - v2 = vdupq_lane_f64(vget_high_f64(a.v), 1); + // Get the imag values of a + v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); // Multiply the real a with b v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 2a44b6272..4f45ddfbf 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -67,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), _mm_mul_ps(_mm_movehdup_ps(a.v), @@ -310,9 +309,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 - return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v), _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)))); #else diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 7eb7278af..eb517b871 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ 
b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -110,6 +110,13 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasBlend = 1 + +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif }; }; template<> struct packet_traits : default_packet_traits @@ -127,6 +134,13 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 1, HasRsqrt = 1, HasBlend = 1 + +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif }; }; #endif @@ -135,7 +149,6 @@ template<> struct packet_traits : default_packet_traits typedef Packet4i type; typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, size=4, @@ -223,10 +236,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by SSE"); - return pset1(0); -} // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } @@ -261,6 +270,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } +#endif + template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } @@ -287,8 +307,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E #if (EIGEN_COMP_MSVC==1600) // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps // (i.e., it does not generate an unaligned load!! - // TODO On most architectures this version should also be faster than a single _mm_loadu_ps - // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so... __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); res = _mm_loadh_pi(res, (const __m64*)(from+2)); return res; @@ -299,24 +317,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { E template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else -// Fast unaligned loads. 
Note that here we cannot directly use intrinsics: this would -// require pointer casting to incompatible pointer types and leads to invalid code -// because of the strict aliasing rule. The "dummy" stuff are required to enforce -// a correct instruction dependency. -// TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8))) // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1 #elif EIGEN_COMP_CLANG // bug 201: Segfaults in __mm_loadh_pd with clang 2.8 #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #else #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0 - #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0 #endif template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) @@ -374,17 +384,9 @@ template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& f template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_UNALIGNED_STORE -#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES - _mm_storeu_pd(to, from); -#else - _mm_storel_pd((to), from); - _mm_storeh_pd((to+1), from); -#endif -} -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castps_pd(from))); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), Packet2d(_mm_castsi128_pd(from))); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -547,7 +549,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) } #ifdef EIGEN_VECTORIZE_SSE3 -// TODO implement SSE2 versions as well as integer versions template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); @@ -556,11 +557,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { return _mm_hadd_pd(vecs[0], vecs[1]); } -// SSSE3 version: -// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -// { -// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -// } template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { @@ -569,23 +565,16 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) } template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } - -// SSSE3 version: -// EIGEN_STRONG_INLINE float predux(const Packet4i& a) -// { -// Packet4i tmp0 = _mm_hadd_epi32(a,a); -// return pfirst(_mm_hadd_epi32(tmp0, tmp0)); -// } 
#else // SSE2 versions template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { - return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) @@ -608,6 +597,18 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } #endif // SSE3 + +#ifdef EIGEN_VECTORIZE_SSSE3 +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); +} +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i tmp0 = _mm_hadd_epi32(a,a); + return pfirst(_mm_hadd_epi32(tmp0,tmp0)); +} +#else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); @@ -627,7 +628,7 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); return _mm_add_epi32(tmp0, tmp2); } - +#endif // Other reduction functions: // mul diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index cc0e80a33..4962d625c 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -26,10 +26,10 @@ template struct scalar_sum_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return internal::predux(a); } }; template @@ -65,10 +65,10 @@ template struct scalar_product_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template @@ -97,7 +97,7 @@ template struct scalar_conj_product_op { { return conj_helper().pmul(a,b); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -117,10 +117,10 @@ template struct scalar_min_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const 
Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmin(a,b); } template - EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return internal::predux_min(a); } }; template @@ -140,10 +140,10 @@ template struct scalar_max_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::maxi(a, b); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmax(a,b); } template - EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return internal::predux_max(a); } }; template @@ -175,22 +175,37 @@ struct result_of(Scalar,Scalar)> { template struct scalar_cmp_op { + typedef bool result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;} }; template struct scalar_cmp_op { + typedef bool result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a struct scalar_cmp_op { + typedef bool result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;} }; +template struct scalar_cmp_op { + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>b;} +}; +template struct scalar_cmp_op { + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>=b;} +}; template struct scalar_cmp_op { + typedef bool result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);} }; template struct scalar_cmp_op { + typedef bool result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;} }; @@ -252,7 +267,7 @@ template struct scalar_difference_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::psub(a,b); } }; template @@ -277,7 +292,7 @@ template struct scalar_quotient_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pdiv(a,b); } }; template @@ -349,7 +364,7 @@ struct scalar_multiple_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) 
const { return a * m_other; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pmul(a, pset1(m_other)); } typename add_const_on_value_type::Nested>::type m_other; }; @@ -384,7 +399,7 @@ struct scalar_quotient1_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pdiv(a, pset1(m_other)); } typename add_const_on_value_type::Nested>::type m_other; }; @@ -426,7 +441,7 @@ struct scalar_add_op { EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { } EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::padd(a, pset1(m_other)); } const Scalar m_other; }; @@ -440,11 +455,11 @@ struct functor_traits > */ template struct scalar_sub_op { - inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { } - inline scalar_sub_op(const Scalar& other) : m_other(other) { } - inline Scalar operator() (const Scalar& a) const { return a - m_other; } + EIGEN_DEVICE_FUNC inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { } + EIGEN_DEVICE_FUNC inline scalar_sub_op(const Scalar& other) : m_other(other) { } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a - m_other; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::psub(a, pset1(m_other)); } const Scalar m_other; }; @@ -458,11 +473,11 @@ struct functor_traits > */ template struct scalar_rsub_op { - inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { } - inline scalar_rsub_op(const Scalar& other) : m_other(other) { } - inline Scalar operator() (const Scalar& a) const { return m_other - a; } + EIGEN_DEVICE_FUNC inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { } + EIGEN_DEVICE_FUNC inline scalar_rsub_op(const Scalar& other) : m_other(other) { } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other - a; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::psub(pset1(m_other), a); } const Scalar m_other; }; @@ -477,8 +492,8 @@ struct functor_traits > template struct scalar_pow_op { // FIXME default copy constructors seems bugged with std::complex<> - inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { } - inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {} + EIGEN_DEVICE_FUNC inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { } + EIGEN_DEVICE_FUNC inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {} EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); } const Scalar m_exponent; @@ -493,10 +508,10 @@ struct functor_traits > */ template struct scalar_inverse_mult_op { - 
scalar_inverse_mult_op(const Scalar& other) : m_other(other) {} + EIGEN_DEVICE_FUNC scalar_inverse_mult_op(const Scalar& other) : m_other(other) {} EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(m_other),a); } Scalar m_other; }; diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 130f20868..cd9fbf267 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -21,12 +21,11 @@ struct scalar_constant_op { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; } template - EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const { return internal::pset1(m_other); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const { return internal::pset1(m_other); } const Scalar m_other; }; template struct functor_traits > -// FIXME replace this packet test by a safe one { enum { Cost = 1, PacketAccess = packet_traits::Vectorizable, IsRepeatable = true }; }; template struct scalar_identity_op { @@ -64,7 +63,7 @@ struct linspaced_op_impl } template - EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); } const Scalar m_low; const Scalar m_step; @@ -86,7 +85,7 @@ struct linspaced_op_impl EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template - EIGEN_STRONG_INLINE const Packet packetOp(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1(Scalar(i)),m_interPacket))); } const Scalar m_low; @@ -121,12 +120,12 @@ template struct linspa } template - EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); } // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since // there row==0 and col is used for the actual iteration. template - EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const { eigen_assert(col==0 || row==0); return impl.packetOp(col + row); @@ -135,14 +134,12 @@ template struct linspa // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the // correct piping to size 2/4 packet operations. - // TODO find a way to make the packet type configurable const linspaced_op_impl impl; }; // all functors allow linear access, except scalar_identity_op. So we fix here a quick meta // to indicate whether a functor allows linear access, just always answering 'yes' except for // scalar_identity_op. 
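The functor hunks above all apply one pattern: the vectorized packetOp() (and, where they were missing, the constructors and the scalar operator()) gain EIGEN_DEVICE_FUNC so a single functor definition is callable from both host code and CUDA kernels. A minimal sketch of the pattern, using a hypothetical clamp functor that is not part of Eigen and assuming the Eigen::internal context of the files above:

// Hypothetical unary functor following the same conventions: empty-struct
// constructor, device-callable scalar path, device-callable packet path.
template<typename Scalar> struct scalar_clamp01_op {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_clamp01_op)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
  { return numext::mini(numext::maxi(a, Scalar(0)), Scalar(1)); }
  template<typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
  { return internal::pmin(internal::pmax(a, pset1<Packet>(Scalar(0))), pset1<Packet>(Scalar(1))); }
};
template<typename Scalar>
struct functor_traits<scalar_clamp01_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::AddCost,
         PacketAccess = packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax }; };
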
-// FIXME move this to functor_traits adding a functor_default template struct functor_has_linear_access { enum { ret = 1 }; }; template struct functor_has_linear_access > { enum { ret = 0 }; }; diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 2aab9d1ba..6891cfdda 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -23,7 +23,7 @@ template struct scalar_opposite_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pnegate(a); } }; template @@ -43,7 +43,7 @@ template struct scalar_abs_op { typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pabs(a); } }; template @@ -94,7 +94,7 @@ template struct scalar_abs2_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pmul(a,a); } }; template @@ -111,7 +111,7 @@ template struct scalar_conjugate_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; template struct functor_traits > @@ -132,7 +132,7 @@ template struct scalar_arg_op { typedef typename NumTraits::Real result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } template - EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } }; template @@ -232,7 +232,7 @@ template struct scalar_exp_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } }; template struct functor_traits > @@ -248,7 +248,7 @@ template struct scalar_log_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); } template - inline Packet packetOp(const Packet& a) const { return internal::plog(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } }; template struct functor_traits > @@ -264,7 +264,7 @@ template struct scalar_log10_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::log10; return log10(a); } template - 
inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } }; template struct functor_traits > @@ -278,7 +278,7 @@ template struct scalar_sqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); } template - inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } }; template struct functor_traits > @@ -296,7 +296,7 @@ template struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return Scalar(1)/sqrt(a); } template - inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; template @@ -315,7 +315,7 @@ template struct scalar_cos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } }; template struct functor_traits > @@ -334,7 +334,7 @@ template struct scalar_sin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); } template - inline Packet packetOp(const Packet& a) const { return internal::psin(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); } }; template struct functor_traits > @@ -354,7 +354,7 @@ template struct scalar_tan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); } template - inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } }; template struct functor_traits > @@ -373,7 +373,7 @@ template struct scalar_acos_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } }; template struct functor_traits > @@ -392,7 +392,7 @@ template struct scalar_asin_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } }; template struct functor_traits > @@ -403,15 +403,86 @@ struct functor_traits > }; }; + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return 
lgamma(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits::type Packet; + inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasErfc + }; +}; + + /** \internal * \brief Template functor to compute the atan of a scalar * \sa class CwiseUnaryOp, ArrayBase::atan() */ template struct scalar_atan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) - inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); } template - inline Packet packetOp(const Packet& a) const { return internal::patan(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } }; template struct functor_traits > @@ -422,15 +493,16 @@ struct functor_traits > }; }; + /** \internal * \brief Template functor to compute the tanh of a scalar * \sa class CwiseUnaryOp, ArrayBase::tanh() */ template struct scalar_tanh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) - inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::tanh; return tanh(a); } template - inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptanh(a); } }; template struct functor_traits > @@ -447,9 +519,9 @@ struct functor_traits > */ template struct scalar_sinh_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) - inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::sinh; return sinh(a); } template - inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } }; template struct functor_traits > @@ -466,9 +538,9 @@ struct functor_traits > */ template struct scalar_cosh_op { 
EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) - inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { using std::cosh; return cosh(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } }; template struct functor_traits > @@ -488,7 +560,7 @@ struct scalar_inverse_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; template @@ -504,7 +576,7 @@ struct scalar_square_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pmul(a,a); } }; template @@ -520,7 +592,7 @@ struct scalar_cube_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; } template - inline const Packet packetOp(const Packet& a) const + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pmul(a,pmul(a,a)); } }; template @@ -535,7 +607,7 @@ template struct scalar_round_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pround(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); } }; template struct functor_traits > @@ -554,7 +626,7 @@ template struct scalar_floor_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); } template - inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); } }; template struct functor_traits > @@ -573,7 +645,7 @@ template struct scalar_ceil_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } typedef typename packet_traits::type Packet; - inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } }; template struct functor_traits > @@ -655,6 +727,49 @@ struct functor_traits > { }; }; +/** \internal + * \brief Template functor to compute the signum of a scalar + * \sa class CwiseUnaryOp, Cwise::sign() + */ +template::IsComplex!=0) > struct scalar_sign_op; +template +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + return Scalar( (a>Scalar(0)) - (a + //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } +}; +template +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + 
{ + using std::abs; + typedef typename NumTraits::Real real_type; + real_type aa = abs(a); + if (aa==0) + return Scalar(0); + aa = 1./aa; + return Scalar(real(a)*aa, imag(a)*aa ); + } + //TODO + //template + //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } +}; +template +struct functor_traits > +{ enum { + Cost = + NumTraits::IsComplex + ? ( 8*NumTraits::MulCost ) // roughly + : ( 3*NumTraits::AddCost), + PacketAccess = packet_traits::HasSign + }; +}; } // end namespace internal diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 79eaa7432..229e96ceb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -36,37 +36,40 @@ const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) -{ - static bool m_cache_sizes_initialized = false; - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - static std::ptrdiff_t m_l3CacheSize = 0; - - if(!m_cache_sizes_initialized) - { +struct CacheSizes { + CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { int l1CacheSize, l2CacheSize, l3CacheSize; queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); - m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); - m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); - m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); - m_cache_sizes_initialized = true; + m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); + m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); + m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); } + std::ptrdiff_t m_l1; + std::ptrdiff_t m_l2; + std::ptrdiff_t m_l3; +}; + + +/** \internal */ +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) +{ + static CacheSizes m_cacheSizes; + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); - m_l1CacheSize = *l1; - m_l2CacheSize = *l2; - m_l3CacheSize = *l3; + m_cacheSizes.m_l1 = *l1; + m_cacheSizes.m_l2 = *l2; + m_cacheSizes.m_l3 = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); - *l1 = m_l1CacheSize; - *l2 = m_l2CacheSize; - *l3 = m_l3CacheSize; + *l1 = m_cacheSizes.m_l1; + *l2 = m_cacheSizes.m_l2; + *l3 = m_cacheSizes.m_l3; } else { @@ -200,8 +203,6 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n const Index actual_l2 = 1572864; // == 1.5 MB #endif - - // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k1 only works for openmp, what if the user does not use openmp? - if((!Condition) || (omp_get_num_threads()>1)) - return func(0,rows, 0,cols); - - Index size = transpose ? rows : cols; - - // 2- compute the maximal number of threads from the size of the product: + // compute the maximal number of threads from the size of the product: // FIXME this has to be fine tuned - Index max_threads = std::max(1,size / 32); + Index size = transpose ? 
rows : cols; + Index pb_max_threads = std::max(1,size / 32); + // compute the number of threads we are going to use + Index threads = std::min(nbThreads(), pb_max_threads); - // 3 - compute the number of threads we are going to use - Index threads = std::min(nbThreads(), max_threads); - - if(threads==1) + // if multi-threading is explicitly disabled, not useful, or if we are already in a parallel session, + // then abort multi-threading + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? + if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); Eigen::initParallel();
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index f3443bd10..d8d30267e 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -30,7 +30,7 @@ struct selfadjoint_matrix_vector_product static EIGEN_DONT_INLINE void run( Index size, const Scalar* lhs, Index lhsStride, - const Scalar* _rhs, Index rhsIncr, + const Scalar* rhs, Scalar* res, Scalar alpha); };
@@ -39,11 +39,12 @@ template::run( Index size, const Scalar* lhs, Index lhsStride, - const Scalar* _rhs, Index rhsIncr, + const Scalar* rhs, Scalar* res, Scalar alpha) { typedef typename packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); enum {
@@ -54,23 +55,13 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; - conj_helper::IsComplex, ConjugateRhs> cjd; + conj_helper cjd; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha; - // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed. - // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, - // this is because we need to extract packets - ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ?
const_cast(_rhs) : 0); - if (rhsIncr!=1) - { - const Scalar* it = _rhs; - for (Index i=0; i ( lhs.rows(), // size &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - actualRhsPtr, 1, // rhs info + actualRhsPtr, // rhs info actualDestPtr, // result info actualAlpha // scale factor ); diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h old mode 100644 new mode 100755 index 86684b66d..a08f385bc --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h @@ -52,16 +52,16 @@ template { \ static void run( \ Index size, const Scalar* lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ enum {\ IsColMajor = StorageOrder==ColMajor \ }; \ if (IsColMajor == ConjugateLhs) {\ selfadjoint_matrix_vector_product::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ } else {\ selfadjoint_matrix_vector_product_symv::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ }\ } \ }; \ @@ -79,13 +79,13 @@ typedef Matrix SYMVVector;\ \ static void run( \ Index size, const EIGTYPE* lhs, Index lhsStride, \ -const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ { \ enum {\ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \ + MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \ MKLTYPE alpha_, beta_; \ const EIGTYPE *x_ptr, myone(1); \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \ @@ -93,10 +93,9 @@ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ assign_scalar_eig2mkl(beta_, myone); \ SYMVVector x_tmp; \ if (ConjugateRhs) { \ - Map > map_x(_rhs,size,1,InnerStride<>(incx)); \ + Map map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ - incx=1; \ } else x_ptr=_rhs; \ MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ }\ diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index a9a198d64..208593718 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -304,9 +304,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix struct CommaInitializer; template class ReturnByValue; template class ArrayWrapper; template class MatrixWrapper; +template class SolverBase; template class InnerIterator; namespace internal { @@ -160,8 +161,7 @@ template< typename T, typename LhsShape = typename evaluator_traits::Shape, typename RhsShape = typename evaluator_traits::Shape, typename LhsScalar = typename traits::Scalar, - typename RhsScalar = typename traits::Scalar, - typename = EnableIf // extra template parameter for SFINAE-based specialization + typename RhsScalar = typename traits::Scalar > struct product_evaluator; } @@ -209,6 +209,7 @@ template struct scalar_random_op; template struct scalar_add_op; template struct scalar_constant_op; template struct scalar_identity_op; +template struct scalar_sign_op; template struct scalar_product_op; template struct scalar_multiple2_op; @@ -266,7 +267,6 @@ template class Rotation2D; template class AngleAxis; template class Translation; template class AlignedBox; - template 
class Quaternion; template class Transform; template class ParametrizedLine; @@ -274,6 +274,9 @@ template class Hyperp template class UniformScaling; template class Homogeneous; +// Sparse module: +template class SparseMatrixBase; + // MatrixFunctions module template struct MatrixExponentialReturnValue; template class MatrixFunctionReturnValue; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index abc69f866..9b4f8faa7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -13,7 +13,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 2 -#define EIGEN_MINOR_VERSION 91 +#define EIGEN_MINOR_VERSION 92 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -341,6 +341,13 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support C99? +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ + || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define EIGEN_HAS_C99_MATH 1 +#endif + // Does the compiler support result_of? #if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)) #define EIGEN_HAS_STD_RESULT_OF 1 @@ -353,16 +360,19 @@ // Does the compiler support const expressions? #ifdef __CUDACC__ - // Const expressions are not supported regardless of what host compiler is used +// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above +#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 70500 && __cplusplus > 199711L + #define EIGEN_HAS_CONSTEXPR 1 +#endif #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ - EIGEN_GNUC_AT_LEAST(4,9) + EIGEN_GNUC_AT_LEAST(4,8) #define EIGEN_HAS_CONSTEXPR 1 #endif // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists #ifndef EIGEN_HAS_CXX11_MATH - #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + #if (__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) #define EIGEN_HAS_CXX11_MATH 1 #else @@ -372,17 +382,30 @@ // Does the compiler support proper C++11 containers? #ifndef EIGEN_HAS_CXX11_CONTAINERS - #if ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG)) || EIGEN_COMP_MSVC >= 1900 + #if (__cplusplus > 201103L) \ + || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ + || EIGEN_COMP_MSVC >= 1900 #define EIGEN_HAS_CXX11_CONTAINERS 1 #else #define EIGEN_HAS_CXX11_CONTAINERS 0 #endif #endif +// Does the compiler support C++11 noexcept? +#ifndef EIGEN_HAS_CXX11_NOEXCEPT + #if (__cplusplus > 201103L) \ + || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ + || EIGEN_COMP_MSVC >= 1900 + #define EIGEN_HAS_CXX11_NOEXCEPT 1 + #else + #define EIGEN_HAS_CXX11_NOEXCEPT 0 + #endif +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled. 
+ * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. */ #ifndef EIGEN_FAST_MATH #define EIGEN_FAST_MATH 1 @@ -609,10 +632,14 @@ namespace Eigen { // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in // certain common platform (compiler+architecture combinations) to avoid these problems. - // Only static alignment is really problematic (relies on nonstandard compiler extensions that don't - // work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even - // when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 @@ -747,8 +774,6 @@ namespace Eigen { * documentation in a single line. **/ -// TODO The EIGEN_DENSE_PUBLIC_INTERFACE should not exists anymore - #define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex. */ \ typedef typename Eigen::NumTraits::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex, T were corresponding to RealScalar. */ \ @@ -761,17 +786,17 @@ namespace Eigen { Flags = Eigen::internal::traits::Flags, \ SizeAtCompileTime = Base::SizeAtCompileTime, \ MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \ - IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; - - -#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \ - EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \ - typedef typename Base::PacketScalar PacketScalar; \ - enum { MaxRowsAtCompileTime = Eigen::internal::traits::MaxRowsAtCompileTime, \ - MaxColsAtCompileTime = Eigen::internal::traits::MaxColsAtCompileTime}; \ + IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \ using Base::derived; \ using Base::const_cast_derived; + +// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed +#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \ + EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \ + typedef typename Base::PacketScalar PacketScalar; + + #define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b) #define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? 
(int)a : (int)b) @@ -837,4 +862,12 @@ namespace Eigen { # define EIGEN_CATCH(X) else #endif +#if EIGEN_HAS_CXX11_NOEXCEPT +# define EIGEN_NO_THROW noexcept(true) +# define EIGEN_EXCEPTION_SPEC(X) noexcept(false) +#else +# define EIGEN_NO_THROW throw() +# define EIGEN_EXCEPTION_SPEC(X) throw(X) +#endif + #endif // EIGEN_MACROS_H diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 69a489d43..1fc535a3a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -732,7 +732,7 @@ template void swap(scoped_array &a,scoped_array &b) #if EIGEN_MAX_ALIGN_BYTES!=0 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ - void* operator new(size_t size, const std::nothrow_t&) throw() { \ + void* operator new(size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ EIGEN_CATCH (...) { return 0; } \ } @@ -743,20 +743,20 @@ template void swap(scoped_array &a,scoped_array &b) void *operator new[](size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ - void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free(ptr); } \ - void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free(ptr); } \ - void operator delete(void * ptr, std::size_t /* sz */) throw() { Eigen::internal::conditional_aligned_free(ptr); } \ - void operator delete[](void * ptr, std::size_t /* sz */) throw() { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ /* in-place new and delete. since (at least afaik) there is no actual */ \ /* memory allocated we can safely let the default implementation handle */ \ /* this particular case. */ \ static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \ static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \ - void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \ - void operator delete[](void * memory, void *ptr) throw() { return ::operator delete[](memory,ptr); } \ + void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ + void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ - void operator delete(void *ptr, const std::nothrow_t&) throw() { \ + void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ Eigen::internal::conditional_aligned_free(ptr); \ } \ typedef void eigen_aligned_operator_new_marker_type; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6eb409194..3dee2bd7c 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2008-2009 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // Copyright (C) 2006-2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -11,6 +11,11 @@ #ifndef EIGEN_META_H #define EIGEN_META_H +#if defined(__CUDA_ARCH__) +#include +#include +#endif + namespace Eigen { namespace internal { @@ -68,6 +73,18 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; + template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -138,16 +155,16 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static float (max)() { return CUDART_MAX_NORMAL_F; } EIGEN_DEVICE_FUNC - static float (min)() { return __FLT_EPSILON__; } + static float (min)() { return FLT_MIN; } }; template<> struct numeric_limits { EIGEN_DEVICE_FUNC static double epsilon() { return __DBL_EPSILON__; } EIGEN_DEVICE_FUNC - static double (max)() { return CUDART_INF; } + static double (max)() { return DBL_MAX; } EIGEN_DEVICE_FUNC - static double (min)() { return __DBL_EPSILON__; } + static double (min)() { return DBL_MIN; } }; template<> struct numeric_limits { @@ -158,6 +175,15 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static int (min)() { return INT_MIN; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned int (max)() { return UINT_MAX; } + EIGEN_DEVICE_FUNC + static unsigned int (min)() { return 0; } +}; template<> struct numeric_limits { EIGEN_DEVICE_FUNC @@ -167,6 +193,15 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static long (min)() { return LONG_MIN; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long (max)() { return ULONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long (min)() { return 0; } +}; template<> struct numeric_limits { EIGEN_DEVICE_FUNC @@ -176,6 +211,15 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static long long (min)() { return LLONG_MIN; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long long (max)() { return ULLONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long long (min)() { return 0; } +}; } @@ -193,7 +237,6 @@ protected: EIGEN_DEVICE_FUNC ~noncopyable() {} }; - /** \internal * Convenient struct to get the result type of a unary or binary functor. 
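The is_integral specializations and device-side numeric_limits added above replicate small pieces of the standard library for device code, where <type_traits> and <limits> may be unavailable. A minimal standalone sketch of the trait idiom (illustrative only, not Eigen's exact code):

  #include <cstdio>

  // Primary template: false for every type...
  template<typename T> struct is_integral { enum { value = false }; };
  // ...overridden by one explicit specialization per supported integer type.
  template<> struct is_integral<int>          { enum { value = true }; };
  template<> struct is_integral<unsigned int> { enum { value = true }; };
  template<> struct is_integral<long>         { enum { value = true }; };

  int main()
  {
    // The enum member is a compile-time constant, usable for template dispatch.
    std::printf("int: %d, float: %d\n",
                int(is_integral<int>::value), int(is_integral<float>::value));
    return 0;
  }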
* diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 7538a0633..1fe365aa7 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,7 +93,11 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH + STORAGE_LAYOUT_DOES_NOT_MATCH, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, + THIS_TYPE_IS_NOT_SUPPORTED }; }; @@ -200,5 +204,9 @@ >::value), \ YOU_CANNOT_MIX_ARRAYS_AND_MATRICES) +// Check that a cost value is positive, and that it stays within a reasonable range +// TODO this check could be enabled for internal debugging only +#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \ + EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE); #endif // EIGEN_STATIC_ASSERT_H diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index c31cd4801..f9e2959cc 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -233,33 +233,33 @@ template struct size_of_xpr_at_compile_time */ template::StorageKind> struct plain_matrix_type; -template struct plain_matrix_type_dense; +template struct plain_matrix_type_dense; template struct plain_matrix_type { - typedef typename plain_matrix_type_dense::XprKind>::type type; + typedef typename plain_matrix_type_dense::XprKind, traits::Flags>::type type; }; template struct plain_matrix_type { typedef typename T::PlainObject type; }; -template struct plain_matrix_type_dense +template struct plain_matrix_type_dense { typedef Matrix::Scalar, traits::RowsAtCompileTime, traits::ColsAtCompileTime, - AutoAlign | (traits::Flags&RowMajorBit ? RowMajor : ColMajor), + AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor), traits::MaxRowsAtCompileTime, traits::MaxColsAtCompileTime > type; }; -template struct plain_matrix_type_dense +template struct plain_matrix_type_dense { typedef Array::Scalar, traits::RowsAtCompileTime, traits::ColsAtCompileTime, - AutoAlign | (traits::Flags&RowMajorBit ? RowMajor : ColMajor), + AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor), traits::MaxRowsAtCompileTime, traits::MaxColsAtCompileTime > type; @@ -303,6 +303,15 @@ struct eval, Dense> }; +/* similar to plain_matrix_type, but using the evaluator's Flags */ +template::StorageKind> struct plain_object_eval; + +template +struct plain_object_eval +{ + typedef typename plain_matrix_type_dense::XprKind, evaluator::Flags>::type type; +}; + /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major */ @@ -385,29 +394,23 @@ struct transfer_constness * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression. * \param PlainObject the type of the temporary if needed. */ -template::type> struct nested_eval +template::type> struct nested_eval { enum { - // For the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values. - // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values. - // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues - // (poor choice of temporaries).
- // It's important that this value can still be squared without integer overflowing. - DynamicAsInteger = 10000, ScalarReadCost = NumTraits::Scalar>::ReadCost, - ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost), - CoeffReadCost = evaluator::CoeffReadCost, // TODO What if an evaluator evaluate itself into a tempory? - // Then CoeffReadCost will be small but we still have to evaluate if n>1... - // The solution might be to ask the evaluator if it creates a temp. Perhaps we could even ask the number of temps? - CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost), - NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n, - CostEvalAsInteger = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger, - CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluates itself into a temporary? + // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. + // This situation is already taken care of by the EvalBeforeNestingBit flag, which is turned ON + // for all evaluators creating a temporary. This flag is then propagated by the parent evaluators. + // Another solution could be to count the number of temps? + NAsInteger = n == Dynamic ? HugeCost : n, + CostEval = (NAsInteger+1) * ScalarReadCost + CoeffReadCost, + CostNoEval = NAsInteger * CoeffReadCost }; typedef typename conditional< ( (int(evaluator::Flags) & EvalBeforeNestingBit) || - (int(CostEvalAsInteger) < int(CostNoEvalAsInteger)) ), + (int(CostEval) < int(CostNoEval)) ), PlainObject, typename ref_selector::type >::type type; @@ -449,9 +452,9 @@ struct generic_xpr_base /** \internal Helper base class to add a scalar multiple operator * overloads for complex types */ -template::value > -struct special_scalar_op_base : public DenseCoeffsBase +struct special_scalar_op_base : public BaseType { // dummy operator* so that the // "using special_scalar_op_base::operator*" compiles @@ -460,8 +463,8 @@ struct special_scalar_op_base : public DenseCoeffsBase void operator/(dummy) const; }; -template -struct special_scalar_op_base : public DenseCoeffsBase +template +struct special_scalar_op_base : public BaseType { const CwiseUnaryOp, Derived> operator*(const OtherScalar& scalar) const @@ -654,6 +657,43 @@ bool is_same_dense(const T1 &, const T2 &, typename enable_if struct is_same_or_void { enum { value = is_same::value }; }; +template struct is_same_or_void { enum { value = 1 }; }; +template struct is_same_or_void { enum { value = 1 }; }; +template<> struct is_same_or_void { enum { value = 1 }; }; + +#ifdef EIGEN_DEBUG_ASSIGN +std::string demangle_traversal(int t) +{ + if(t==DefaultTraversal) return "DefaultTraversal"; + if(t==LinearTraversal) return "LinearTraversal"; + if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal"; + if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal"; + if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal"; + return "?"; +} +std::string demangle_unrolling(int t) +{ + if(t==NoUnrolling) return "NoUnrolling"; + if(t==InnerUnrolling) return "InnerUnrolling"; + if(t==CompleteUnrolling) return "CompleteUnrolling"; + return "?"; +} +std::string demangle_flags(int f) +{ + std::string res; + if(f&RowMajorBit) res += " | RowMajor"; + if(f&PacketAccessBit) res += " | Packet"; + if(f&LinearAccessBit) res += " | Linear"; + if(f&LvalueBit) res += " | Lvalue"; + if(f&DirectAccessBit) res += "
| Direct"; + if(f&NestByRefBit) res += " | NestByRef"; + if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; + + return res; +} +#endif + } // end namespace internal // we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor @@ -666,7 +706,7 @@ bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret \ ? int(internal::scalar_product_traits::Defined) \ - : int(internal::is_same::value)), \ + : int(internal::is_same_or_void::value)), \ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } // end namespace Eigen diff --git a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h old mode 100644 new mode 100755 index 27aed923c..e20c3725b --- a/Eigen/src/Eigenvalues/ComplexSchur_MKL.h +++ b/Eigen/src/Eigenvalues/ComplexSchur_MKL.h @@ -40,9 +40,9 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ ComplexSchur >& \ -ComplexSchur >::compute(const Matrix& matrix, bool computeU) \ +ComplexSchur >::compute(const EigenBase& matrix, bool computeU) \ { \ typedef Matrix MatrixType; \ typedef MatrixType::RealScalar RealScalar; \ @@ -53,7 +53,7 @@ ComplexSchur >::compute(const Matri m_matUisUptodate = false; \ if(matrix.cols() == 1) \ { \ - m_matT = matrix.cast(); \ + m_matT = matrix.derived().template cast(); \ if(computeU) m_matU = ComplexMatrixType::Identity(1,1); \ m_info = Success; \ m_isInitialized = true; \ @@ -61,7 +61,6 @@ ComplexSchur >::compute(const Matri return *this; \ } \ lapack_int n = matrix.cols(), sdim, info; \ - lapack_int lda = matrix.outerStride(); \ lapack_int matrix_order = MKLCOLROW; \ char jobvs, sort='N'; \ LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \ @@ -69,6 +68,7 @@ ComplexSchur >::compute(const Matri m_matU.resize(n, n); \ lapack_int ldvs = m_matU.outerStride(); \ m_matT = matrix; \ + lapack_int lda = m_matT.outerStride(); \ Matrix w; \ w.resize(n, 1);\ info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \ diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h old mode 100644 new mode 100755 index e2e28cd4a..a9d6790d5 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -145,7 +145,7 @@ template class GeneralizedEigenSolver * * \sa compute() */ - explicit GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true) + GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true) : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h old mode 100644 new mode 100755 index 02ebb7d17..a62071d42 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -101,7 +101,7 @@ namespace Eigen { * * This constructor calls compute() to compute the QZ decomposition. 
*/ - explicit RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) : + RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) : m_S(A.rows(),A.cols()), m_T(A.rows(),A.cols()), m_Q(A.rows(),A.cols()), diff --git a/Eigen/src/Eigenvalues/RealSchur_MKL.h b/Eigen/src/Eigenvalues/RealSchur_MKL.h old mode 100644 new mode 100755 index c3089b468..e80926400 --- a/Eigen/src/Eigenvalues/RealSchur_MKL.h +++ b/Eigen/src/Eigenvalues/RealSchur_MKL.h @@ -40,14 +40,13 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ RealSchur >& \ -RealSchur >::compute(const Matrix& matrix, bool computeU) \ +RealSchur >::compute(const EigenBase& matrix, bool computeU) \ { \ eigen_assert(matrix.cols() == matrix.rows()); \ \ lapack_int n = matrix.cols(), sdim, info; \ - lapack_int lda = matrix.outerStride(); \ lapack_int matrix_order = MKLCOLROW; \ char jobvs, sort='N'; \ LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \ @@ -55,6 +54,7 @@ RealSchur >::compute(const Matrix wr, wi; \ wr.resize(n, 1); wi.resize(n, 1); \ info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \ diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 4d62708ad..c64555096 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -411,7 +411,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(n==1) { - m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); + m_eivalues.coeffRef(0,0) = numext::real(matrix(0,0)); if(computeEigenvectors) m_eivec.setOnes(n,n); m_info = Success; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h old mode 100644 new mode 100755 index 17c0dadd2..3499dc78a --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h @@ -40,9 +40,9 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \ -template<> inline \ +template<> template inline \ SelfAdjointEigenSolver >& \ -SelfAdjointEigenSolver >::compute(const Matrix& matrix, int options) \ +SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ { \ eigen_assert(matrix.cols() == matrix.rows()); \ eigen_assert((options&~(EigVecMask|GenEigMask))==0 \ @@ -56,7 +56,7 @@ SelfAdjointEigenSolver >::compute(c \ if(n==1) \ { \ - m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \ + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); \ if(computeEigenvectors) m_eivec.setOnes(n,n); \ m_info = Success; \ m_isInitialized = true; \ @@ -64,7 +64,7 @@ SelfAdjointEigenSolver >::compute(c return *this; \ } \ \ - lda = matrix.outerStride(); \ + lda = m_eivec.outerStride(); \ matrix_order=MKLCOLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 
'V' : 'N'; \ diff --git a/Eigen/src/Geometry/AlignedBox.h b/Eigen/src/Geometry/AlignedBox.h index 186d4ecad..03f1a11f8 100644 --- a/Eigen/src/Geometry/AlignedBox.h +++ b/Eigen/src/Geometry/AlignedBox.h @@ -163,7 +163,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) * a uniform distribution */ inline VectorType sample() const { - VectorType r; + VectorType r(dim()); for(Index d=0; d inline explicit AngleAxis(const MatrixBase& m) { *this = m; } + /** \returns the value of the rotation angle in radians */ Scalar angle() const { return m_angle; } + /** \returns a read-write reference to the stored angle in radians */ Scalar& angle() { return m_angle; } + /** \returns the rotation axis */ const Vector3& axis() const { return m_axis; } + /** \returns a read-write reference to the stored rotation axis. + * + * \warning The rotation axis must remain a \b unit vector. + */ Vector3& axis() { return m_axis; } /** Concatenates two rotations */ @@ -133,7 +140,7 @@ public: m_angle = Scalar(other.angle()); } - static inline const AngleAxis Identity() { return AngleAxis(0, Vector3::UnitX()); } + static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); } /** \returns \c true if \c *this is approximately equal to \a other, within the precision * determined by \a prec. @@ -170,8 +177,8 @@ AngleAxis& AngleAxis::operator=(const QuaternionBase, Homogeneous +struct permutation_matrix_product + : public permutation_matrix_product +{}; + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index 56fa2bfbf..32e7e76fa 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -739,8 +739,9 @@ template struct quaternionbase_assign_impl { typedef typename Other::Scalar Scalar; - template static inline void run(QuaternionBase& q, const Other& mat) + template static inline void run(QuaternionBase& q, const Other& a_mat) { + const typename internal::nested_eval::type mat(a_mat); using std::sqrt; // This algorithm comes from "Quaternion Calculus and Fast Animation", // Ken Shoemake, 1987 SIGGRAPH course notes diff --git a/Eigen/src/Geometry/Rotation2D.h b/Eigen/src/Geometry/Rotation2D.h index 65aa83be5..8b0ddcfb0 100644 --- a/Eigen/src/Geometry/Rotation2D.h +++ b/Eigen/src/Geometry/Rotation2D.h @@ -64,6 +64,16 @@ public: /** Default constructor without initialization. The represented rotation is undefined. */ Rotation2D() {} + /** Construct a 2D rotation from a 2x2 rotation matrix \a mat. + * + * \sa fromRotationMatrix() + */ + template + explicit Rotation2D(const MatrixBase& m) + { + fromRotationMatrix(m.derived()); + } + /** \returns the rotation angle */ inline Scalar angle() const { return m_angle; } @@ -103,6 +113,17 @@ public: Rotation2D& fromRotationMatrix(const MatrixBase& m); Matrix2 toRotationMatrix() const; + /** Set \c *this from a 2x2 rotation matrix \a mat. + * In other words, this function extracts the rotation angle from the rotation matrix. + * + * This method is an alias for fromRotationMatrix() + * + * \sa fromRotationMatrix() + */ + template + Rotation2D& operator=(const MatrixBase& m) + { return fromRotationMatrix(m.derived()); } + /** \returns the spherical interpolation between \c *this and \a other using * parameter \a t. It is in fact equivalent to a linear interpolation.
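The new Rotation2D matrix constructor and operator= above both forward to fromRotationMatrix(), which recovers the angle of a 2x2 rotation matrix. The underlying arithmetic, sketched without Eigen types (assuming the usual [cos -sin; sin cos] layout):

  #include <cmath>
  #include <cstdio>

  // Hypothetical standalone analogue of Rotation2D::fromRotationMatrix():
  // for a 2D rotation matrix, angle = atan2(m(1,0), m(0,0)).
  double angleFromRotationMatrix(double m00, double m10)
  {
    return std::atan2(m10, m00);
  }

  int main()
  {
    const double a = 0.3;
    double recovered = angleFromRotationMatrix(std::cos(a), std::sin(a));
    std::printf("recovered angle: %f (expected 0.3)\n", recovered);
    return 0;
  }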
*/ diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 8c9d7049b..75f20bda6 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -118,15 +118,15 @@ template struct transform_make_affine; * * However, unlike a plain matrix, the Transform class provides many features * simplifying both its assembly and usage. In particular, it can be composed - * with any other transformations (Transform,Translation,RotationBase,Matrix) + * with any other transformations (Transform,Translation,RotationBase,DiagonalMatrix) * and can be directly used to transform implicit homogeneous vectors. All these * operations are handled via the operator*. For the composition of transformations, * the principle is to first convert the right/left hand sides of the product * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product. * Of course, internally, operator* tries to perform the minimal number of operations * according to the nature of each term. Likewise, when applying the transform - * to non homogeneous vectors, the latters are automatically promoted to homogeneous - * one before doing the matrix product. The convertions to homogeneous representations + * to points, the latter are automatically promoted to homogeneous vectors + * before doing the matrix product. The conversions to homogeneous representations * are performed as follows: * * \b Translation t (Dim)x(1): @@ -140,7 +140,7 @@ template struct transform_make_affine; * R & 0\\ * 0\,...\,0 & 1 * \end{array} \right) \f$ - * + * + * \b Scaling \b DiagonalMatrix S (Dim)x(Dim): + * \f$ \left( \begin{array}{cc} + * S & 0\\ + * 0\,...\,0 & 1 + * \end{array} \right) \f$ * - * \b Column \b vector v (Dim)x(1): + * \b Column \b point v (Dim)x(1): * \f$ \left( \begin{array}{c} * v\\ * 1 * \end{array} \right) \f$ * - * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n): + * \b Set \b of \b column \b points V1...Vn (Dim)x(n): * \f$ \left( \begin{array}{ccc} * v_1 & ... & v_n\\ * 1 & ... & 1 @@ -404,26 +410,39 @@ public: /** \returns a writable expression of the translation vector of the transformation */ inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); } - /** \returns an expression of the product between the transform \c *this and a matrix expression \a other + /** \returns an expression of the product between the transform \c *this and a matrix expression \a other. * - * The right hand side \a other might be either: - * \li a vector of size Dim, + * The right-hand-side \a other can be either: * \li an homogeneous vector of size Dim+1, - * \li a set of vectors of size Dim x Dynamic, - * \li a set of homogeneous vectors of size Dim+1 x Dynamic, - * \li a linear transformation matrix of size Dim x Dim, - * \li an affine transformation matrix of size Dim x Dim+1, + * \li a set of homogeneous vectors of size Dim+1 x N, * \li a transformation matrix of size Dim+1 x Dim+1. + * + * Moreover, if \c *this represents an affine transformation (i.e., Mode!=Projective), then \a other can also be: + * \li a point of size Dim (computes: \code this->linear() * other + this->translation()\endcode), + * \li a set of N points as a Dim x N matrix (computes: \code (this->linear() * other).colwise() + this->translation()\endcode), + * + * In all cases, the return type is a matrix or vector of the same size as the right-hand side \a other.
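The rewritten operator* documentation above distinguishes points, which receive the translation, from directions, which should only see the linear part. A short usage sketch of that distinction, assuming the Eigen/Geometry API as documented:

  #include <Eigen/Geometry>

  int main()
  {
    using namespace Eigen;
    Affine3f A = Translation3f(1.f, 0.f, 0.f)
               * AngleAxisf(0.5f, Vector3f::UnitZ());
    Vector3f p(1.f, 2.f, 3.f), d(0.f, 0.f, 1.f);
    Vector3f tp = A * p;           // point: linear() * p + translation()
    Vector3f td = A.linear() * d;  // direction: rotation only, no translation
    (void)tp; (void)td;
    return 0;
  }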
+ * + * If you want to interpret \a other as a linear or affine transformation, then first convert it to a Transform<> type, + * or do your own cooking. + * + * Finally, if you want to apply Affine transformations to vectors, then explicitly apply the linear part only: + * \code + * Affine3f A; + * Vector3f v1, v2; + * v2 = A.linear() * v1; + * \endcode + * */ // note: this function is defined here because some compilers cannot find the respective declaration template - EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl::ResultType + EIGEN_STRONG_INLINE const typename OtherDerived::PlainObject operator * (const EigenBase &other) const { return internal::transform_right_product_impl::run(*this,other.derived()); } /** \returns the product expression of a transformation matrix \a a times a transform \a b * - * The left hand side \a other might be either: + * The left hand side \a other can be either: * \li a linear transformation matrix of size Dim x Dim, * \li an affine transformation matrix of size Dim x Dim+1, * \li a general transformation matrix of size Dim+1 x Dim+1. diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index ff7f08c1c..358444aff 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -23,6 +23,8 @@ namespace Eigen { * * \tparam _Scalar the type of the scalar. * + * \implsparsesolverconcept + * * This preconditioner is suitable for both selfadjoint and general problems. * The diagonal entries are pre-inverted and stored into a dense vector. * @@ -37,8 +39,10 @@ class DiagonalPreconditioner typedef Matrix Vector; public: typedef typename Vector::StorageIndex StorageIndex; - // this typedef is only to export the scalar type and compile-time dimensions to solve_retval - typedef Matrix MatrixType; + enum { + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic + }; DiagonalPreconditioner() : m_isInitialized(false) {} @@ -114,6 +118,8 @@ class DiagonalPreconditioner * * \tparam _Scalar the type of the scalar. * + * \implsparsesolverconcept + * * The diagonal entries are pre-inverted and stored into a dense vector. * * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner @@ -172,6 +178,8 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> /** \ingroup IterativeLinearSolvers_Module * \brief A naive preconditioner which approximates any matrix as the identity matrix * + * \implsparsesolverconcept + * * \sa class DiagonalPreconditioner */ class IdentityPreconditioner diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index a34ee7628..454f46814 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -132,6 +132,8 @@ struct traits > * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * + * \implsparsesolverconcept + * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations * and NumTraits::epsilon() for the tolerance. @@ -148,13 +150,15 @@ struct traits > * By default the iterations start with x=0 as an initial guess of the solution. 
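The surrounding BiCGSTAB documentation describes warm-starting the iterations through solveWithGuess(). A minimal usage sketch (the matrix values are arbitrary placeholders):

  #include <Eigen/Sparse>
  #include <Eigen/IterativeLinearSolvers>

  int main()
  {
    typedef Eigen::SparseMatrix<double> SpMat;
    SpMat A(2, 2);
    A.insert(0, 0) = 4; A.insert(0, 1) = 1;
    A.insert(1, 0) = 1; A.insert(1, 1) = 3;
    A.makeCompressed();

    Eigen::VectorXd b(2); b << 1, 2;
    Eigen::BiCGSTAB<SpMat> solver(A);

    Eigen::VectorXd x0 = Eigen::VectorXd::Zero(2);  // initial guess
    Eigen::VectorXd x  = solver.solveWithGuess(b, x0);
    return solver.info() == Eigen::Success ? 0 : 1;
  }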
* One can control the start using the solveWithGuess() method. * + * BiCGSTAB can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> class BiCGSTAB : public IterativeSolverBase > { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -180,7 +184,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit BiCGSTAB(const MatrixType& A) : Base(A) {} + template + explicit BiCGSTAB(const EigenBase& A) : Base(A.derived()) {} ~BiCGSTAB() {} @@ -195,7 +200,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 8f33c446d..395daa8e4 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -118,6 +118,8 @@ struct traits > * Default is \c Lower, best performance is \c Lower|Upper. * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner * + * \implsparsesolverconcept + * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations * and NumTraits::epsilon() for the tolerance. @@ -147,13 +149,15 @@ struct traits > * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * + * ConjugateGradient can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> class ConjugateGradient : public IterativeSolverBase > { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -183,7 +187,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
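Per the ConjugateGradient documentation above (default \c Lower, best performance with \c Lower|Upper when the full matrix is stored), a usage sketch of the fast mode:

  #include <Eigen/Sparse>
  #include <Eigen/IterativeLinearSolvers>

  int main()
  {
    typedef Eigen::SparseMatrix<double> SpMat;
    SpMat A(2, 2);  // symmetric positive definite, both triangles stored
    A.insert(0, 0) = 4; A.insert(0, 1) = 1;
    A.insert(1, 0) = 1; A.insert(1, 1) = 3;
    A.makeCompressed();

    Eigen::VectorXd b(2); b << 1, 2;
    Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper> cg(A);
    Eigen::VectorXd x = cg.solve(b);
    return cg.info() == Eigen::Success ? 0 : 1;
  }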
*/ - explicit ConjugateGradient(const MatrixType& A) : Base(A) {} + template + explicit ConjugateGradient(const EigenBase& A) : Base(A.derived()) {} ~ConjugateGradient() {} @@ -191,12 +196,19 @@ public: template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { - typedef Ref MatRef; - typedef typename internal::conditional::IsComplex), - Transpose, MatRef const&>::type RowMajorWrapper; + typedef typename Base::MatrixWrapper MatrixWrapper; + typedef typename Base::ActualMatrixType ActualMatrixType; + enum { + TransposeInput = (!MatrixWrapper::MatrixFree) + && (UpLo==(Lower|Upper)) + && (!MatrixType::IsRowMajor) + && (!NumTraits::IsComplex) + }; + typedef typename internal::conditional, ActualMatrixType const&>::type RowMajorWrapper; + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY); typedef typename internal::conditional::Type + typename MatrixWrapper::template ConstSelfAdjointViewReturnType::Type >::type SelfAdjointWrapper; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -207,7 +219,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - RowMajorWrapper row_mat(mp_matrix); + RowMajorWrapper row_mat(matrix()); internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h similarity index 68% rename from unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h rename to Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 2e2d9a851..284e37f13 100644 --- a/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2012 Désiré Nuentsa-Wakam +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -9,24 +10,42 @@ #ifndef EIGEN_INCOMPLETE_CHOlESKY_H #define EIGEN_INCOMPLETE_CHOlESKY_H -#include "Eigen/src/IterativeLinearSolvers/IncompleteLUT.h" -#include + +#include #include namespace Eigen { /** - * \brief Modified Incomplete Cholesky with dual threshold - * - * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with - * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 - * - * \tparam _MatrixType The type of the sparse matrix. It should be a symmetric - * matrix. It is advised to give a row-oriented sparse matrix - * \tparam _UpLo The triangular part of the matrix to reference. - * \tparam _OrderingType - */ - -template > + * \brief Modified Incomplete Cholesky with dual threshold + * + * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with + * Limited memory, SIAM J. Sci. Comput. 21(1), pp. 24-45, 1999 + * + * \tparam _MatrixType The type of the sparse matrix. It is advised to give a row-oriented sparse matrix + * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower + * or Upper. Default is Lower. + * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering, + * unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering. 
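IncompleteCholesky is relocated from unsupported/ into IterativeLinearSolvers here. Assuming the module header exposes it after the move, it can then serve as a drop-in conjugate-gradient preconditioner:

  #include <Eigen/Sparse>
  #include <Eigen/IterativeLinearSolvers>

  int main()
  {
    typedef Eigen::SparseMatrix<double> SpMat;
    SpMat A(2, 2);
    A.insert(0, 0) = 4; A.insert(0, 1) = 1;
    A.insert(1, 0) = 1; A.insert(1, 1) = 3;
    A.makeCompressed();

    Eigen::VectorXd b(2); b << 1, 2;
    // Remaining template arguments take their defaults: UpLo = Lower,
    // AMD ordering (or Natural under EIGEN_MPL2_ONLY).
    Eigen::ConjugateGradient<SpMat, Eigen::Lower,
                             Eigen::IncompleteCholesky<double> > cg(A);
    Eigen::VectorXd x = cg.solve(b);
    return cg.info() == Eigen::Success ? 0 : 1;
  }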
+ * + * \implsparsesolverconcept + * + * It performs the following incomplete factorization: \f$ S P A P' S \approx L L' \f$ + * where L is a lower triangular factor, S is a diagonal scaling matrix, and P is a + * fill-in reducing permutation as computed by the ordering method. + * + * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$ be the scaled matrix on which the factorization is carried out, + * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$, then the factorization is directly performed + * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where + * \f$ \sigma \f$ is the initial shift value as returned and set by the setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$. + * + */ +template +#else +NaturalOrdering +#endif +> class IncompleteCholesky : public SparseSolverBase > { protected: @@ -38,45 +57,60 @@ class IncompleteCholesky : public SparseSolverBase FactorType; - typedef FactorType MatrixType; typedef Matrix VectorSx; typedef Matrix VectorRx; typedef Matrix VectorIx; typedef std::vector > VectorList; enum { UpLo = _UpLo }; + enum { + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic + }; public: + + /** Default constructor leaving the object in a partly non-initialized state. + * + * You must call compute() or the pair analyzePattern()/factorize() to make it valid. + * + * \sa IncompleteCholesky(const MatrixType&) + */ IncompleteCholesky() : m_initialShift(1e-3),m_factorizationIsOk(false) {} + /** Constructor computing the incomplete factorization for the given matrix \a matrix. + */ template IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false) { compute(matrix); } + /** \returns number of rows of the factored matrix */ Index rows() const { return m_L.rows(); } + /** \returns number of columns of the factored matrix */ Index cols() const { return m_L.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * It triggers an assertion if \c *this has not been initialized through the respective constructor, + * or a call to compute() or analyzePattern(). + * + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix appears to be negative. */ ComputationInfo info() const { - eigen_assert(m_isInitialized && "IncompleteLLT is not initialized."); + eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized."); return m_info; } - /** - * \brief Set the initial shift parameter - */ + /** \brief Set the initial shift parameter \f$ \sigma \f$. + */ void setInitialShift(RealScalar shift) { m_initialShift = shift; } - /** - * \brief Computes the fill reducing permutation vector. - */ + /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat + */ template void analyzePattern(const MatrixType& mat) { @@ -85,19 +119,36 @@ class IncompleteCholesky : public SparseSolverBase(), pinv); if(pinv.size()>0) m_perm = pinv.inverse(); else m_perm.resize(0); - m_analysisIsOk = true; + m_L.resize(mat.rows(), mat.cols()); + m_analysisIsOk = true; + m_isInitialized = true; + m_info = Success; } + /** \brief Performs the numerical factorization of the input matrix \a mat + * + * The method analyzePattern() or compute() must have been called beforehand + * with a matrix having the same pattern.
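A tiny numeric sketch of the shifting strategy documented above: when the minimum diagonal entry beta of the scaled matrix is not positive, the factorization runs on B + (sigma+|beta|) I. The values below are made up:

  #include <cmath>
  #include <cstdio>

  int main()
  {
    double sigma = 1e-3;   // default initial shift, see setInitialShift()
    double beta  = -0.25;  // hypothetical minimum diagonal of the scaled matrix
    double shift = (beta > 0.0) ? 0.0 : sigma + std::fabs(beta);
    std::printf("diagonal shift applied: %g\n", shift);  // prints 0.251
    return 0;
  }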
+ * + * \sa compute(), analyzePattern() + */ template - void factorize(const MatrixType& amat); + void factorize(const MatrixType& mat); + /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat + * + * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods. + * + * \sa analyzePattern(), factorize() + */ template - void compute(const MatrixType& matrix) + void compute(const MatrixType& mat) { - analyzePattern(matrix); - factorize(matrix); + analyzePattern(mat); + factorize(mat); } + // internal template void _solve_impl(const Rhs& b, Dest& x) const { @@ -110,9 +161,17 @@ class IncompleteCholesky : public SparseSolverBase colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); }; @@ -135,8 +194,6 @@ void IncompleteCholesky::factorize(const _MatrixType // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added - m_L.resize(mat.rows(), mat.cols()); - // Apply the fill-reducing permutation computed in analyzePattern() if (m_perm.rows() == mat.rows() ) // To detect the null permutation { @@ -176,13 +233,21 @@ void IncompleteCholesky::factorize(const _MatrixType } m_scale = m_scale.cwiseSqrt().cwiseSqrt(); + + for (Index j = 0; j < n; ++j) + if(m_scale(j)>(std::numeric_limits::min)()) + m_scale(j) = RealScalar(1)/m_scale(j); + else + m_scale(j) = 1; + + // FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster) // Scale and compute the shift for the matrix RealScalar mindiag = NumTraits::highest(); for (Index j = 0; j < n; j++) { for (Index k = colPtr[j]; k < colPtr[j+1]; k++) - vals[k] /= (m_scale(j)*m_scale(rowIdx[k])); + vals[k] *= (m_scale(j)*m_scale(rowIdx[k])); eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored"); mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag); } @@ -240,7 +305,6 @@ void IncompleteCholesky::factorize(const _MatrixType // Scale the current column if(numext::real(diag) <= 0) { - std::cerr << "\nNegative diagonal during Incomplete factorization at position " << j << " (value = " << diag << ")\n"; m_info = NumericalIssue; return; } @@ -276,8 +340,7 @@ void IncompleteCholesky::factorize(const _MatrixType updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); } m_factorizationIsOk = true; - m_isInitialized = true; - m_info = Success; + m_info = Success; } template diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index b644163f1..338e6f10a 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -67,6 +67,8 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * \class IncompleteLUT * \brief Incomplete LU factorization with dual-threshold strategy * + * \implsparsesolverconcept + * * During the numerical factorization, two dropping rules are used : * 1) any element whose magnitude is less than some tolerance is dropped. 
* This tolerance is obtained by multiplying the input tolerance @p droptol @@ -107,11 +109,13 @@ class IncompleteLUT : public SparseSolverBase VectorI; typedef SparseMatrix FactorType; + enum { + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic + }; + public: - // this typedef is only to export the scalar type and compile-time dimensions to solve_retval - typedef Matrix MatrixType; - IncompleteLUT() : m_droptol(NumTraits::dummy_precision()), m_fillfactor(10), m_analysisIsOk(false), m_factorizationIsOk(false) @@ -166,7 +170,7 @@ class IncompleteLUT : public SparseSolverBase void _solve_impl(const Rhs& b, Dest& x) const { - x = m_Pinv * b; + x = m_Pinv * b; x = m_lu.template triangularView().solve(x); x = m_lu.template triangularView().solve(x); x = m_P * x; @@ -219,16 +223,25 @@ template void IncompleteLUT::analyzePattern(const _MatrixType& amat) { // Compute the Fill-reducing permutation + // Since ILUT does not perform any numerical pivoting, + // it is highly preferable to keep the diagonal through symmetric permutations. +#ifndef EIGEN_MPL2_ONLY + // To this end, let's symmetrize the pattern and perform AMD on it. SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); - // Symmetrize the pattern // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... SparseMatrix AtA = mat2 + mat1; - AtA.prune(keep_diag()); - internal::minimum_degree_ordering(AtA, m_P); // Then compute the AMD ordering... - - m_Pinv = m_P.inverse(); // ... and the inverse permutation + AMDOrdering ordering; + ordering(AtA,m_P); + m_Pinv = m_P.inverse(); // cache the inverse permutation +#else + // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine. 
+ SparseMatrix mat1 = amat; + COLAMDOrdering ordering; + ordering(mat1,m_Pinv); + m_P = m_Pinv.inverse(); +#endif m_analysisIsOk = true; m_factorizationIsOk = false; diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 5f4bcea11..3d62fef6e 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -12,6 +12,128 @@ namespace Eigen { +namespace internal { + +template +struct is_ref_compatible_impl +{ +private: + template + struct any_conversion + { + template any_conversion(const volatile T&); + template any_conversion(T&); + }; + struct yes {int a[1];}; + struct no {int a[2];}; + + template + static yes test(const Ref&, int); + template + static no test(any_conversion, ...); + +public: + static MatrixType ms_from; + enum { value = sizeof(test(ms_from, 0))==sizeof(yes) }; +}; + +template +struct is_ref_compatible +{ + enum { value = is_ref_compatible_impl::type>::value }; +}; + +template::value> +class generic_matrix_wrapper; + +// We have an explicit matrix at hand, compatible with Ref<> +template +class generic_matrix_wrapper +{ +public: + typedef Ref ActualMatrixType; + template struct ConstSelfAdjointViewReturnType { + typedef typename ActualMatrixType::template ConstSelfAdjointViewReturnType::Type Type; + }; + + enum { + MatrixFree = false + }; + + generic_matrix_wrapper() + : m_dummy(0,0), m_matrix(m_dummy) + {} + + template + generic_matrix_wrapper(const InputType &mat) + : m_matrix(mat) + {} + + const ActualMatrixType& matrix() const + { + return m_matrix; + } + + template + void grab(const EigenBase &mat) + { + m_matrix.~Ref(); + ::new (&m_matrix) Ref(mat.derived()); + } + + void grab(const Ref &mat) + { + if(&(mat.derived()) != &m_matrix) + { + m_matrix.~Ref(); + ::new (&m_matrix) Ref(mat); + } + } + +protected: + MatrixType m_dummy; // used to default initialize the Ref<> object + ActualMatrixType m_matrix; +}; + +// MatrixType is not compatible with Ref<> -> matrix-free wrapper +template +class generic_matrix_wrapper +{ +public: + typedef MatrixType ActualMatrixType; + template struct ConstSelfAdjointViewReturnType + { + typedef ActualMatrixType Type; + }; + + enum { + MatrixFree = true + }; + + generic_matrix_wrapper() + : mp_matrix(0) + {} + + generic_matrix_wrapper(const MatrixType &mat) + : mp_matrix(&mat) + {} + + const ActualMatrixType& matrix() const + { + return *mp_matrix; + } + + void grab(const MatrixType &mat) + { + mp_matrix = &mat; + } + +protected: + const ActualMatrixType *mp_matrix; +}; + +} + /** \ingroup IterativeLinearSolvers_Module * \brief Base class for linear iterative solvers * @@ -31,13 +153,17 @@ public: typedef typename MatrixType::StorageIndex StorageIndex; typedef typename MatrixType::RealScalar RealScalar; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + public: using Base::derived; /** Default constructor. 
*/ IterativeSolverBase() - : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -54,10 +180,10 @@ public: */ template explicit IterativeSolverBase(const EigenBase& A) - : mp_matrix(A.derived()) + : m_matrixWrapper(A.derived()) { init(); - compute(mp_matrix); + compute(matrix()); } ~IterativeSolverBase() {} @@ -71,7 +197,7 @@ public: Derived& analyzePattern(const EigenBase& A) { grab(A.derived()); - m_preconditioner.analyzePattern(mp_matrix); + m_preconditioner.analyzePattern(matrix()); m_isInitialized = true; m_analysisIsOk = true; m_info = m_preconditioner.info(); @@ -92,7 +218,7 @@ public: { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); grab(A.derived()); - m_preconditioner.factorize(mp_matrix); + m_preconditioner.factorize(matrix()); m_factorizationIsOk = true; m_info = m_preconditioner.info(); return derived(); @@ -112,7 +238,7 @@ public: Derived& compute(const EigenBase& A) { grab(A.derived()); - m_preconditioner.compute(mp_matrix); + m_preconditioner.compute(matrix()); m_isInitialized = true; m_analysisIsOk = true; m_factorizationIsOk = true; @@ -121,10 +247,10 @@ public: } /** \internal */ - Index rows() const { return mp_matrix.rows(); } + Index rows() const { return matrix().rows(); } /** \internal */ - Index cols() const { return mp_matrix.cols(); } + Index cols() const { return matrix().cols(); } /** \returns the tolerance threshold used by the stopping criteria. * \sa setTolerance() @@ -154,7 +280,7 @@ public: */ Index maxIterations() const { - return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations; + return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations; } /** Sets the max number of iterations. @@ -234,25 +360,22 @@ protected: m_maxIterations = -1; m_tolerance = NumTraits::epsilon(); } - - template - void grab(const EigenBase &A) + + typedef internal::generic_matrix_wrapper MatrixWrapper; + typedef typename MatrixWrapper::ActualMatrixType ActualMatrixType; + + const ActualMatrixType& matrix() const { - mp_matrix.~Ref(); - ::new (&mp_matrix) Ref(A.derived()); + return m_matrixWrapper.matrix(); } - void grab(const Ref &A) + template + void grab(const InputType &A) { - if(&(A.derived()) != &mp_matrix) - { - mp_matrix.~Ref(); - ::new (&mp_matrix) Ref(A); - } + m_matrixWrapper.grab(A); } - MatrixType m_dummy; - Ref mp_matrix; + MatrixWrapper m_matrixWrapper; Preconditioner m_preconditioner; Index m_maxIterations; diff --git a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index 1d819927e..0aea0e099 100644 --- a/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -119,6 +119,8 @@ struct traits > * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner * + * \implsparsesolverconcept + * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations * and NumTraits::epsilon() for the tolerance. 
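The grab() members above re-seat an Eigen::Ref by ending its lifetime and reconstructing it in place, since Ref has no rebinding assignment. A standalone sketch of that placement-new idiom:

  #include <new>
  #include <Eigen/Core>

  int main()
  {
    Eigen::MatrixXd A = Eigen::MatrixXd::Identity(2, 2);
    Eigen::MatrixXd B = 2.0 * A;

    Eigen::Ref<const Eigen::MatrixXd> ref(A);
    // Destroy, then reconstruct in the same storage, as grab() does.
    ref.~Ref();
    ::new (&ref) Eigen::Ref<const Eigen::MatrixXd>(B);

    return ref(0, 0) == 2.0 ? 0 : 1;  // ref now views B
  }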
@@ -147,7 +149,7 @@ template< typename _MatrixType, typename _Preconditioner> class LeastSquaresConjugateGradient : public IterativeSolverBase > { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -173,7 +175,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit LeastSquaresConjugateGradient(const MatrixType& A) : Base(A) {} + template + explicit LeastSquaresConjugateGradient(const EigenBase& A) : Base(A.derived()) {} ~LeastSquaresConjugateGradient() {} @@ -190,7 +193,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::least_square_conjugate_gradient(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); + internal::least_square_conjugate_gradient(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 07a87cbc6..0c4d63923 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -10,12 +10,14 @@ #ifndef EIGEN_LU_H #define EIGEN_LU_H -namespace Eigen { +namespace Eigen { namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; enum { Flags = 0 }; }; @@ -53,21 +55,18 @@ template struct traits > * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) + // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - Options = MatrixType::Options, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename internal::traits::StorageKind StorageKind; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_row_type::type IntRowVectorType; typedef typename internal::plain_col_type::type IntColVectorType; typedef PermutationMatrix PermutationQType; @@ -223,6 +222,7 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ + // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. 
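  // [Editorial sketch] With FullPivLU deriving from SolverBase, the usual
  // solving pattern applies together with the rank-revealing queries; the
  // values below are illustrative only:
  //   MatrixXd A(3,3);  A << 1,2,3, 4,5,6, 7,8,10;
  //   VectorXd b(3);    b << 3,3,4;
  //   FullPivLU<MatrixXd> lu(A);
  //   VectorXd x = lu.solve(b);   // solves A x = b
  //   Index r = lu.rank();        // 3 for this non-singular example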
template inline const Solve solve(const MatrixBase& b) const @@ -384,22 +384,26 @@ template class FullPivLU inline Index rows() const { return m_lu.rows(); } inline Index cols() const { return m_lu.cols(); } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + EIGEN_DEVICE_FUNC + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_lu; PermutationPType m_p; PermutationQType m_q; @@ -447,15 +451,15 @@ template FullPivLU& FullPivLU::compute(const EigenBase& matrix) { check_template_parameters(); - + // the permutations are stored as int indices, so just to be sure: eigen_assert(matrix.rows()<=NumTraits::highest() && matrix.cols()<=NumTraits::highest()); - + m_isInitialized = true; m_lu = matrix.derived(); - + computeInPlace(); - + return *this; } @@ -709,7 +713,7 @@ struct image_retval > template template void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const -{ +{ /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}. * So we proceed as follows: * Step 1: compute c = P * rhs. @@ -720,7 +724,7 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), - nonzero_pivots = this->nonzeroPivots(); + nonzero_pivots = this->rank(); eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); @@ -753,6 +757,70 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const for(Index i = nonzero_pivots; i < m_lu.cols(); ++i) dst.row(permutationQ().indices().coeff(i)).setZero(); } + +template +template +void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}, + * and since permutations are real and unitary, we can write this + * as A^T = Q U^T L^T P, + * So we proceed as follows: + * Step 1: compute c = Q^T rhs. + * Step 2: replace c by the solution x to U^T x = c. May or may not exist. + * Step 3: replace c by the solution x to L^T x = c. + * Step 4: result = P^T c. + * If Conjugate is true, replace "^T" by "^*" above. 
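+   *
+   * At the API level, this path is what lets the SolverBase interface reuse
+   * the same factorization for transposed systems, e.g. (illustrative):
+   *   FullPivLU<MatrixXd> lu(A);
+   *   VectorXd xt = lu.transpose().solve(b);   // solves A^T x = b
+   *   VectorXd xa = lu.adjoint().solve(b);     // solves A^* x = b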
+ */ + + const Index rows = this->rows(), cols = this->cols(), + nonzero_pivots = this->rank(); + eigen_assert(rhs.rows() == cols); + const Index smalldim = (std::min)(rows, cols); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(rhs.rows(), rhs.cols()); + + // Step 1 + c = permutationQ().inverse() * rhs; + + if (Conjugate) { + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .adjoint() + .solveInPlace(c.topRows(nonzero_pivots)); + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .adjoint() + .solveInPlace(c.topRows(smalldim)); + } else { + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .solveInPlace(c.topRows(nonzero_pivots)); + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .transpose() + .solveInPlace(c.topRows(smalldim)); + } + + // Step 4 + PermutationPType invp = permutationP().inverse().eval(); + for(Index i = 0; i < smalldim; ++i) + dst.row(invp.indices().coeff(i)) = c.row(i); + for(Index i = smalldim; i < rows; ++i) + dst.row(invp.indices().coeff(i)).setZero(); +} + #endif namespace internal { @@ -765,7 +833,7 @@ struct Assignment >, internal::assign_ typedef FullPivLU LuType; typedef Inverse SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) - { + { dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols())); } }; diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 2c28818a3..50e920609 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -11,12 +11,14 @@ #ifndef EIGEN_PARTIALLU_H #define EIGEN_PARTIALLU_H -namespace Eigen { +namespace Eigen { namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -58,33 +60,29 @@ template struct traits > * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template class PartialPivLU + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) + // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - Options = MatrixType::Options, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename internal::traits::StorageKind StorageKind; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef PermutationMatrix PermutationType; typedef Transpositions TranspositionType; typedef typename MatrixType::PlainObject PlainObject; /** - * \brief Default Constructor. - * - * The default constructor is useful in cases in which the user intends to - * perform decompositions via PartialPivLU::compute(const MatrixType&). - */ + * \brief Default Constructor. + * + * The default constructor is useful in cases in which the user intends to + * perform decompositions via PartialPivLU::compute(const MatrixType&). 
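+     *
+     * For instance (illustrative):
+     * \code
+     * PartialPivLU<MatrixXd> lu;   // nothing decomposed yet
+     * lu.compute(A);               // factorizes A as PA = LU
+     * VectorXd x = lu.solve(b);
+     * \endcode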
+ */ PartialPivLU(); /** \brief Default Constructor with memory preallocation @@ -145,6 +143,7 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ + // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve solve(const MatrixBase& b) const @@ -185,7 +184,7 @@ template class PartialPivLU inline Index rows() const { return m_lu.rows(); } inline Index cols() const { return m_lu.cols(); } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -206,17 +205,44 @@ template class PartialPivLU m_lu.template triangularView().solveInPlace(dst); // Step 3 - m_lu.template triangularView().solveInPlace(dst); + m_lu.template triangularView().solveInPlace(dst); + } + + template + EIGEN_DEVICE_FUNC + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { + /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + * So we proceed as follows: + * Step 1: compute c = Pb. + * Step 2: replace c by the solution x to Lx = c. + * Step 3: replace c by the solution x to Ux = c. + */ + + eigen_assert(rhs.rows() == m_lu.cols()); + + if (Conjugate) { + // Step 1 + dst = m_lu.template triangularView().adjoint().solve(rhs); + // Step 2 + m_lu.template triangularView().adjoint().solveInPlace(dst); + } else { + // Step 1 + dst = m_lu.template triangularView().transpose().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose().solveInPlace(dst); + } + // Step 3 + dst = permutationP().transpose() * dst; } #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + MatrixType m_lu; PermutationType m_p; TranspositionType m_rowsTranspositions; @@ -295,7 +321,7 @@ struct partial_lu_impl { Index rrows = rows-k-1; Index rcols = cols-k-1; - + Index row_of_biggest_in_col; Score biggest_in_corner = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col); @@ -436,10 +462,10 @@ template PartialPivLU& PartialPivLU::compute(const EigenBase& matrix) { check_template_parameters(); - + // the row permutation is stored as int indices, so just to be sure: eigen_assert(matrix.rows()::highest()); - + m_lu = matrix.derived(); eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); @@ -481,7 +507,7 @@ MatrixType PartialPivLU::reconstructedMatrix() const return res; } -/***** Implementation of solve() *****************************************************/ +/***** Implementation details *****************************************************/ namespace internal { @@ -492,7 +518,7 @@ struct Assignment >, internal::assi typedef PartialPivLU LuType; typedef Inverse SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) - { + { dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols())); } }; diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index 4e73edf5b..1999fd289 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -141,6 +141,10 @@ class PastixBase : public SparseSolverBase typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix Vector; typedef SparseMatrix ColSpMatrix; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; public: @@ -398,7 +402,9 @@ bool PastixBase::_solve_impl(const MatrixBase &b, MatrixBase &x * NOTE : Note 
that if the analysis and factorization phase are called separately, * the input matrix will be symmetrized at each call, hence it is advised to * symmetrize the matrix in a end-user program and set \p IsStrSym to true - * + * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers * */ @@ -509,7 +515,9 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> > * * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX - * + * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template @@ -590,7 +598,9 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> > * * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX - * + * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 234e3213b..7c238ce3c 100755 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -117,7 +117,9 @@ class PardisoImpl : public SparseSolverBase typedef Matrix IntColVectorType; typedef Array ParameterType; enum { - ScalarIsComplex = NumTraits::IsComplex + ScalarIsComplex = NumTraits::IsComplex, + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic }; PardisoImpl() @@ -371,6 +373,8 @@ void PardisoImpl::_solve_impl(const MatrixBase &b, MatrixBase * * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template @@ -421,6 +425,8 @@ class PardisoLU : public PardisoImpl< PardisoLU > * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used. * Upper|Lower can be used to tell both triangular parts can be used as input. * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template @@ -479,6 +485,8 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > * Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix. * Upper|Lower can be used to tell both triangular parts can be used as input. * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template diff --git a/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/Eigen/src/QR/ColPivHouseholderQR_MKL.h old mode 100644 new mode 100755 index 7b6ba0a5e..1203d0d36 --- a/Eigen/src/QR/ColPivHouseholderQR_MKL.h +++ b/Eigen/src/QR/ColPivHouseholderQR_MKL.h @@ -41,10 +41,10 @@ namespace Eigen { /** \internal Specialization for the data types supported by MKL */ #define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \ -template<> inline \ +template<> template inline \ ColPivHouseholderQR >& \ ColPivHouseholderQR >::compute( \ - const Matrix& matrix) \ + const EigenBase& matrix) \ \ { \ using std::abs; \ @@ -52,9 +52,9 @@ ColPivHouseholderQR - * NOTE - * - */ + * \ingroup SPQRSupport_Module + * \class SPQR + * \brief Sparse QR factorization based on SuiteSparseQR library + * + * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition + * of sparse matrices. The result is then used to solve linear leasts_square systems. 
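+  *
+  * Like every solver tagged \implsparsesolverconcept in this changeset, SPQR
+  * exposes the common compute()/solve()/info() interface, so generic code can
+  * be written once against it. Editorial sketch (\c solve_with is a
+  * hypothetical helper, not part of Eigen):
+  * \code
+  * template<typename Solver>
+  * VectorXd solve_with(Solver& solver, const SparseMatrix<double>& A, const VectorXd& b)
+  * {
+  *   solver.compute(A);                   // analyzePattern() + factorize()
+  *   if(solver.info() != Success)
+  *     return VectorXd::Zero(b.size());   // illustrative fallback only
+  *   return solver.solve(b);
+  * }
+  * \endcode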
+ * Clearly, a QR factorization is returned such that A*P = Q*R where : + * + * P is the column permutation. Use colsPermutation() to get it. + * + * Q is the orthogonal matrix represented as Householder reflectors. + * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose. + * You can then apply it to a vector. + * + * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix. + * NOTE : The Index type of R is always SuiteSparse_long. You can get it with SPQR::Index + * + * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<> + * + * \implsparsesolverconcept + * + * + */ template class SPQR : public SparseSolverBase > { @@ -63,9 +65,13 @@ class SPQR : public SparseSolverBase > public: typedef typename _MatrixType::Scalar Scalar; typedef typename _MatrixType::RealScalar RealScalar; - typedef UF_long StorageIndex ; + typedef SuiteSparse_long StorageIndex ; typedef SparseMatrix MatrixType; typedef Map > PermutationType; + enum { + ColsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic + }; public: SPQR() : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h old mode 100644 new mode 100755 index e29d36cf2..59c965e15 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -539,7 +539,7 @@ template class JacobiSVD * according to the specified problem size. * \sa JacobiSVD() */ - explicit JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0) + JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0) { allocate(rows, cols, computationOptions); } @@ -666,7 +666,7 @@ void JacobiSVD::allocate(Index rows, Index cols, u if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); - if(m_cols!=m_cols) m_scaledMatrix.resize(rows,cols); + if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols); } template diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index f56298e8c..1343eb15c 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -71,6 +71,11 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef Matrix VectorType; typedef Matrix VectorI; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + public: using Base::derived; @@ -319,6 +324,8 @@ template struct traits or NaturalOrdering<>. Default is AMDOrdering<> * + * \implsparsesolverconcept + * * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering */ template @@ -408,6 +415,8 @@ public: * or Upper. Default is Lower. * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<> * + * \implsparsesolverconcept + * * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering */ template diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 6e664515d..0f6835846 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
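// [Editorial sketch] Usage of the SimplicialLDLT solver documented above, on
// a trivially SPD system; all values are illustrative:
//   SparseMatrix<double> A(2,2);
//   A.insert(0,0) = 4; A.insert(1,1) = 9;
//   VectorXd b(2); b << 8, 27;
//   SimplicialLDLT<SparseMatrix<double> > ldlt(A);
//   VectorXd x = ldlt.solve(b);   // x = (2, 3)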
// -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -138,7 +138,7 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; typedef SparseMatrix ColMajorMatrixAux; - typedef typename sparse_eval::type ColMajorMatrix; + typedef typename sparse_eval::type ColMajorMatrix; // If the result is tall and thin (in the extreme case a column vector) // then it is faster to sort the coefficients inplace instead of transposing twice. @@ -255,6 +255,89 @@ struct conservative_sparse_sparse_product_selector +static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) +{ + typedef typename remove_all::type::Scalar Scalar; + Index cols = rhs.outerSize(); + eigen_assert(lhs.outerSize() == rhs.innerSize()); + + evaluator lhsEval(lhs); + evaluator rhsEval(rhs); + + for (Index j=0; j::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) + { + Scalar y = rhsIt.value(); + Index k = rhsIt.index(); + for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) + { + Index i = lhsIt.index(); + Scalar x = lhsIt.value(); + res.coeffRef(i,j) += x * y; + } + } + } +} + + +} // end namespace internal + +namespace internal { + +template::Flags&RowMajorBit) ? RowMajor : ColMajor, + int RhsStorageOrder = (traits::Flags&RowMajorBit) ? RowMajor : ColMajor> +struct sparse_sparse_to_dense_product_selector; + +template +struct sparse_sparse_to_dense_product_selector +{ + static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) + { + internal::sparse_sparse_to_dense_product_impl(lhs, rhs, res); + } +}; + +template +struct sparse_sparse_to_dense_product_selector +{ + static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) + { + typedef SparseMatrix ColMajorMatrix; + ColMajorMatrix lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); + } +}; + +template +struct sparse_sparse_to_dense_product_selector +{ + static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) + { + typedef SparseMatrix ColMajorMatrix; + ColMajorMatrix rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); + } +}; + +template +struct sparse_sparse_to_dense_product_selector +{ + static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) + { + Transpose trRes(res); + internal::sparse_sparse_to_dense_product_impl >(rhs, lhs, trRes); + } +}; + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index e984bbdb3..4a8dd12e4 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -64,6 +64,7 @@ struct Sparse2Dense {}; template<> struct AssignmentKind { typedef Sparse2Sparse Kind; }; template<> struct AssignmentKind { typedef Sparse2Sparse Kind; }; template<> struct AssignmentKind { typedef Sparse2Dense Kind; }; +template<> struct AssignmentKind { typedef Sparse2Dense Kind; }; template @@ -132,13 +133,16 @@ struct Assignment } }; -// Sparse to Dense assignment +// Generic Sparse to Dense assignment template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar> struct Assignment { static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + + if(internal::is_same >::value) + dst.setZero(); internal::evaluator srcEval(src); 
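      // Note: the is_same test above makes a plain assignment (dst = src)
      // start from a zeroed destination, while compound updates such as
      // dst += src keep the existing dense coefficients and only combine the
      // nonzeros visited by the iteration below.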
internal::evaluator dstEval(dst); @@ -149,23 +153,6 @@ struct Assignment } }; -template< typename DstXprType, typename SrcXprType, typename Scalar> -struct Assignment, Sparse2Dense, Scalar> -{ - static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) - { - eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); - - dst.setZero(); - internal::evaluator srcEval(src); - internal::evaluator dstEval(dst); - const Index outerEvaluationSize = (internal::evaluator::Flags&RowMajorBit) ? src.rows() : src.cols(); - for (Index j=0; j::InnerIterator i(srcEval,j); i; ++i) - dstEval.coeffRef(i.row(),i.col()) = i.value(); - } -}; - // Specialization for "dst = dec.solve(rhs)" // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error template diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 9afb5327e..10be84856 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -23,6 +23,8 @@ public: enum { IsRowMajor = internal::traits::IsRowMajor }; protected: enum { OuterSize = IsRowMajor ? BlockRows : BlockCols }; + typedef SparseMatrixBase Base; + using Base::convert_index; public: EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) @@ -88,10 +90,11 @@ class sparse_matrix_block_impl { typedef typename internal::remove_all::type _MatrixTypeNested; typedef Block BlockType; + typedef SparseCompressedBase > Base; + using Base::convert_index; public: enum { IsRowMajor = internal::traits::IsRowMajor }; - typedef SparseCompressedBase > Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) + EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) protected: typedef typename Base::IndexVector IndexVector; enum { OuterSize = IsRowMajor ? BlockRows : BlockCols }; @@ -114,7 +117,8 @@ public: // and/or it is not at the end of the nonzeros of the underlying matrix. // 1 - eval to a temporary to avoid transposition and/or aliasing issues - SparseMatrix tmp(other); + Ref > tmp(other.derived()); + eigen_internal_assert(tmp.outerSize()==m_outerSize.value()); // 2 - let's check whether there is enough allocated memory Index nnz = tmp.nonZeros(); @@ -127,6 +131,7 @@ public: ? 
Index(matrix.data().allocatedSize()) + block_size : block_size; + bool update_trailing_pointers = false; if(nnz>free_size) { // realloc manually to reduce copies @@ -135,8 +140,8 @@ public: internal::smart_copy(&m_matrix.data().value(0), &m_matrix.data().value(0) + start, &newdata.value(0)); internal::smart_copy(&m_matrix.data().index(0), &m_matrix.data().index(0) + start, &newdata.index(0)); - internal::smart_copy(&tmp.data().value(0), &tmp.data().value(0) + nnz, &newdata.value(start)); - internal::smart_copy(&tmp.data().index(0), &tmp.data().index(0) + nnz, &newdata.index(start)); + internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, &newdata.value(start)); + internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, &newdata.index(start)); internal::smart_copy(&matrix.data().value(end), &matrix.data().value(end) + tail_size, &newdata.value(start+nnz)); internal::smart_copy(&matrix.data().index(end), &matrix.data().index(end) + tail_size, &newdata.index(start+nnz)); @@ -144,35 +149,53 @@ public: newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz); matrix.data().swap(newdata); + + update_trailing_pointers = true; } else { - // no need to realloc, simply copy the tail at its respective position and insert tmp - matrix.data().resize(start + nnz + tail_size); + if(m_matrix.isCompressed()) + { + // no need to realloc, simply copy the tail at its respective position and insert tmp + matrix.data().resize(start + nnz + tail_size); - internal::smart_memmove(&matrix.data().value(end), &matrix.data().value(end) + tail_size, &matrix.data().value(start + nnz)); - internal::smart_memmove(&matrix.data().index(end), &matrix.data().index(end) + tail_size, &matrix.data().index(start + nnz)); + internal::smart_memmove(&matrix.data().value(end), &matrix.data().value(end) + tail_size, &matrix.data().value(start + nnz)); + internal::smart_memmove(&matrix.data().index(end), &matrix.data().index(end) + tail_size, &matrix.data().index(start + nnz)); - internal::smart_copy(&tmp.data().value(0), &tmp.data().value(0) + nnz, &matrix.data().value(start)); - internal::smart_copy(&tmp.data().index(0), &tmp.data().index(0) + nnz, &matrix.data().index(start)); + update_trailing_pointers = true; + } + + internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, &matrix.data().value(start)); + internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, &matrix.data().index(start)); } - - // update innerNonZeros - if(!m_matrix.isCompressed()) - for(Index j=0; j(nnz - block_size); - for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k) + else { - matrix.outerIndexPtr()[k] += offset; + StorageIndex p = StorageIndex(start); + for(Index k=0; k(nnz - block_size); + for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k) + { + matrix.outerIndexPtr()[k] += offset; + } } return derived(); @@ -289,7 +312,7 @@ private: template BlockImpl(const SparseMatrixBase& xpr, Index i); template BlockImpl(const SparseMatrixBase& xpr); }; - + //---------- /** \returns the \a outer -th column (resp. 
row) of the matrix \c *this if \c *this @@ -339,7 +362,9 @@ template class BlockImpl : public SparseMatrixBase >, internal::no_assignment_operator { - typedef Block BlockType; + typedef Block BlockType; + typedef SparseMatrixBase Base; + using Base::convert_index; public: enum { IsRowMajor = internal::traits::IsRowMajor }; EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType) diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h index 0dbb94faf..c223e4f42 100644 --- a/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -28,7 +28,7 @@ class SparseCompressedBase { public: typedef SparseMatrixBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) using Base::operator=; using Base::IsRowMajor; @@ -45,13 +45,14 @@ class SparseCompressedBase /** \returns the number of non zero coefficients */ inline Index nonZeros() const { - if(isCompressed()) + if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0) + return derived().nonZeros(); + else if(isCompressed()) return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0]; else if(derived().outerSize()==0) return 0; else return innerNonZeros().sum(); - } /** \returns a const pointer to the array of values. @@ -74,10 +75,12 @@ class SparseCompressedBase /** \returns a const pointer to the array of the starting positions of the inner vectors. * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 for SparseVector * \sa valuePtr(), innerIndexPtr() */ inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); } /** \returns a non-const pointer to the array of the starting positions of the inner vectors. * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 for SparseVector * \sa valuePtr(), innerIndexPtr() */ inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); } @@ -92,7 +95,12 @@ class SparseCompressedBase /** \returns whether \c *this is in compressed form. */ inline bool isCompressed() const { return innerNonZeroPtr()==0; } - + + protected: + /** Default constructor. Do nothing. 
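     * It is protected so that SparseCompressedBase is only instantiated as a
     * base of its derived classes; the private templated copy constructor
     * declared below similarly prevents copies through the base class.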
*/ + SparseCompressedBase() {} + private: + template explicit SparseCompressedBase(const SparseCompressedBase&); }; template @@ -100,12 +108,33 @@ class SparseCompressedBase::InnerIterator { public: InnerIterator(const SparseCompressedBase& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer) { - if(mat.isCompressed()) - m_end = mat.outerIndexPtr()[outer+1]; + if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0) + { + m_id = 0; + m_end = mat.nonZeros(); + } else - m_end = m_id + mat.innerNonZeroPtr()[outer]; + { + m_id = mat.outerIndexPtr()[outer]; + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + } + + explicit InnerIterator(const SparseCompressedBase& mat) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_id(0), m_end(mat.nonZeros()) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + } + + explicit InnerIterator(const internal::CompressedStorage& data) + : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_id(0), m_end(data.size()) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } inline InnerIterator& operator++() { m_id++; return *this; } @@ -114,16 +143,16 @@ class SparseCompressedBase::InnerIterator inline Scalar& valueRef() { return const_cast(m_values[m_id]); } inline StorageIndex index() const { return m_indices[m_id]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } + inline Index outer() const { return m_outer.value(); } + inline Index row() const { return IsRowMajor ? m_outer.value() : index(); } + inline Index col() const { return IsRowMajor ? 
index() : m_outer.value(); } inline operator bool() const { return (m_id < m_end); } protected: const Scalar* m_values; const StorageIndex* m_indices; - const Index m_outer; + const internal::variable_if_dynamic m_outer; Index m_id; Index m_end; private: @@ -138,12 +167,33 @@ class SparseCompressedBase::ReverseInnerIterator { public: ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer) { - if(mat.isCompressed()) - m_id = mat.outerIndexPtr()[outer+1]; + if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0) + { + m_start = 0; + m_id = mat.nonZeros(); + } else - m_id = m_start + mat.innerNonZeroPtr()[outer]; + { + m_start.value() = mat.outerIndexPtr()[outer]; + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start.value() + mat.innerNonZeroPtr()[outer]; + } + } + + explicit ReverseInnerIterator(const SparseCompressedBase& mat) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros()) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + } + + explicit ReverseInnerIterator(const internal::CompressedStorage& data) + : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_start(0), m_id(data.size()) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); } inline ReverseInnerIterator& operator--() { --m_id; return *this; } @@ -152,18 +202,18 @@ class SparseCompressedBase::ReverseInnerIterator inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } inline StorageIndex index() const { return m_indices[m_id-1]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } + inline Index outer() const { return m_outer.value(); } + inline Index row() const { return IsRowMajor ? m_outer.value() : index(); } + inline Index col() const { return IsRowMajor ? 
index() : m_outer.value(); } - inline operator bool() const { return (m_id > m_start); } + inline operator bool() const { return (m_id > m_start.value()); } protected: const Scalar* m_values; const StorageIndex* m_indices; - const Index m_outer; + const internal::variable_if_dynamic m_outer; Index m_id; - const Index m_start; + const internal::variable_if_dynamic m_start; }; namespace internal { @@ -181,8 +231,14 @@ struct evaluator > Flags = Derived::Flags }; - evaluator() : m_matrix(0) {} - explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + evaluator() : m_matrix(0) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + explicit evaluator(const Derived &mat) : m_matrix(&mat) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_matrix->nonZeros(); diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 973b80095..d9420ac63 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -35,13 +35,13 @@ class CwiseBinaryOpImpl { public: typedef CwiseBinaryOp Derived; + typedef SparseMatrixBase Base; EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) CwiseBinaryOpImpl() { - typedef typename internal::traits::StorageKind LhsStorageKind; - typedef typename internal::traits::StorageKind RhsStorageKind; EIGEN_STATIC_ASSERT(( - (!internal::is_same::value) + (!internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value) || ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))), THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH); } @@ -138,7 +138,10 @@ public: : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) - { } + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate(); @@ -219,7 +222,10 @@ public: : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) - { } + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate()); @@ -288,7 +294,10 @@ public: : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) - { } + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_rhsImpl.nonZerosEstimate(); @@ -358,7 +367,10 @@ public: : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) - { } + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_lhsImpl.nonZerosEstimate(); @@ -410,10 +422,10 @@ Derived& SparseMatrixBase::operator-=(const DiagonalBase& template template -EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE +EIGEN_STRONG_INLINE const typename SparseMatrixBase::template CwiseProductDenseReturnType::Type SparseMatrixBase::cwiseProduct(const MatrixBase &other) const { - return EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE(derived(), other.derived()); + return typename CwiseProductDenseReturnType::Type(derived(), other.derived()); } } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index 469bac36e..fe4a97120 100644 --- 
a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -29,7 +29,11 @@ struct unary_evaluator, IteratorBased> Flags = XprType::Flags }; - explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {} + explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_argImpl.nonZerosEstimate(); @@ -108,7 +112,11 @@ struct unary_evaluator, IteratorBased> Flags = XprType::Flags }; - explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {} + explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } protected: typedef typename evaluator::InnerIterator EvalIterator; diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 67b3c9c1b..87c946b9b 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -160,8 +160,8 @@ struct generic_product_impl template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { - typedef typename nested_eval::type LhsNested; - typedef typename nested_eval::type RhsNested; + typedef typename nested_eval::type LhsNested; + typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhs); RhsNested rhsNested(rhs); internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha); @@ -182,8 +182,8 @@ struct generic_product_impl template static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { - typedef typename nested_eval::type LhsNested; - typedef typename nested_eval::type RhsNested; + typedef typename nested_eval::type LhsNested; + typedef typename nested_eval::type RhsNested; LhsNested lhsNested(lhs); RhsNested rhsNested(rhs); @@ -221,7 +221,7 @@ protected: public: enum { Flags = NeedToTranspose ? 
RowMajorBit : 0, - CoeffReadCost = Dynamic + CoeffReadCost = HugeCost }; class InnerIterator : public LhsIterator @@ -263,12 +263,16 @@ public: sparse_dense_outer_product_evaluator(const Lhs1 &lhs, const ActualRhs &rhs) : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) - {} + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } // transpose case sparse_dense_outer_product_evaluator(const ActualRhs &rhs, const Lhs1 &lhs) : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) - {} + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } protected: const LhsArg m_lhs; @@ -278,7 +282,7 @@ protected: // sparse * dense outer product template -struct product_evaluator, OuterProduct, SparseShape, DenseShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, OuterProduct, SparseShape, DenseShape> : sparse_dense_outer_product_evaluator { typedef sparse_dense_outer_product_evaluator Base; @@ -293,7 +297,7 @@ struct product_evaluator, OuterProduct, Sparse }; template -struct product_evaluator, OuterProduct, DenseShape, SparseShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, OuterProduct, DenseShape, SparseShape> : sparse_dense_outer_product_evaluator { typedef sparse_dense_outer_product_evaluator Base; diff --git a/Eigen/src/SparseCore/SparseDiagonalProduct.h b/Eigen/src/SparseCore/SparseDiagonalProduct.h index 42e29cf70..e4af49e09 100644 --- a/Eigen/src/SparseCore/SparseDiagonalProduct.h +++ b/Eigen/src/SparseCore/SparseDiagonalProduct.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2009-2014 Gael Guennebaud +// Copyright (C) 2009-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -35,22 +35,22 @@ template struct sparse_diagonal_product_evaluator; template -struct product_evaluator, ProductTag, DiagonalShape, SparseShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, ProductTag, DiagonalShape, SparseShape> : public sparse_diagonal_product_evaluator { typedef Product XprType; - enum { CoeffReadCost = Dynamic, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags + enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags typedef sparse_diagonal_product_evaluator Base; explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {} }; template -struct product_evaluator, ProductTag, SparseShape, DiagonalShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, ProductTag, SparseShape, DiagonalShape> : public sparse_diagonal_product_evaluator, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> { typedef Product XprType; - enum { CoeffReadCost = Dynamic, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags + enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags typedef sparse_diagonal_product_evaluator, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> Base; explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {} diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 7c512d9fe..36c09ab0c 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -63,7 +63,7 @@ class SparseMapBase Index m_outerSize; Index m_innerSize; - Index m_nnz; + Array m_zero_nnz; IndexPointer m_outerIndex; IndexPointer m_innerIndices; ScalarPointer m_values; @@ -75,6 +75,7 @@ class SparseMapBase inline Index cols() const { return IsRowMajor ? 
m_innerSize : m_outerSize; } inline Index innerSize() const { return m_innerSize; } inline Index outerSize() const { return m_outerSize; } + inline Index nonZeros() const { return m_zero_nnz[1]; } bool isCompressed() const { return m_innerNonZeros==0; } @@ -107,12 +108,21 @@ class SparseMapBase inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr, ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0) - : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr), + : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_zero_nnz(0,internal::convert_index(nnz)), m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr) {} + // for vectors + inline SparseMapBase(Index size, Index nnz, IndexPointer innerIndexPtr, ScalarPointer valuePtr) + : m_outerSize(1), m_innerSize(size), m_zero_nnz(0,internal::convert_index(nnz)), m_outerIndex(m_zero_nnz.data()), + m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(0) + {} + /** Empty destructor */ inline ~SparseMapBase() {} + + protected: + inline SparseMapBase() {} }; template @@ -163,8 +173,16 @@ class SparseMapBase : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {} + // for vectors + inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr) + : Base(size, nnz, innerIndexPtr, valuePtr) + {} + /** Empty destructor */ inline ~SparseMapBase() {} + + protected: + inline SparseMapBase() {} }; template @@ -173,7 +191,7 @@ class Map, Options, StrideType> { public: typedef SparseMapBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; public: @@ -193,7 +211,7 @@ class Map, Options, StrideType { public: typedef SparseMapBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; public: diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index f18829866..91bada40f 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -92,11 +92,12 @@ template class SparseMatrix : public SparseCompressedBase > { - public: typedef SparseCompressedBase Base; + using Base::convert_index; + public: using Base::isCompressed; using Base::nonZeros; - _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) using Base::operator+=; using Base::operator-=; @@ -436,7 +437,13 @@ class SparseMatrix template void setFromTriplets(const InputIterators& begin, const InputIterators& end); - void sumupDuplicates(); + template + void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func); + + void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op()); } + + template + void collapseDuplicates(DupFunctor dup_func = DupFunctor()); //--- @@ -508,7 +515,6 @@ class SparseMatrix void prune(const KeepFunc& keep = KeepFunc()) { // TODO optimize the uncompressed mode to avoid moving and allocating the data twice - // TODO also implement a unit test makeCompressed(); StorageIndex k = 0; @@ -532,7 +538,7 @@ class SparseMatrix } /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched. 
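   * For example (illustrative):
   * \code
   * SparseMatrix<double> m(3,3);
   * m.insert(0,0) = 1;
   * m.conservativeResize(4,5);   // the (0,0) entry is preserved
   * \endcode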
- * \sa resizeNonZeros(Index), reserve(), setZero() + * \sa reserve(), setZero() */ void conservativeResize(Index rows, Index cols) { @@ -600,7 +606,7 @@ class SparseMatrix * This function does not free the currently allocated memory. To release as much memory as possible, * call \code mat.data().squeeze(); \endcode after resizing it. * - * \sa resizeNonZeros(Index), reserve(), setZero() + * \sa reserve(), setZero() */ void resize(Index rows, Index cols) { @@ -627,7 +633,6 @@ class SparseMatrix * Resize the nonzero vector to \a size */ void resizeNonZeros(Index size) { - // TODO remove this function m_data.resize(size); } @@ -665,8 +670,15 @@ class SparseMatrix YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) check_template_parameters(); const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator::Flags & RowMajorBit); - if (needToTranspose) *this = other.derived(); - else internal::call_assignment_no_alias(*this, other.derived()); + if (needToTranspose) + *this = other.derived(); + else + { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + internal::call_assignment_no_alias(*this, other.derived()); + } } /** Constructs a sparse matrix from the sparse selfadjoint view \a other */ @@ -717,7 +729,8 @@ class SparseMatrix m_data.swap(other.m_data); } - /** Sets *this to the identity matrix */ + /** Sets *this to the identity matrix. + * This function also turns the matrix into compressed mode, and drops any reserved memory. */ inline void setIdentity() { eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES"); @@ -725,6 +738,8 @@ class SparseMatrix Eigen::Map(&this->m_data.index(0), rows()).setLinSpaced(0, StorageIndex(rows()-1)); Eigen::Map(&this->m_data.value(0), rows()).setOnes(); Eigen::Map(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows())); + std::free(m_innerNonZeros); + m_innerNonZeros = 0; } inline SparseMatrix& operator=(const SparseMatrix& other) { @@ -883,10 +898,9 @@ private: namespace internal { -template -void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0) +template +void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func) { - EIGEN_UNUSED_VARIABLE(Options); enum { IsRowMajor = SparseMatrixType::IsRowMajor }; typedef typename SparseMatrixType::Scalar Scalar; typedef typename SparseMatrixType::StorageIndex StorageIndex; @@ -909,7 +923,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa trMat.insertBackUncompressed(it->row(),it->col()) = it->value(); // pass 3: - trMat.sumupDuplicates(); + trMat.collapseDuplicates(dup_func); } // pass 4: transposed copy -> implicit sorting @@ -960,12 +974,29 @@ template template void SparseMatrix::setFromTriplets(const InputIterators& begin, const InputIterators& end) { - internal::set_from_triplets(begin, end, *this); + internal::set_from_triplets >(begin, end, *this, internal::scalar_sum_op()); +} + +/** The same as setFromTriplets() but when duplicates are encountered the functor \a dup_func is applied: + * \code + * value = dup_func(OldValue, NewValue) + * \endcode + * Here is a C++11 example keeping the latest entry only: + * \code + * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; }); + * \endcode + */ +template +template +void SparseMatrix::setFromTriplets(const InputIterators& begin,
const InputIterators& end, DupFunctor dup_func) +{ + internal::set_from_triplets, DupFunctor>(begin, end, *this, dup_func); } /** \internal */ template -void SparseMatrix::sumupDuplicates() +template +void SparseMatrix::collapseDuplicates(DupFunctor dup_func) { eigen_assert(!isCompressed()); // TODO, in practice we should be able to use m_innerNonZeros for that task @@ -983,7 +1014,7 @@ void SparseMatrix::sumupDuplicates() if(wi(i)>=start) { // we already meet this entry => accumulate it - m_data.value(wi(i)) += m_data.value(k); + m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k)); } else { @@ -1017,6 +1048,9 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::Flags & RowMajorBit); if (needToTranspose) { + #ifdef EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN + EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN + #endif // two passes algorithm: // 1 - compute the number of coeffs per dest inner vector // 2 - do the actual copy/eval @@ -1101,6 +1135,14 @@ typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Op for(Index j=1; j<=m_outerSize; ++j) m_outerIndex[j] = end; } + else + { + // turn the matrix into non-compressed mode + m_innerNonZeros = static_cast(std::malloc(m_outerSize * sizeof(StorageIndex))); + if(!m_innerNonZeros) internal::throw_std_bad_alloc(); + for(Index j=0; j::Scalar& SparseMatrix<_Scalar,_Op { // make sure the matrix is compatible to random un-compressed insertion: m_data.resize(m_data.allocatedSize()); - this->reserveInnerVectors(Array::Constant(2*m_outerSize, convert_index(m_outerSize))); + this->reserveInnerVectors(Array::Constant(m_outerSize, 2)); } return insertUncompressed(row,col); diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 4e720904e..648ae1f8a 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -23,7 +23,14 @@ namespace Eigen { * This class can be extended with the help of the plugin mechanism described on the page * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN. 
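 *
 * For instance (illustrative), a plugin file can inject an extra member:
 * \code
 * // my_sparse_plugin.h (hypothetical file name)
 * inline Scalar sumAbs() const { return derived().cwiseAbs().sum(); }
 * \endcode
 * enabled by defining \c EIGEN_SPARSEMATRIXBASE_PLUGIN to \c "my_sparse_plugin.h".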
*/ -template class SparseMatrixBase : public EigenBase +template class SparseMatrixBase +#ifndef EIGEN_PARSED_BY_DOXYGEN + : public internal::special_scalar_op_base::Scalar, + typename NumTraits::Scalar>::Real, + EigenBase > +#else + : public EigenBase +#endif // not EIGEN_PARSED_BY_DOXYGEN { public: @@ -42,7 +49,7 @@ template class SparseMatrixBase : public EigenBase >::type PacketReturnType; typedef SparseMatrixBase StorageBaseType; - typedef EigenBase Base; + typedef Matrix IndexVector; typedef Matrix ScalarVector; @@ -134,6 +141,10 @@ template class SparseMatrixBase : public EigenBase inline Derived& derived() { return *static_cast(this); } inline Derived& const_cast_derived() const { return *static_cast(const_cast(this)); } + + typedef internal::special_scalar_op_base > Base; + using Base::operator*; + using Base::operator/; #endif // not EIGEN_PARSED_BY_DOXYGEN #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase @@ -251,20 +262,18 @@ template class SparseMatrixBase : public EigenBase Derived& operator*=(const Scalar& other); Derived& operator/=(const Scalar& other); - #define EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE \ - CwiseBinaryOp< \ - internal::scalar_product_op< \ - typename internal::scalar_product_traits< \ - typename internal::traits::Scalar, \ - typename internal::traits::Scalar \ - >::ReturnType \ - >, \ - const Derived, \ - const OtherDerived \ - > + template struct CwiseProductDenseReturnType { + typedef CwiseBinaryOp::Scalar, + typename internal::traits::Scalar + >::ReturnType>, + const Derived, + const OtherDerived + > Type; + }; template - EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE + EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType::Type cwiseProduct(const MatrixBase &other) const; // sparse * diagonal @@ -281,7 +290,7 @@ template class SparseMatrixBase : public EigenBase // sparse * sparse template - const Product + const Product operator*(const SparseMatrixBase &other) const; // sparse * dense diff --git a/Eigen/src/SparseCore/SparsePermutation.h b/Eigen/src/SparseCore/SparsePermutation.h index d63607b6c..ef38357ae 100644 --- a/Eigen/src/SparseCore/SparsePermutation.h +++ b/Eigen/src/SparseCore/SparsePermutation.h @@ -16,15 +16,17 @@ namespace Eigen { namespace internal { -template -struct permutation_matrix_product +template +struct permutation_matrix_product { - typedef typename remove_all::type MatrixTypeNestedCleaned; - typedef typename MatrixTypeNestedCleaned::Scalar Scalar; - typedef typename MatrixTypeNestedCleaned::StorageIndex StorageIndex; + typedef typename nested_eval::type MatrixType; + typedef typename remove_all::type MatrixTypeCleaned; + + typedef typename MatrixTypeCleaned::Scalar Scalar; + typedef typename MatrixTypeCleaned::StorageIndex StorageIndex; enum { - SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor, + SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor, MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight }; @@ -33,8 +35,9 @@ struct permutation_matrix_product SparseMatrix >::type ReturnType; template - static inline void run(Dest& dst, const PermutationType& perm, const MatrixType& mat) + static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { + MatrixType mat(xpr); if(MoveOuter) { SparseMatrix tmp(mat.rows(), mat.cols()); @@ -50,7 +53,7 @@ struct permutation_matrix_product Index jp = perm.indices().coeff(j); Index jsrc = ((Side==OnTheRight) ^ Transposed) ? 
jp : j; Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j; - for(typename MatrixTypeNestedCleaned::InnerIterator it(mat,jsrc); it; ++it) + for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it) tmp.insertByOuterInner(jdst,it.index()) = it.value(); } dst = tmp; @@ -67,11 +70,11 @@ struct permutation_matrix_product perm_cpy = perm.transpose(); for(Index j=0; j struct product_promote_storage_type >::PlainObject template -struct product_evaluator, ProductTag, PermutationShape, SparseShape, typename traits::Scalar, typename traits::Scalar> - : public evaluator::ReturnType> +struct product_evaluator, ProductTag, PermutationShape, SparseShape> + : public evaluator::ReturnType> { typedef Product XprType; - typedef typename permutation_matrix_product::ReturnType PlainObject; + typedef typename permutation_matrix_product::ReturnType PlainObject; typedef evaluator Base; + enum { + Flags = Base::Flags | EvalBeforeNestingBit + }; + explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } - -protected: + +protected: PlainObject m_result; }; template -struct product_evaluator, ProductTag, SparseShape, PermutationShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, ProductTag, SparseShape, PermutationShape > : public evaluator::ReturnType> { typedef Product XprType; typedef typename permutation_matrix_product::ReturnType PlainObject; typedef evaluator Base; + enum { + Flags = Base::Flags | EvalBeforeNestingBit + }; + explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } - -protected: + +protected: PlainObject m_result; }; @@ -132,35 +143,34 @@ protected: /** \returns the matrix with the permutation applied to the columns */ template -inline const Product +inline const Product operator*(const SparseMatrixBase& matrix, const PermutationBase& perm) -{ return Product(matrix.derived(), perm.derived()); } +{ return Product(matrix.derived(), perm.derived()); } /** \returns the matrix with the permutation applied to the rows */ template -inline const Product +inline const Product operator*( const PermutationBase& perm, const SparseMatrixBase& matrix) -{ return Product(perm.derived(), matrix.derived()); } +{ return Product(perm.derived(), matrix.derived()); } -// TODO, the following specializations should not be needed as Transpose should be a PermutationBase. /** \returns the matrix with the inverse permutation applied to the columns. */ -template -inline const Product > > -operator*(const SparseMatrixBase& matrix, const Transpose >& tperm) +template +inline const Product, AliasFreeProduct> +operator*(const SparseMatrixBase& matrix, const InverseImpl& tperm) { - return Product > >(matrix.derived(), tperm); + return Product, AliasFreeProduct>(matrix.derived(), tperm.derived()); } /** \returns the matrix with the inverse permutation applied to the rows. 
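A usage sketch of the permutation products handled above (square matrix assumed; P.inverse() exercises the new InverseImpl overloads):

#include <Eigen/Sparse>

void example_permute(const Eigen::SparseMatrix<double>& A)  // A assumed square
{
  Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> P(A.rows());
  P.setIdentity();                                   // or fill P.indices() as needed
  Eigen::SparseMatrix<double> Pr = P * A;            // permute the rows
  Eigen::SparseMatrix<double> Pc = A * P.inverse();  // inverse permutation applied to columns
}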
*/ -template -inline const Product >, SparseDerived> -operator*(const Transpose >& tperm, const SparseMatrixBase& matrix) +template +inline const Product, SparseDerived, AliasFreeProduct> +operator*(const InverseImpl& tperm, const SparseMatrixBase& matrix) { - return Product >, SparseDerived>(tperm, matrix.derived()); + return Product, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived()); } } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h index da8919ecc..cbd0db71b 100644 --- a/Eigen/src/SparseCore/SparseProduct.h +++ b/Eigen/src/SparseCore/SparseProduct.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -25,10 +25,10 @@ namespace Eigen { * */ template template -inline const Product +inline const Product SparseMatrixBase::operator*(const SparseMatrixBase &other) const { - return Product(derived(), other.derived()); + return Product(derived(), other.derived()); } namespace internal { @@ -39,6 +39,34 @@ struct generic_product_impl { template static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + evalTo(dst, lhs, rhs, typename evaluator_traits::Shape()); + } + + // dense += sparse * sparse + template + static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, int* = typename enable_if::Shape,DenseShape>::value,int*>::type(0) ) + { + typedef typename nested_eval::type LhsNested; + typedef typename nested_eval::type RhsNested; + LhsNested lhsNested(lhs); + RhsNested rhsNested(rhs); + internal::sparse_sparse_to_dense_product_selector::type, + typename remove_all::type, Dest>::run(lhsNested,rhsNested,dst); + } + + // dense -= sparse * sparse + template + static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, int* = typename enable_if::Shape,DenseShape>::value,int*>::type(0) ) + { + addTo(dst, -lhs, rhs); + } + +protected: + + // sparse = sparse * sparse + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, SparseShape) { typedef typename nested_eval::type LhsNested; typedef typename nested_eval::type RhsNested; @@ -47,6 +75,14 @@ struct generic_product_impl internal::conservative_sparse_sparse_product_selector::type, typename remove_all::type, Dest>::run(lhsNested,rhsNested,dst); } + + // dense = sparse * sparse + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, DenseShape) + { + dst.setZero(); + addTo(dst, lhs, rhs); + } }; // sparse * sparse-triangular @@ -61,6 +97,39 @@ struct generic_product_impl {}; +// dense = sparse-product (can be sparse*sparse, sparse*perm, etc.) +template< typename DstXprType, typename Lhs, typename Rhs> +struct Assignment, internal::assign_op, Sparse2Dense> +{ + typedef Product SrcXprType; + static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) + { + generic_product_impl::evalTo(dst,src.lhs(),src.rhs()); + } +}; + +// dense += sparse-product (can be sparse*sparse, sparse*perm, etc.) 
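The new addTo/subTo paths let a sparse*sparse product accumulate directly into a dense destination; a short sketch (sizes illustrative):

#include <Eigen/Dense>
#include <Eigen/Sparse>

void example_sparse_to_dense(const Eigen::SparseMatrix<double>& A,
                             const Eigen::SparseMatrix<double>& B)
{
  Eigen::MatrixXd D = Eigen::MatrixXd::Zero(A.rows(), B.cols());
  D += A * B;                 // dense += sparse*sparse, no sparse temporary
  D -= A * B;                 // routed through addTo(dst, -lhs, rhs)
  Eigen::MatrixXd E = A * B;  // dense = sparse*sparse: setZero() then addTo()
}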
+template< typename DstXprType, typename Lhs, typename Rhs> +struct Assignment, internal::add_assign_op, Sparse2Dense> +{ + typedef Product SrcXprType; + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) + { + generic_product_impl::addTo(dst,src.lhs(),src.rhs()); + } +}; + +// dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.) +template< typename DstXprType, typename Lhs, typename Rhs> +struct Assignment, internal::sub_assign_op, Sparse2Dense> +{ + typedef Product SrcXprType; + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) + { + generic_product_impl::subTo(dst,src.lhs(),src.rhs()); + } +}; + template struct evaluator > > : public evaluator::PlainObject> diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index 8df62a119..19e06fc80 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -19,7 +19,7 @@ enum { namespace internal { template class SparseRefBase; - + template struct traits, _Options, _StrideType> > : public traits > @@ -27,7 +27,7 @@ struct traits, _Options, _Stride typedef SparseMatrix PlainObjectType; enum { Options = _Options, - Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + Flags = traits::Flags | CompressedAccessBit | NestByRefBit }; template struct match { @@ -48,7 +48,35 @@ struct traits, _Options, _ Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit }; }; - + +template +struct traits, _Options, _StrideType> > + : public traits > +{ + typedef SparseVector PlainObjectType; + enum { + Options = _Options, + Flags = traits::Flags | CompressedAccessBit | NestByRefBit + }; + + template struct match { + enum { + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && Derived::IsVectorAtCompileTime + }; + typedef typename internal::conditional::type type; + }; + +}; + +template +struct traits, _Options, _StrideType> > + : public traits, _Options, _StrideType> > +{ + enum { + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + template struct traits > : public traits {}; @@ -58,7 +86,7 @@ template class SparseRefBase public: typedef SparseMapBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) SparseRefBase() : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) @@ -66,11 +94,13 @@ public: protected: - template void construct(Expression& expr) { - ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); + if(expr.outerIndexPtr()==0) + ::new (static_cast(this)) Base(expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr()); + else + ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); } }; @@ -102,7 +132,7 @@ class Ref, Options, StrideType > public: typedef internal::SparseRefBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -146,7 +176,7 @@ class Ref, Options, StrideType public: typedef internal::SparseRefBase Base; - _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) template inline Ref(const SparseMatrixBase& expr) @@ -170,8 +200,9 @@ class Ref, Options, StrideType { if((Options & int(StandardCompressedFormat)) && 
(!expr.isCompressed())) { - m_object = expr; - Base::construct(m_object); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + ::new (obj) TPlainObjectType(expr); + Base::construct(*obj); } else { @@ -182,17 +213,113 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - m_object = expr; - Base::construct(m_object); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + ::new (obj) TPlainObjectType(expr); + Base::construct(*obj); } protected: - TPlainObjectType m_object; + char m_object_bytes[sizeof(TPlainObjectType)]; }; + +/** + * \ingroup Sparse_Module + * + * \brief A sparse vector expression referencing an existing sparse vector expression + * + * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data + * \tparam Options Not used for SparseVector. + * \tparam StrideType Only used for dense Ref + * + * \sa class Ref + */ +template +class Ref, Options, StrideType > + : public internal::SparseRefBase, Options, StrideType > > +{ + typedef SparseVector PlainObjectType; + typedef internal::traits Traits; + template + inline Ref(const SparseVector& expr); + public: + + typedef internal::SparseRefBase Base; + EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + inline Ref(SparseVector& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.derived()); + } + + template + inline Ref(const SparseCompressedBase& expr) + #else + template + inline Ref(SparseCompressedBase& expr) + #endif + { + EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + Base::construct(expr.const_cast_derived()); + } +}; + +// this is the const ref version +template +class Ref, Options, StrideType> + : public internal::SparseRefBase, Options, StrideType> > +{ + typedef SparseVector TPlainObjectType; + typedef internal::traits Traits; + public: + + typedef internal::SparseRefBase Base; + EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + template + inline Ref(const SparseMatrixBase& expr) + { + construct(expr.derived(), typename Traits::template match::type()); + } + + inline Ref(const Ref& other) : Base(other) { + // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy + } + + template + inline Ref(const RefBase& other) { + construct(other.derived(), typename Traits::template match::type()); + } + + protected: + + template + void construct(const Expression& expr,internal::true_type) + { + Base::construct(expr); + } + + template + void construct(const Expression& expr, internal::false_type) + { + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + ::new (obj) TPlainObjectType(expr); + Base::construct(*obj); + } + + protected: + char m_object_bytes[sizeof(TPlainObjectType)]; +}; + namespace internal { +// FIXME shall we introduce a general evaluatior_ref that we can specialize for any sparse object once, and thus remove this copy-pasta thing... 
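A sketch of what the in-place byte storage enables: Ref<const SparseMatrix> binds matching objects directly and only constructs a temporary, via placement new into the reserved bytes, for non-matching expressions (function name illustrative):

#include <Eigen/Sparse>

double first_stored_value(const Eigen::Ref<const Eigen::SparseMatrix<double> >& m)
{
  return m.nonZeros() > 0 ? m.valuePtr()[0] : 0.0;
}
// first_stored_value(A);      // binds A directly, no copy
// first_stored_value(A + A);  // evaluates into the Ref's internal storage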
+ template struct evaluator, Options, StrideType> > : evaluator, Options, StrideType> > > @@ -213,6 +340,26 @@ struct evaluator, Options, explicit evaluator(const XprType &mat) : Base(mat) {} }; +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Ref, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + } } // end namespace Eigen diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index b0c2e472e..46c6ce1d3 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -137,14 +137,14 @@ template class SparseSelfAdjointView SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src) { - PermutationMatrix pnull; + PermutationMatrix pnull; return *this = src.twistedBy(pnull); } template SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src) { - PermutationMatrix pnull; + PermutationMatrix pnull; return *this = src.twistedBy(pnull); } @@ -336,7 +336,7 @@ struct generic_product_impl -struct product_evaluator, ProductTag, SparseSelfAdjointShape, SparseShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, ProductTag, SparseSelfAdjointShape, SparseShape> : public evaluator::PlainObject> { typedef Product XprType; @@ -356,7 +356,7 @@ protected: }; template -struct product_evaluator, ProductTag, SparseShape, SparseSelfAdjointShape, typename traits::Scalar, typename traits::Scalar> +struct product_evaluator, ProductTag, SparseShape, SparseSelfAdjointShape> : public evaluator::PlainObject> { typedef Product XprType; diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c2d4ac549..b6f180a41 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -27,12 +27,14 @@ namespace internal { using Base::derived; typedef typename Base::Scalar Scalar; typedef typename Base::StorageIndex StorageIndex; + + inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); } inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } - + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } diff --git a/Eigen/src/SparseCore/SparseTriangularView.h b/Eigen/src/SparseCore/SparseTriangularView.h index 3d9946149..7c718e4e1 100644 --- a/Eigen/src/SparseCore/SparseTriangularView.h +++ b/Eigen/src/SparseCore/SparseTriangularView.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
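Usage sketch for the self-adjoint view paths touched above (only one triangle of A is referenced):

#include <Eigen/Sparse>

void example_selfadjoint(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& x)
{
  Eigen::VectorXd y = A.selfadjointView<Eigen::Lower>() * x;  // full symmetric product
  Eigen::SparseMatrix<double> B(A.rows(), A.cols());
  // view-to-view copy: the path fixed above to use a StorageIndex-typed empty permutation
  B.selfadjointView<Eigen::Upper>() = A.selfadjointView<Eigen::Lower>();
}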
// -// Copyright (C) 2009-2014 Gael Guennebaud +// Copyright (C) 2009-2015 Gael Guennebaud // Copyright (C) 2012 Désiré Nuentsa-Wakam // // This Source Code Form is subject to the terms of the Mozilla @@ -34,10 +34,11 @@ template class TriangularViewImpl TriangularViewType; -protected: + protected: // dummy solve function to make TriangularView happy. void solve() const; + typedef SparseMatrixBase Base; public: EIGEN_SPARSE_PUBLIC_INTERFACE(TriangularViewType) diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index d53a9cb17..74df0d496 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -39,32 +39,16 @@ EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \ #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) -// TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - typedef typename Eigen::internal::traits::Scalar Scalar; \ - typedef typename Eigen::NumTraits::Real RealScalar; \ - typedef typename Eigen::internal::ref_selector::type Nested; \ - typedef typename Eigen::internal::traits::StorageKind StorageKind; \ - typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ - enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ - ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ - Flags = Eigen::internal::traits::Flags, \ - SizeAtCompileTime = Base::SizeAtCompileTime, \ - IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \ - using Base::derived; \ - using Base::const_cast_derived; \ - using Base::convert_index; - -#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - typedef Eigen::SparseMatrixBase Base; \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) +#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ + EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) + + const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; const int OuterRandomAccessPattern = 0x4 | CoherentAccessPattern; const int RandomAccessPattern = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern; -template class SparseMatrixBase; template class SparseMatrix; template class DynamicSparseMatrix; template class SparseVector; @@ -89,20 +73,20 @@ template class SparseSymmetricPermutationProduct; namespace internal { -template struct sparse_eval; +template struct sparse_eval; template struct eval - : public sparse_eval::RowsAtCompileTime,traits::ColsAtCompileTime> + : sparse_eval::RowsAtCompileTime,traits::ColsAtCompileTime,traits::Flags> {}; -template struct sparse_eval { +template struct sparse_eval { typedef typename traits::Scalar _Scalar; typedef typename traits::StorageIndex _StorageIndex; public: typedef SparseVector<_Scalar, RowMajor, _StorageIndex> type; }; -template struct sparse_eval { +template struct sparse_eval { typedef typename traits::Scalar _Scalar; typedef typename traits::StorageIndex _StorageIndex; public: @@ -110,15 +94,15 @@ template struct sparse_eval { }; // TODO this seems almost identical to plain_matrix_type -template struct sparse_eval { +template struct sparse_eval { typedef typename traits::Scalar _Scalar; typedef typename traits::StorageIndex _StorageIndex; - enum { _Options = ((traits::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; + enum { _Options = ((Flags&RowMajorBit)==RowMajorBit) ? 
RowMajor : ColMajor }; public: typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type; }; -template struct sparse_eval { +template struct sparse_eval { typedef typename traits::Scalar _Scalar; public: typedef Matrix<_Scalar, 1, 1> type; @@ -133,10 +117,15 @@ template struct plain_matrix_type typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type; }; +template +struct plain_object_eval + : sparse_eval::RowsAtCompileTime,traits::ColsAtCompileTime, evaluator::Flags> +{}; + template struct solve_traits { - typedef typename sparse_eval::type PlainObject; + typedef typename sparse_eval::Flags>::type PlainObject; }; template diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index ccf9364f2..7ec73a365 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -40,8 +40,7 @@ struct traits > ColsAtCompileTime = IsColVector ? 1 : Dynamic, MaxRowsAtCompileTime = RowsAtCompileTime, MaxColsAtCompileTime = ColsAtCompileTime, - Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit), - CoeffReadCost = NumTraits::ReadCost, + Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -63,10 +62,10 @@ struct sparse_vector_assign_selector; template class SparseVector - : public SparseMatrixBase > + : public SparseCompressedBase > { - typedef SparseMatrixBase SparseBase; - + typedef SparseCompressedBase Base; + using Base::convert_index; public: EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=) @@ -89,6 +88,11 @@ class SparseVector EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return &m_data.index(0); } EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return &m_data.index(0); } + + inline const StorageIndex* outerIndexPtr() const { return 0; } + inline StorageIndex* outerIndexPtr() { return 0; } + inline const StorageIndex* innerNonZeroPtr() const { return 0; } + inline StorageIndex* innerNonZeroPtr() { return 0; } /** \internal */ inline Storage& data() { return m_data; } @@ -126,8 +130,8 @@ class SparseVector public: - class InnerIterator; - class ReverseInnerIterator; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; inline void setZero() { m_data.clear(); } @@ -230,12 +234,15 @@ class SparseVector inline SparseVector(const SparseMatrixBase& other) : m_size(0) { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif check_template_parameters(); *this = other.derived(); } inline SparseVector(const SparseVector& other) - : SparseBase(other), m_size(0) + : Base(other), m_size(0) { check_template_parameters(); *this = other.derived(); @@ -357,75 +364,6 @@ protected: Index m_size; }; -template -class SparseVector::InnerIterator -{ - public: - explicit InnerIterator(const SparseVector& vec, Index outer=0) - : m_data(vec.m_data), m_id(0), m_end(m_data.size()) - { - EIGEN_UNUSED_VARIABLE(outer); - eigen_assert(outer==0); - } - - explicit InnerIterator(const internal::CompressedStorage& data) - : 
m_data(data), m_id(0), m_end(m_data.size()) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_data.value(m_id); } - inline Scalar& valueRef() { return const_cast(m_data.value(m_id)); } - - inline StorageIndex index() const { return m_data.index(m_id); } - inline Index row() const { return IsColVector ? index() : 0; } - inline Index col() const { return IsColVector ? 0 : index(); } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const internal::CompressedStorage& m_data; - Index m_id; - const Index m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix A; - // SparseMatrix::InnerIterator it(A,0); - template InnerIterator(const SparseMatrixBase&,Index outer=0); -}; - -template -class SparseVector::ReverseInnerIterator -{ - public: - explicit ReverseInnerIterator(const SparseVector& vec, Index outer=0) - : m_data(vec.m_data), m_id(m_data.size()), m_start(0) - { - EIGEN_UNUSED_VARIABLE(outer); - eigen_assert(outer==0); - } - - explicit ReverseInnerIterator(const internal::CompressedStorage& data) - : m_data(data), m_id(m_data.size()), m_start(0) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_data.value(m_id-1); } - inline Scalar& valueRef() { return const_cast(m_data.value(m_id-1)); } - - inline StorageIndex index() const { return m_data.index(m_id-1); } - inline Index row() const { return IsColVector ? index() : 0; } - inline Index col() const { return IsColVector ? 0 : index(); } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const internal::CompressedStorage& m_data; - Index m_id; - const Index m_start; -}; - namespace internal { template @@ -441,7 +379,10 @@ struct evaluator > Flags = SparseVectorType::Flags }; - explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) {} + explicit evaluator(const SparseVectorType &mat) : m_matrix(mat) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } inline Index nonZerosEstimate() const { return m_matrix.nonZeros(); diff --git a/Eigen/src/SparseCore/SparseView.h b/Eigen/src/SparseCore/SparseView.h index 761e72038..c945c4dab 100644 --- a/Eigen/src/SparseCore/SparseView.h +++ b/Eigen/src/SparseCore/SparseView.h @@ -32,6 +32,7 @@ class SparseView : public SparseMatrixBase > { typedef typename MatrixType::Nested MatrixTypeNested; typedef typename internal::remove_all::type _MatrixTypeNested; + typedef SparseMatrixBase Base; public: EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView) typedef typename internal::remove_all::type NestedExpression; diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h old mode 100644 new mode 100755 index 8cdd29c7b..d33d27f46 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -64,7 +64,8 @@ template struct SparseLUMatrixURetu * * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<> * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. 
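A usage sketch of the solver documented here (standard SparseLU interface assumed; A must be column-major and compressed):

#include <Eigen/SparseLU>

void example_sparselu(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& b)
{
  Eigen::SparseLU<Eigen::SparseMatrix<double>, Eigen::COLAMDOrdering<int> > solver;
  solver.analyzePattern(A);   // fill-reducing ordering from the sparsity pattern
  solver.factorize(A);        // numerical factorization
  Eigen::VectorXd x;
  if (solver.info() == Eigen::Success)
    x = solver.solve(b);
}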
Default is COLMAD - * + * + * \implsparsesolverconcept * * \sa \ref TutorialSparseDirectSolvers * \sa \ref OrderingMethods_Module @@ -89,13 +90,19 @@ class SparseLU : public SparseSolverBase >, typedef Matrix IndexVector; typedef PermutationMatrix PermutationType; typedef internal::SparseLUImpl Base; + + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; public: SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) { initperfvalues(); } - explicit SparseLU(const MatrixType& matrix):m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) + explicit SparseLU(const MatrixType& matrix) + : m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) { initperfvalues(); compute(matrix); @@ -713,7 +720,7 @@ template struct SparseLUMatrixUReturnType : internal::no_assignment_operator { typedef typename MatrixLType::Scalar Scalar; - explicit SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) + SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } Index rows() { return m_mapL.rows(); } diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index 548b3f9b0..4f26c19ca 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -62,6 +62,8 @@ namespace internal { * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module * OrderingMethods \endlink module for the list of built-in and external ordering methods. * + * \implsparsesolverconcept + * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * */ @@ -82,6 +84,12 @@ class SparseQR : public SparseSolverBase > typedef Matrix IndexVector; typedef Matrix ScalarVector; typedef PermutationMatrix PermutationType; + + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + public: SparseQR () : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false) { } diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index d067d8fdf..fd2b26581 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -14,12 +14,11 @@ namespace Eigen { #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE) \ extern "C" { \ - typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t; \ extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, \ char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *, \ void *, int, SuperMatrix *, SuperMatrix *, \ FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, \ - PREFIX##mem_usage_t *, SuperLUStat_t *, int *); \ + mem_usage_t *, SuperLUStat_t *, int *); \ } \ inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A, \ int *perm_c, int *perm_r, int *etree, char *equed, \ @@ -29,7 +28,7 @@ namespace Eigen { FLOATTYPE *recip_pivot_growth, \ FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr, \ SuperLUStat_t *stats, int *info, KEYTYPE) { \ - PREFIX##mem_usage_t mem_usage; \ + mem_usage_t mem_usage; \ PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L, \ U, work, lwork, B, X, recip_pivot_growth, rcond, \ ferr, berr, &mem_usage, stats, info); \ @@ -53,7 +52,7 @@ DECL_GSSVX(z,double,std::complex) extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, \ char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *, \ void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *, \ - PREFIX##mem_usage_t *, SuperLUStat_t *, int *); \ + mem_usage_t *, SuperLUStat_t *, int *); \ } \ inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, \ int *perm_c, int *perm_r, int *etree, char *equed, \ @@ -63,7 +62,7 @@ DECL_GSSVX(z,double,std::complex) FLOATTYPE *recip_pivot_growth, \ FLOATTYPE *rcond, \ SuperLUStat_t *stats, int *info, KEYTYPE) { \ - PREFIX##mem_usage_t mem_usage; \ + mem_usage_t mem_usage; \ PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, \ U, work, lwork, B, X, recip_pivot_growth, rcond, \ &mem_usage, stats, info); \ @@ -305,6 +304,10 @@ class SuperLUBase : public SparseSolverBase typedef Matrix IntColVectorType; typedef Map > PermutationMap; typedef SparseMatrix LUMatrixType; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; public: @@ -449,6 +452,10 @@ class SuperLUBase : public SparseSolverBase * * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * + * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. + * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers */ template @@ -657,7 +664,7 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); - if(&x.coeffRef(0) != x_ref.data()) + if(x.derived().data() != x_ref.data()) x = x_ref; m_info = info==0 ? Success : NumericalIssue; @@ -796,10 +803,12 @@ typename SuperLU::Scalar SuperLU::determinant() const * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers. * - * \warning This class requires SuperLU 4 or later. + * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported. 
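For reference, the wrapped solver in action (sketch; requires linking against SuperLU 4.x, per the warning below):

#include <Eigen/SuperLUSupport>

void example_superlu(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& b)
{
  Eigen::SuperLU<Eigen::SparseMatrix<double> > solver;
  solver.compute(A);
  Eigen::VectorXd x;
  if (solver.info() == Eigen::Success)
    x = solver.solve(b);
}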
* * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> * + * \implsparsesolverconcept + * * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB */ diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 0a5043ef2..aaec8c6f1 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -16,6 +16,13 @@ namespace Eigen { // generic double/complex wrapper functions: + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) +{ umfpack_di_defaults(control); } + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +{ umfpack_zi_defaults(control); } + inline void umfpack_free_numeric(void **Numeric, double) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } @@ -139,16 +146,23 @@ class UmfPackLU : public SparseSolverBase > typedef SparseMatrix LUMatrixType; typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; public: + typedef Array UmfpackControl; + UmfPackLU() : m_dummy(0,0), mp_matrix(m_dummy) { init(); } - explicit UmfPackLU(const MatrixType& matrix) + template + explicit UmfPackLU(const InputMatrixType& matrix) : mp_matrix(matrix) { init(); @@ -230,6 +244,39 @@ class UmfPackLU : public SparseSolverBase > analyzePattern_impl(); } + /** Provides the return status code returned by UmfPack during the numeric + * factorization. + * + * \sa factorize(), compute() + */ + inline int umfpackFactorizeReturncode() const + { + eigen_assert(m_numeric && "UmfPackLU: you must first call factorize()"); + return m_fact_errorCode; + } + + /** Provides access to the control settings array used by UmfPack. + * + * If this array contains NaN's, the default values are used. + * + * See UMFPACK documentation for details. + */ + inline const UmfpackControl& umfpackControl() const + { + return m_control; + } + + /** Provides access to the control settings array used by UmfPack. + * + * If this array contains NaN's, the default values are used. + * + * See UMFPACK documentation for details. + */ + inline UmfpackControl& umfpackControl() + { + return m_control; + } + /** Performs a numeric decomposition of \a matrix * * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. @@ -269,11 +316,12 @@ class UmfPackLU : public SparseSolverBase > void analyzePattern_impl() { + umfpack_defaults(m_control.data(), Scalar()); int errorCode = 0; errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), internal::convert_index(mp_matrix.cols()), mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - &m_symbolic, 0, 0); + &m_symbolic, m_control.data(), 0); m_isInitialized = true; m_info = errorCode ? InvalidInput : Success; @@ -284,11 +332,10 @@ class UmfPackLU : public SparseSolverBase > void factorize_impl() { - int errorCode; - errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - m_symbolic, &m_numeric, 0, 0); + m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + m_symbolic, &m_numeric, m_control.data(), 0); - m_info = errorCode ? NumericalIssue : Success; + m_info = m_fact_errorCode == UMFPACK_OK ? 
Success : NumericalIssue; m_factorizationIsOk = true; m_extractedDataAreDirty = true; } @@ -311,6 +358,9 @@ class UmfPackLU : public SparseSolverBase > // cached data to reduce reallocation, etc. mutable LUMatrixType m_l; + int m_fact_errorCode; + UmfpackControl m_control; + mutable LUMatrixType m_u; mutable IntColVectorType m_p; mutable IntRowVectorType m_q; @@ -390,7 +440,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas x_ptr = &x.col(j).coeffRef(0); errorCode = umfpack_solve(UMFPACK_A, mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0); + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), 0); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 5a3c92ea2..01432e2f3 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -4,6 +4,8 @@ typedef CwiseUnaryOp, const Derived> AbsReturnTy typedef CwiseUnaryOp, const Derived> ArgReturnType; typedef CwiseUnaryOp, const Derived> Abs2ReturnType; typedef CwiseUnaryOp, const Derived> SqrtReturnType; +typedef CwiseUnaryOp, const Derived> RsqrtReturnType; +typedef CwiseUnaryOp, const Derived> SignReturnType; typedef CwiseUnaryOp, const Derived> InverseReturnType; typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; @@ -19,6 +21,9 @@ typedef CwiseUnaryOp, const Derived> AtanReturn typedef CwiseUnaryOp, const Derived> TanhReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; typedef CwiseUnaryOp, const Derived> CoshReturnType; +typedef CwiseUnaryOp, const Derived> LgammaReturnType; +typedef CwiseUnaryOp, const Derived> ErfReturnType; +typedef CwiseUnaryOp, const Derived> ErfcReturnType; typedef CwiseUnaryOp, const Derived> PowReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; @@ -138,6 +143,39 @@ sqrt() const return SqrtReturnType(derived()); } +/** \returns an expression of the coefficient-wise inverse square root of *this. + * + * This function computes the coefficient-wise inverse square root. + * + * Example: \include Cwise_sqrt.cpp + * Output: \verbinclude Cwise_sqrt.out + * + * \sa pow(), square() + */ +EIGEN_DEVICE_FUNC +inline const RsqrtReturnType +rsqrt() const +{ + return RsqrtReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise signum of *this. + * + * This function computes the coefficient-wise signum. + * + * Example: \include Cwise_sign.cpp + * Output: \verbinclude Cwise_sign.out + * + * \sa pow(), square() + */ +EIGEN_DEVICE_FUNC +inline const SignReturnType +sign() const +{ + return SignReturnType(derived()); +} + + /** \returns an expression of the coefficient-wise cosine of *this. * * This function computes the coefficient-wise cosine. The function MatrixBase::cos() in the @@ -267,6 +305,47 @@ cosh() const return CoshReturnType(derived()); } +/** \returns an expression of the coefficient-wise ln(|gamma(*this)|). + * + * Example: \include Cwise_lgamma.cpp + * Output: \verbinclude Cwise_lgamma.out + * + * \sa cos(), sin(), tan() + */ +inline const LgammaReturnType +lgamma() const +{ + return LgammaReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise Gauss error + * function of *this. 
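A quick sketch of the coefficient-wise kernels introduced here (values illustrative; erf/erfc/lgamma are available as of this changeset):

#include <Eigen/Core>

void example_cwise()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
  Eigen::ArrayXd r = a.rsqrt();         // coefficient-wise 1/sqrt(x)
  Eigen::ArrayXd s = (a - 2.5).sign();  // -1, 0 or +1 per coefficient
  Eigen::ArrayXd e = a.erf();           // Gauss error function
}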
+ * + * Example: \include Cwise_erf.cpp + * Output: \verbinclude Cwise_erf.out + * + * \sa cos(), sin(), tan() + */ +inline const ErfReturnType +erf() const +{ + return ErfReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise Complementary error + * function of *this. + * + * Example: \include Cwise_erfc.cpp + * Output: \verbinclude Cwise_erfc.out + * + * \sa cos(), sin(), tan() + */ +inline const ErfcReturnType +erfc() const +{ + return ErfcReturnType(derived()); +} + /** \returns an expression of the coefficient-wise power of *this to the given exponent. * * This function computes the coefficient-wise power. The function MatrixBase::pow() in the diff --git a/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/Eigen/src/plugins/MatrixCwiseUnaryOps.h index e339140bf..e16bb374b 100644 --- a/Eigen/src/plugins/MatrixCwiseUnaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseUnaryOps.h @@ -14,6 +14,7 @@ typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; +typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; /** \returns an expression of the coefficient-wise absolute value of \c *this @@ -49,6 +50,17 @@ EIGEN_DEVICE_FUNC inline const CwiseSqrtReturnType cwiseSqrt() const { return CwiseSqrtReturnType(derived()); } +/** \returns an expression of the coefficient-wise signum of *this. + * + * Example: \include MatrixBase_cwiseSign.cpp + * Output: \verbinclude MatrixBase_cwiseSign.out + * + */ +EIGEN_DEVICE_FUNC +inline const CwiseSignReturnType +cwiseSign() const { return CwiseSignReturnType(derived()); } + + /** \returns an expression of the coefficient-wise inverse of *this. 
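And the Matrix-world spelling added alongside (sketch):

#include <Eigen/Core>

void example_cwise_sign()
{
  Eigen::MatrixXd M = Eigen::MatrixXd::Random(3, 3);
  Eigen::MatrixXd S = M.cwiseSign();     // same operation as Array's sign()
  Eigen::MatrixXd I = M.cwiseInverse();  // coefficient-wise 1/x
}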
* * Example: \include MatrixBase_cwiseInverse.cpp diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index 28e2bcaea..64666d75f 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -28,6 +28,14 @@ # include #endif +static void escape(void *p) { + asm volatile("" : : "g"(p) : "memory"); +} + +static void clobber() { + asm volatile("" : : : "memory"); +} + #include namespace Eigen @@ -168,6 +176,7 @@ public: CODE; \ } \ TIMER.stop(); \ + clobber(); \ } \ } diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 0974ebe4c..8528c5587 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -203,9 +203,10 @@ int main(int argc, char ** argv) return 1; } - if(cache_size1>0) - setCpuCacheSizes(cache_size1,cache_size2,cache_size3); - +#if EIGEN_VERSION_AT_LEAST(3,2,90) + if(cache_size1>0) + setCpuCacheSizes(cache_size1,cache_size2,cache_size3); +#endif A a(m,p); a.setRandom(); B b(p,n); b.setRandom(); diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index 9444b450c..38ff9f483 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -11,29 +11,24 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON) string(REGEX MATCH icpc IS_ICPC ${CMAKE_CXX_COMPILER}) IF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC) - SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG") - SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG") - IF(NOT BTL_NOVEC) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") - SET(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -msse2") - ELSE(NOT BTL_NOVEC) + SET(CMAKE_CXX_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_CXX_FLAGS}") + SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG ${CMAKE_Fortran_FLAGS}") + IF(BTL_NOVEC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE") - ENDIF(NOT BTL_NOVEC) + ENDIF(BTL_NOVEC) ENDIF(CMAKE_COMPILER_IS_GNUCXX OR IS_ICPC) IF(MSVC) SET(CMAKE_CXX_FLAGS " /O2 /Ot /GL /fp:fast -DNDEBUG") # SET(CMAKE_Fortran_FLAGS "-g0 -O3 -DNDEBUG") - IF(NOT BTL_NOVEC) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2") - ELSE(NOT BTL_NOVEC) + IF(BTL_NOVEC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_DONT_VECTORIZE") - ENDIF(NOT BTL_NOVEC) + ENDIF(BTL_NOVEC) ENDIF(MSVC) if(IS_ICPC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fast") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fast") + set(CMAKE_CXX_FLAGS "-fast ${CMAKE_CXX_FLAGS}") + set(CMAKE_Fortran_FLAGS "-fast ${CMAKE_Fortran_FLAGS}") endif(IS_ICPC) include_directories( diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index 0f62bd421..2b01149f9 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -29,7 +29,7 @@ // min vector size for axpy bench #define MIN_AXPY 5 // max vector size for axpy bench -#define MAX_AXPY 1000000 +#define MAX_AXPY 3000000 // min matrix size for matrix vector product bench #define MIN_MV 5 // max matrix size for matrix vector product bench diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index 40a71c781..fb3e48e99 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -42,4 +42,6 @@ before-evaluators 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache 6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. 
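Why the escape()/clobber() barriers above matter: without them the compiler may prove a benchmarked result unused and delete the entire timed loop. A self-contained sketch (GCC/Clang inline asm, as in BenchTimer.h):

#include <cstddef>
#include <vector>

static void escape(void* p) { asm volatile("" : : "g"(p) : "memory"); }

double checksum(const std::vector<double>& v)
{
  double acc = 0;
  for (std::size_t i = 0; i < v.size(); ++i) acc += v[i];
  escape(&acc);  // the result "escapes": the loop cannot be optimized away
  return acc;
}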
7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) +7591:09a8e2186610 # 3.3-alpha1 +7650:b0f3c8f43025 # help clang inlining diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp index 72eb9cab6..614bd4737 100644 --- a/bench/perf_monitoring/gemm/gemm.cpp +++ b/bench/perf_monitoring/gemm/gemm.cpp @@ -53,7 +53,7 @@ int main(int argc, char **argv) { std::vector results; - std::ifstream settings("settings.txt"); + std::ifstream settings("gemm_settings.txt"); long m, n, k; while(settings >> m >> n >> k) { diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/gemm_settings.txt similarity index 100% rename from bench/perf_monitoring/gemm/settings.txt rename to bench/perf_monitoring/gemm/gemm_settings.txt diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp new file mode 100644 index 000000000..b443218d7 --- /dev/null +++ b/bench/perf_monitoring/gemm/lazy_gemm.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include "../../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +template +inline void lazy_gemm(const MatA &A, const MatB &B, MatC &C) +{ + escape((void*)A.data()); + escape((void*)B.data()); + C.noalias() += A.lazyProduct(B); + escape((void*)C.data()); +} + +template +EIGEN_DONT_INLINE +double bench() +{ + typedef Matrix MatA; + typedef Matrix MatB; + typedef Matrix MatC; + + MatA A(m,k); + MatB B(k,n); + MatC C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e7*4/sizeof(Scalar); + double tm0 = 10, tm1 = 20; + + double flops = 2. * m * n * k; + long rep = std::max(10., std::min(10000., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, lazy_gemm(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +double bench_t(int t) +{ + if(t) + return bench(); + else + return bench(); +} + +EIGEN_DONT_INLINE +double bench_mnk(int m, int n, int k, int t) +{ + int id = m*10000 + n*100 + k; + switch(id) { + case 10101 : return bench_t< 1, 1, 1>(t); break; + case 20202 : return bench_t< 2, 2, 2>(t); break; + case 30303 : return bench_t< 3, 3, 3>(t); break; + case 40404 : return bench_t< 4, 4, 4>(t); break; + case 50505 : return bench_t< 5, 5, 5>(t); break; + case 60606 : return bench_t< 6, 6, 6>(t); break; + case 70707 : return bench_t< 7, 7, 7>(t); break; + case 80808 : return bench_t< 8, 8, 8>(t); break; + case 90909 : return bench_t< 9, 9, 9>(t); break; + case 101010 : return bench_t<10,10,10>(t); break; + case 111111 : return bench_t<11,11,11>(t); break; + case 121212 : return bench_t<12,12,12>(t); break; + } + return 0; +} + +int main(int argc, char **argv) +{ + std::vector results; + + std::ifstream settings("lazy_gemm_settings.txt"); + long m, n, k, t; + while(settings >> m >> n >> k >> t) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench_mnk(m, n, k, t) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/bench/perf_monitoring/gemm/lazy_gemm_settings.txt b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt new file mode 100644 index 000000000..407d5d4fa --- /dev/null +++ b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt @@ -0,0 +1,15 @@ +1 1 1 0 +2 2 2 0 +3 3 3 0 +4 4 4 0 +4 4 4 1 +5 5 5 0 +6 6 6 0 +7 7 7 0 +7 7 7 1 +8 8 8 0 +9 9 9 0 
+10 10 10 0 +11 11 11 0 +12 12 12 0 +12 12 12 1 diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh index 609c471f9..4d6053501 100755 --- a/bench/perf_monitoring/gemm/make_plot.sh +++ b/bench/perf_monitoring/gemm/make_plot.sh @@ -4,6 +4,7 @@ # it reads $1.out # and generates $1.pdf WHAT=$1 +bench=$2 header="rev " while read line @@ -11,7 +12,7 @@ do if [ ! -z '$line' ]; then header="$header \"$line\"" fi -done < settings.txt +done < $bench"_settings.txt" echo $header > $WHAT.out.header cat $WHAT.out >> $WHAT.out.header diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run.sh similarity index 78% rename from bench/perf_monitoring/gemm/run_gemm.sh rename to bench/perf_monitoring/gemm/run.sh index 3fa6a3661..bfb4ecfac 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run.sh @@ -1,5 +1,8 @@ #!/bin/bash +# ./run.sh gemm +# ./run.sh lazy_gemm + # Examples of environment variables to be set: # PREFIX="haswell-fma-" # CXX_FLAGS="-mfma" @@ -8,6 +11,7 @@ # -up : enforce the recomputation of existing data, and keep best results as a merging strategy # -s : recompute selected changesets only and keep bests +bench=$1 if echo "$*" | grep '\-up' > /dev/null; then update=true @@ -84,7 +88,7 @@ function test_current fi res=$prev count_rev=`echo $prev | wc -w` - count_ref=`cat "settings.txt" | wc -l` + count_ref=`cat $bench"_settings.txt" | wc -l` if echo "$global_args" | grep "$rev" > /dev/null; then rev_found=true else @@ -93,7 +97,7 @@ function test_current # echo $update et $selected et $rev_found because $rev et "$global_args" # echo $count_rev et $count_ref if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then - if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then + if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then curr=`./$name` if [ $count_rev == $count_ref ]; then echo "merge previous $prev" @@ -113,9 +117,9 @@ function test_current fi } -make_backup $PREFIX"sgemm" -make_backup $PREFIX"dgemm" -make_backup $PREFIX"cgemm" +make_backup $PREFIX"s"$bench +make_backup $PREFIX"d"$bench +make_backup $PREFIX"c"$bench cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev do @@ -126,27 +130,27 @@ do actual_rev=`hg identify | cut -f1 -d' '` cd .. 
- test_current $actual_rev float $PREFIX"sgemm" - test_current $actual_rev double $PREFIX"dgemm" - test_current $actual_rev "std::complex" $PREFIX"cgemm" + test_current $actual_rev float $PREFIX"s"$bench + test_current $actual_rev double $PREFIX"d"$bench + test_current $actual_rev "std::complex" $PREFIX"c"$bench fi done echo "Float:" -cat $PREFIX"sgemm.out" +cat $PREFIX"s"$bench.out" echo "" echo "Double:" -cat $PREFIX"dgemm.out" +cat $PREFIX"d"$bench.out" echo "" echo "Complex:" -cat $PREFIX"cgemm.out" +cat $PREFIX"c"$bench.out" echo "" -./make_plot.sh $PREFIX"sgemm" -./make_plot.sh $PREFIX"dgemm" -./make_plot.sh $PREFIX"cgemm" +./make_plot.sh $PREFIX"s"$bench $bench +./make_plot.sh $PREFIX"d"$bench $bench +./make_plot.sh $PREFIX"c"$bench $bench diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt index 6e0e1b103..8d53f4ae2 100644 --- a/bench/spbench/CMakeLists.txt +++ b/bench/spbench/CMakeLists.txt @@ -29,7 +29,7 @@ if(UMFPACK_FOUND AND BLAS_FOUND) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${BLAS_LIBRARIES}) endif() -find_package(SuperLU) +find_package(SuperLU 4.0) if(SUPERLU_FOUND AND BLAS_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) diff --git a/blas/level2_cplx_impl.h b/blas/level2_cplx_impl.h index afa9a7493..9b845de22 100644 --- a/blas/level2_cplx_impl.h +++ b/blas/level2_cplx_impl.h @@ -18,7 +18,7 @@ */ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { - typedef void (*functype)(int, const Scalar*, int, const Scalar*, int, Scalar*, Scalar); + typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar); static functype func[2]; static bool init = false; @@ -67,7 +67,7 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa if(code>=2 || func[code]==0) return 0; - func[code](*n, a, *lda, actual_x, 1, actual_y, alpha); + func[code](*n, a, *lda, actual_x, actual_y, alpha); } if(actual_x!=x) delete[] actual_x; diff --git a/blas/level2_real_impl.h b/blas/level2_real_impl.h index 9722a4674..cac89b268 100644 --- a/blas/level2_real_impl.h +++ b/blas/level2_real_impl.h @@ -12,7 +12,7 @@ // y = alpha*A*x + beta*y int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) { - typedef void (*functype)(int, const Scalar*, int, const Scalar*, int, Scalar*, Scalar); + typedef void (*functype)(int, const Scalar*, int, const Scalar*, Scalar*, Scalar); static functype func[2]; static bool init = false; @@ -59,7 +59,7 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p if(code>=2 || func[code]==0) return 0; - func[code](*n, a, *lda, actual_x, 1, actual_y, alpha); + func[code](*n, a, *lda, actual_x, actual_y, alpha); if(actual_x!=x) delete[] actual_x; if(actual_y!=y) delete[] copy_back(actual_y,y,*n,*incy); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 563101dfc..6a6b00728 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -6,7 +6,7 @@ // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
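The hemv/symv kernel signatures above drop their stride arguments because the BLAS wrappers always pack strided vectors into contiguous temporaries first; a sketch of that pattern (names illustrative; negative increments, which the real wrappers handle, are omitted):

#include <vector>

const double* packed(const double* x, int n, int incx, std::vector<double>& tmp)
{
  if (incx == 1) return x;  // already contiguous, pass through unchanged
  tmp.resize(n);
  for (int i = 0; i < n; ++i) tmp[i] = x[i * incx];
  return tmp.data();        // the kernel can now assume unit stride
}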
- +#include #include "common.h" int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) @@ -133,6 +133,9 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(info) return xerbla_(SCALAR_SUFFIX_UP"TRSM ",&info,6); + if(*m==0 || *n==0) + return 0; + int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4); if(SIDE(*side)==LEFT) @@ -358,6 +361,9 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp else matrix(c, *n, *n, *ldc).triangularView() *= beta; } + if(*n==0 || *k==0) + return 0; + #if ISCOMPLEX // FIXME add support for symmetric complex matrix if(UPLO(*uplo)==UP) @@ -392,6 +398,8 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); +// std::cerr << "in syr2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n"; + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if(OP(*op)==INVALID) info = 2; @@ -506,6 +514,8 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa // c = alpha*conj(a')*a + beta*c for op = 'C'or'c' int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) { +// std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; + typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, const Scalar&); static functype func[8]; @@ -577,6 +587,8 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal Scalar alpha = *reinterpret_cast(palpha); RealScalar beta = *pbeta; +// std::cerr << "in her2k " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << *ldb << " " << beta << " " << *ldc << "\n"; + int info = 0; if(UPLO(*uplo)==INVALID) info = 1; else if((OP(*op)==INVALID) || (OP(*op)==TR)) info = 2; diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f index a4c996fda..8ca67fb19 100644 --- a/blas/testing/cblat1.f +++ b/blas/testing/cblat1.f @@ -1,7 +1,49 @@ +*> \brief \b CBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 1 BLAS. +*> Based upon the original BLAS test routine together with: +*> +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT1 -* Test program for the COMPLEX Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), - + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0), + + (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -129,10 +171,10 @@ + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), - + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), - + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ - DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ - DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + + (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0), + + (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -145,8 +187,8 @@ + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.19E0,-0.17E0), (0.32E0,0.09E0), - + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (0.19E0,-0.17E0), (0.20E0,-0.35E0), + + (0.35E0,0.20E0), (0.14E0,0.08E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), @@ -162,9 +204,9 @@ + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), - + (0.32E0,0.09E0), (6.0E0,9.0E0), - + (0.23E0,-0.24E0), (8.0E0,3.0E0), - + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + + (0.20E0,-0.35E0), (6.0E0,9.0E0), + + (0.35E0,0.20E0), (8.0E0,3.0E0), + + (0.14E0,0.08E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -177,8 +219,8 @@ + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.09E0,0.03E0), (0.03E0,0.12E0), - + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (0.09E0,0.03E0), (0.15E0,0.00E0), + + (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -193,8 +235,8 @@ + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), - + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), - + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + + (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0), + + (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + REAL ZERO + PARAMETER (NOUT=6, ZERO=0.0E0) * .. Scalar Arguments .. 
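
The DATA hunks above swap in new test elements and fix the expected SCNRM2/SCASUM results STRUE2(5) = 0.8 and STRUE4(5) = 1.6. Reading the column layout, the four active elements of the N = 4 test vector (the rest is filler) are (0.3,0.1), (0.5,0.0), (0.0,0.5), (0.0,0.2), and the corrected expectations check out directly:

```cpp
// Quick verification of the corrected expected values above:
// SCNRM2 = sqrt(sum |z_i|^2), SCASUM = sum (|Re z_i| + |Im z_i|).
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
  const std::complex<float> z[4] = {
    {0.3f, 0.1f}, {0.5f, 0.0f}, {0.0f, 0.5f}, {0.0f, 0.2f}};

  float nrm2 = 0.0f, asum = 0.0f;
  for (const auto& v : z)
  {
    nrm2 += std::norm(v);                            // squared magnitude
    asum += std::abs(v.real()) + std::abs(v.imag()); // BLAS-style 1-"norm"
  }
  nrm2 = std::sqrt(nrm2);   // sqrt(0.09+0.01+0.25+0.25+0.04) = sqrt(0.64)

  std::cout << nrm2 << " " << asum << "\n";  // prints: 0.8 1.6
}
```

The old values 0.7 and 1.7 did not match any consistent reading of the old data, which is presumably why both the vector and the expectations change together.
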
REAL SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/blas/testing/cblat2.f b/blas/testing/cblat2.f index 20f188100..5833ea81a 100644 --- a/blas/testing/cblat2.f +++ b/blas/testing/cblat2.f @@ -1,68 +1,114 @@ +*> \brief \b CBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'cblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. 
of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT2 * -* Test program for the COMPLEX Level 2 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* CGERC T PUT F FOR NO TEST. SAME COLUMNS. -* CGERU T PUT F FOR NO TEST. SAME COLUMNS. -* CHER T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. 
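
Two related cleanups recur in every tester from here on: EPS is now taken from the Fortran 90 EPSILON intrinsic instead of the classic halve-until-it-vanishes loop, and the closeness test in the check routines becomes a direct comparison against |SSIZE| * EPSILON. A C++ rendering of both, with std::numeric_limits standing in for EPSILON (the sample values are made up for illustration):

```cpp
// Old vs. new machine precision, plus the new SCHECK-style closeness test.
#include <cmath>
#include <iostream>
#include <limits>

int main()
{
  // Old scheme: halve until 1 + eps rounds to 1. (On x87-style excess
  // precision this can land on a smaller value than intended, one reason
  // the intrinsic is preferable.)
  float eps = 1.0f;
  while (1.0f + eps / 2.0f != 1.0f)
    eps /= 2.0f;

  // New scheme: ask the implementation directly (EPSILON(ZERO) in Fortran).
  const float eps2 = std::numeric_limits<float>::epsilon();
  std::cout << eps << " vs " << eps2 << "\n";

  // New test: accept SCOMP when |SFAC*(SCOMP-STRUE)| <= |SSIZE| * epsilon.
  const float strue = 0.8f, ssize = 0.8f, sfac = 1.0f;
  const float scomp = std::nextafter(strue, 1.0f);   // one ulp off
  const bool close = std::fabs(sfac * (scomp - strue)) <= std::fabs(ssize) * eps2;
  std::cout << (close ? "close\n" : "not close\n");  // prints: close
}
```
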
INTEGER NIN @@ -71,8 +117,8 @@ PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -126,7 +172,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -135,7 +181,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -240,14 +286,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. @@ -3079,7 +3118,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/cblat3.f b/blas/testing/cblat3.f index b26be91e6..09f2cb9c5 100644 --- a/blas/testing/cblat3.f +++ b/blas/testing/cblat3.f @@ -1,50 +1,96 @@ +*> \brief \b CBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'cblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. 
+*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT3 * -* Test program for the COMPLEX Level 3 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* CHERK T PUT F FOR NO TEST. SAME COLUMNS. -* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -53,8 +99,8 @@ PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -103,7 +149,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
@@ -112,7 +158,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -189,14 +235,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. @@ -1946,7 +1985,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1956,12 +1995,19 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to CSYMM and CHEMM +* with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA @@ -1979,6 +2025,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. +* + ALPHA = CMPLX( ONE, -ONE ) + BETA = CMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2205,16 +2259,16 @@ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2272,16 +2326,16 @@ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL 
CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3268,7 +3322,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/dblat2.f b/blas/testing/dblat2.f index 4002d4368..0fa80afa4 100644 --- a/blas/testing/dblat2.f +++ b/blas/testing/dblat2.f @@ -1,75 +1,121 @@ +*> \brief \b DBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'dblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETAC +*> DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGER T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT2 * -* Test program for the DOUBLE PRECISION Level 2 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* DGER T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. 
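
The CHEMM/CSYMM error-exit fixes above (and the matching DSYMM/SSYMM ones below) all address the same flaw: the INFOT = 9 calls passed LDC = 1 with M = 2, so argument 12 was also illegal, and an implementation that checks arguments in a different order from the reference could legitimately report 12 instead of the expected 9. A simplified checker illustrating the ambiguity; argument positions follow the reference SYMM interface, and the routine itself is a stand-in:

```cpp
// Why the INFOT = 9 tests now pass LDC = 2: with M = 2, LDC = 1 is itself
// illegal (argument 12), so the old calls had two bad arguments at once.
#include <algorithm>
#include <iostream>

// Returns the index of the first illegal argument, or 0 if all are legal.
// (Simplified: only SIDE, M, N, LDA, LDB, LDC are checked.)
int symm_check(char side, int m, int n, int lda, int ldb, int ldc)
{
  const int nrowa = (side == 'L') ? m : n;  // A is m-by-m or n-by-n
  if (side != 'L' && side != 'R')  return 1;
  if (m < 0)                       return 3;
  if (n < 0)                       return 4;
  if (lda < std::max(1, nrowa))    return 7;
  if (ldb < std::max(1, m))        return 9;
  if (ldc < std::max(1, m))        return 12;
  return 0;
}

int main()
{
  // Old test call: argument 9 (LDB) is bad, but so is 12 (LDC); a checker
  // that happens to test LDC first would report 12 and fail the test.
  std::cout << symm_check('L', 2, 0, 2, 1, 1) << "\n";  // 9 here, but fragile
  // Fixed call: LDC = 2 is legal, so argument 9 is unambiguously the culprit.
  std::cout << symm_check('L', 2, 0, 2, 1, 2) << "\n";  // 9
}
```
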
@@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/dblat3.f b/blas/testing/dblat3.f index 082e03e5e..8d37c7453 100644 --- a/blas/testing/dblat3.f +++ b/blas/testing/dblat3.f @@ -1,55 +1,101 @@ +*> \brief \b DBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'dblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT3 * -* Test program for the DOUBLE PRECISION Level 3 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. 
of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. 
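
The other recurring CHKE fix is initializing ALPHA and BETA (and RALPHA/RBETA in the complex testers) before the error-exit calls: the routines never read them on the error path, but passing undefined values is still invalid and can trap under some compilers or runtime checkers. A sketch of the same idea in C++ terms, with a stand-in routine:

```cpp
// Error-exit testers should pass defined scalars: copying an indeterminate
// value into an argument is undefined behavior even if the callee ignores it.
#include <complex>
#include <iostream>

// Stand-in for a BLAS routine that rejects its first argument and never
// reads alpha or beta on that path.
int gemm_like(char transa, std::complex<float> alpha, std::complex<float> beta)
{
  if (transa != 'N' && transa != 'T' && transa != 'C')
    return 1;  // error exit: alpha/beta untouched
  (void)alpha; (void)beta;
  return 0;
}

int main()
{
  // The fix, in spirit: give the scalars arbitrary but defined values
  // before provoking the error exits (ALPHA = CMPLX(ONE,-ONE) etc. above).
  const std::complex<float> alpha(1.0f, -1.0f);
  const std::complex<float> beta(2.0f, -2.0f);
  std::cout << gemm_like('/', alpha, beta) << "\n";  // prints: 1
}
```
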
INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/sblat2.f b/blas/testing/sblat2.f index 057a85429..71605ed31 100644 --- a/blas/testing/sblat2.f +++ b/blas/testing/sblat2.f @@ -1,75 +1,121 @@ +*> \brief \b SBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'sblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBSV T PUT F FOR NO TEST. SAME COLUMNS. 
+*> STPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGER T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT2 * -* Test program for the REAL Level 2 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRMV T PUT F FOR NO TEST. SAME COLUMNS. -* STBMV T PUT F FOR NO TEST. SAME COLUMNS. -* STPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRSV T PUT F FOR NO TEST. SAME COLUMNS. -* STBSV T PUT F FOR NO TEST. SAME COLUMNS. -* STPSV T PUT F FOR NO TEST. SAME COLUMNS. -* SGER T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. 
-* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/sblat3.f b/blas/testing/sblat3.f index 325a9eb92..879269633 100644 --- a/blas/testing/sblat3.f +++ b/blas/testing/sblat3.f @@ -1,55 +1,101 @@ +*> \brief \b SBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'sblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYRK T PUT F FOR NO TEST. SAME COLUMNS. 
+*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT3 * -* Test program for the REAL Level 3 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRSM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. 
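
One more change runs through every tester, including the sblat3 hunks just below: OPEN with STATUS = 'NEW' fails when the file already exists, so reruns used to require deleting the generated outputs first; the patch switches to 'UNKNOWN', or omits STATUS entirely, whose default is also 'UNKNOWN'. The closest C/C++ analogue is exclusive create versus plain truncation; the "wx" mode is C11 and not universally supported, and the file name is just the one from the example data file above:

```cpp
// fopen "wx" fails if the file exists (like STATUS='NEW'); plain "w"
// truncates and reuses it (the usual effect of STATUS='UNKNOWN').
#include <cstdio>

int main()
{
  const char* path = "sblat3.out";  // summary file name from the data file

  if (std::FILE* f = std::fopen(path, "w"))   // first run: create/truncate
    std::fclose(f);

  // Second run: exclusive create fails because the file is already there.
  if (std::FILE* f = std::fopen(path, "wx"))  // C11 'x': fail if file exists
  {
    std::puts("created exclusively");
    std::fclose(f);
  }
  else
    std::puts("already exists -- what STATUS='NEW' reruns used to hit");

  std::remove(path);
  return 0;
}
```
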
INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f index e2415e1c4..d30112c63 100644 --- a/blas/testing/zblat1.f +++ b/blas/testing/zblat1.f @@ -1,7 +1,49 @@ +*> \brief \b ZBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 1 BLAS. +*> +*> Based upon the original BLAS test routine together with: +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT1 -* Test program for the COMPLEX*16 Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), - + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0), + + (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -129,10 +171,10 @@ + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), - + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), - + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ - DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ - DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + + (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0), + + (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -145,8 +187,8 @@ + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.19D0,-0.17D0), (0.32D0,0.09D0), - + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (0.19D0,-0.17D0), (0.20D0,-0.35D0), + + (0.35D0,0.20D0), (0.14D0,0.08D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), @@ -162,9 +204,9 @@ + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), - + (0.32D0,0.09D0), (6.0D0,9.0D0), - + (0.23D0,-0.24D0), (8.0D0,3.0D0), - + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + + (0.20D0,-0.35D0), (6.0D0,9.0D0), + + (0.35D0,0.20D0), (8.0D0,3.0D0), + + (0.14D0,0.08D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), 
(1.0D0,2.0D0), (1.0D0,2.0D0), @@ -177,8 +219,8 @@ + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.09D0,0.03D0), (0.03D0,0.12D0), - + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (0.09D0,0.03D0), (0.15D0,0.00D0), + + (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -193,8 +235,8 @@ + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), - + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), - + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + + (0.15D0,0.00D0), (6.0D0,9.0D0), (0.00D0,0.15D0), + + (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + DOUBLE PRECISION ZERO + PARAMETER (NOUT=6, ZERO=0.0D0) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/blas/testing/zblat2.f b/blas/testing/zblat2.f index e65cdcc70..53129a11e 100644 --- a/blas/testing/zblat2.f +++ b/blas/testing/zblat2.f @@ -1,68 +1,114 @@ +*> \brief \b ZBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'zblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER T PUT F FOR NO TEST. SAME COLUMNS. 
+*> ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT2 * -* Test program for the COMPLEX*16 Level 2 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. 
-* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. -* -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -72,8 +118,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -127,7 +173,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -136,7 +182,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -241,14 +287,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. @@ -3087,7 +3126,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/blas/testing/zblat3.f b/blas/testing/zblat3.f index d6a522f2a..59ca24145 100644 --- a/blas/testing/zblat3.f +++ b/blas/testing/zblat3.f @@ -1,50 +1,97 @@ +*> \brief \b ZBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'zblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYMM T PUT F FOR NO TEST. 
SAME COLUMNS. +*> ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT3 * -* Test program for the COMPLEX*16 Level 3 Blas. +* -- Reference BLAS test routine (version 3.4.1) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. -* -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. 
INTEGER NIN @@ -54,8 +101,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -104,7 +151,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +160,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -190,14 +237,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. @@ -1949,7 +1989,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1959,12 +1999,20 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to ZSYMM and ZHEMM +* with INFOT = 9 (eca) +* 10-9-00: Declared INTRINSIC DCMPLX (susan) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA, RBETA @@ -1973,6 +2021,8 @@ * .. External Subroutines .. EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC DCMPLX * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. @@ -1982,6 +2032,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. 
+* + ALPHA = DCMPLX( ONE, -ONE ) + BETA = DCMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2208,16 +2266,16 @@ CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2275,16 +2333,16 @@ CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3274,7 +3332,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. 
      GO TO 80
   70 CONTINUE
diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in
index e50f6dbe0..04e7886ce 100644
--- a/cmake/Eigen3Config.cmake.in
+++ b/cmake/Eigen3Config.cmake.in
@@ -15,7 +15,7 @@
 #                  EIGEN3_VERSION_PATCH - The patch version of Eigen
 
 set ( EIGEN3_FOUND 1 )
-set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" )
+set ( EIGEN3_USE_FILE "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" )
 
 set ( EIGEN3_DEFINITIONS  "@EIGEN_DEFINITIONS@" )
 set ( EIGEN3_INCLUDE_DIR  "@EIGEN_INCLUDE_DIR@" )
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index 0ee484e8c..afc24b5e9 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -46,16 +46,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(EIGEN_COVERAGE_TESTING)
     set(COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage")
     set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/")
-  else(EIGEN_COVERAGE_TESTING)
-    set(COVERAGE_FLAGS "")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}")
   endif(EIGEN_COVERAGE_TESTING)
-  if(CMAKE_SYSTEM_NAME MATCHES Linux)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS} -g2")
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${COVERAGE_FLAGS} -O2 -g2")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${COVERAGE_FLAGS} -fno-inline-functions")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${COVERAGE_FLAGS} -O0 -g3")
-  endif(CMAKE_SYSTEM_NAME MATCHES Linux)
 elseif(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/cmake/FindAdolc.cmake b/cmake/FindAdolc.cmake
index 1a7ff3628..937e54990 100644
--- a/cmake/FindAdolc.cmake
+++ b/cmake/FindAdolc.cmake
@@ -5,7 +5,7 @@ endif (ADOLC_INCLUDES AND ADOLC_LIBRARIES)
 
 find_path(ADOLC_INCLUDES
   NAMES
-  adolc/adouble.h
+  adolc/adtl.h
   PATHS
   $ENV{ADOLCDIR}
   ${INCLUDE_INSTALL_DIR}
diff --git a/cmake/FindSPQR.cmake b/cmake/FindSPQR.cmake
index 794c212af..1e958c3c1 100644
--- a/cmake/FindSPQR.cmake
+++ b/cmake/FindSPQR.cmake
@@ -26,7 +26,12 @@ if(SPQR_LIBRARIES)
   find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS $ENV{SPQRDIR} ${LIB_INSTALL_DIR})
   if (SUITESPARSE_LIBRARY)
     set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${SUITESPARSE_LIBRARY})
-  endif (SUITESPARSE_LIBRARY)
+  endif()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
 
 endif(SPQR_LIBRARIES)
diff --git a/cmake/FindSuperLU.cmake b/cmake/FindSuperLU.cmake
index 8a3df3666..e4142fe4d 100644
--- a/cmake/FindSuperLU.cmake
+++ b/cmake/FindSuperLU.cmake
@@ -17,10 +17,64 @@ find_path(SUPERLU_INCLUDES
     SRC
 )
 
-find_library(SUPERLU_LIBRARIES superlu PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
-
+find_library(SUPERLU_LIBRARIES NAMES "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu" PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} PATH_SUFFIXES lib)
+
+if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES)
+
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+cmake_push_check_state()
+
+set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${SUPERLU_INCLUDES})
+
+# check whether struct mem_usage_t is globally defined
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <slu_util.h>
+int main() {
+  mem_usage_t mem;
+  return 0;
+}"
+SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+
+
+check_cxx_source_compiles("
+typedef int int_t;
+#include <supermatrix.h>
+#include <superlu_enum_consts.h>
+int main() {
+  return SLU_SINGLE;
+}"
+SUPERLU_HAS_CLEAN_ENUMS)
+
+if(SUPERLU_HAS_CLEAN_ENUMS)
+  # at least 4.3
+  set(SUPERLU_VERSION_VAR "4.3")
+elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T)
+  # at least 4.0
+  set(SUPERLU_VERSION_VAR "4.0")
+else()
+  set(SUPERLU_VERSION_VAR "3.0")
+endif()
+
+cmake_pop_check_state()
+
+if(SuperLU_FIND_VERSION)
+  if(${SUPERLU_VERSION_VAR} VERSION_LESS ${SuperLU_FIND_VERSION})
+    set(SUPERLU_VERSION_OK FALSE)
+  else()
+    set(SUPERLU_VERSION_OK TRUE)
+  endif()
+else()
+  set(SUPERLU_VERSION_OK TRUE)
+endif()
+
+endif()
+
 include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(SUPERLU DEFAULT_MSG
-                                  SUPERLU_INCLUDES SUPERLU_LIBRARIES)
+find_package_handle_standard_args(SUPERLU
+                                  REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK
+                                  VERSION_VAR SUPERLU_VERSION_VAR)
 
 mark_as_advanced(SUPERLU_INCLUDES SUPERLU_LIBRARIES)
diff --git a/cmake/FindUmfpack.cmake b/cmake/FindUmfpack.cmake
index 16b046cd6..53cf0b49b 100644
--- a/cmake/FindUmfpack.cmake
+++ b/cmake/FindUmfpack.cmake
@@ -20,24 +20,29 @@ find_library(UMFPACK_LIBRARIES umfpack PATHS $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}
 
 if(UMFPACK_LIBRARIES)
 
-  if (NOT UMFPACK_LIBDIR)
+  if(NOT UMFPACK_LIBDIR)
     get_filename_component(UMFPACK_LIBDIR ${UMFPACK_LIBRARIES} PATH)
   endif(NOT UMFPACK_LIBDIR)
 
   find_library(COLAMD_LIBRARY colamd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (COLAMD_LIBRARY)
+  if(COLAMD_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${COLAMD_LIBRARY})
-  endif (COLAMD_LIBRARY)
+  endif ()
 
   find_library(AMD_LIBRARY amd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (AMD_LIBRARY)
+  if(AMD_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${AMD_LIBRARY})
-  endif (AMD_LIBRARY)
+  endif ()
 
   find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
-  if (SUITESPARSE_LIBRARY)
+  if(SUITESPARSE_LIBRARY)
     set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${SUITESPARSE_LIBRARY})
-  endif (SUITESPARSE_LIBRARY)
+  endif ()
+
+  find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR})
+  if(CHOLMOD_LIBRARY)
+    set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${CHOLMOD_LIBRARY})
+  endif()
 
 endif(UMFPACK_LIBRARIES)
 
@@ -45,4 +50,4 @@ include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(UMFPACK DEFAULT_MSG
                                   UMFPACK_INCLUDES UMFPACK_LIBRARIES)
 
-mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY SUITESPARSE_LIBRARY)
+mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY)
diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
index 93f8a8fd8..2f14f30b8 100644
--- a/cmake/language_support.cmake
+++ b/cmake/language_support.cmake
@@ -43,7 +43,7 @@ function(workaround_9220 language language_works)
   if(return_code EQUAL 0)
     # Second run
     execute_process (
-      COMMAND ${CMAKE_COMMAND} .
+      COMMAND ${CMAKE_COMMAND} . -G "${CMAKE_GENERATOR}"
       WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language}
       RESULT_VARIABLE return_code
       OUTPUT_QUIET
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 800bb30ee..e0c6a7e34 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -223,7 +223,8 @@ ALIASES = "only_for_vectors=This is only for vectors (either row-
 "note_about_using_kernel_to_study_multiple_solutions=If you need a complete analysis of the space of solutions, take the one solution obtained by this method and add to it elements of the kernel, as determined by kernel()."
\
"note_about_checking_solutions=This method just tries to find as good a solution as possible. If you want to check whether a solution exists or if it is accurate, just call this function to get a result and then compute the error of this result, or use MatrixBase::isApprox() directly, for instance like this: \code bool a_solution_exists = (A*result).isApprox(b, precision); \endcode This method avoids dividing by zero, so that the non-existence of a solution doesn't by itself mean that you'll get \c inf or \c nan values." \
"note_try_to_help_rvo=This function returns the result by value. In order to make that efficient, it is implemented as just a return statement using a special constructor, hopefully allowing the compiler to perform a RVO (return value optimization)." \
-"nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\""
+"nonstableyet=\warning This is not considered to be part of the stable public API yet. Changes may happen in future releases. See \ref Experimental \"Experimental parts of Eigen\"" \
+"implsparsesolverconcept=This class follows the \link TutorialSparseSolverConcept sparse solver concept \endlink."
 
 ALIASES += "eigenAutoToc=  "
 
@@ -866,13 +867,13 @@ STRIP_CODE_COMMENTS    = YES
 # then for each documented function all documented
 # functions referencing it will be listed.
 
-REFERENCED_BY_RELATION = YES
+REFERENCED_BY_RELATION = NO
 
 # If the REFERENCES_RELATION tag is set to YES
 # then for each documented function all documented entities
 # called/used by that function will be listed.
 
-REFERENCES_RELATION    = YES
+REFERENCES_RELATION    = NO
 
 # If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
 # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
diff --git a/doc/Manual.dox b/doc/Manual.dox
index 7f04edff4..c10c490a7 100644
--- a/doc/Manual.dox
+++ b/doc/Manual.dox
@@ -125,6 +125,8 @@ namespace Eigen {
      \ingroup Sparse_chapter */
 /** \addtogroup TopicSparseSystems
      \ingroup Sparse_chapter */
+/** \addtogroup MatrixfreeSolverExample
+     \ingroup Sparse_chapter */
 /** \addtogroup Sparse_Reference
      \ingroup Sparse_chapter */
diff --git a/doc/MatrixfreeSolverExample.dox b/doc/MatrixfreeSolverExample.dox
new file mode 100644
index 000000000..000cb0bbe
--- /dev/null
+++ b/doc/MatrixfreeSolverExample.dox
@@ -0,0 +1,20 @@
+
+namespace Eigen {
+
+/**
+
+\eigenManualPage MatrixfreeSolverExample Matrix-free solvers
+
+Iterative solvers such as ConjugateGradient and BiCGSTAB can be used in a matrix-free context. To this end, the user must provide a wrapper class inheriting EigenBase<> and implementing the following methods:
+ - Index rows() and Index cols(): return the number of rows and columns, respectively
+ - operator* with an %Eigen dense column vector (its actual implementation goes in a specialization of the internal::generic_product_impl class)
+
+Eigen::internal::traits<> must also be specialized for the wrapper type.
+
+Here is a complete example wrapping an Eigen::SparseMatrix:
+\include matrixfree_cg.cpp
+Output: \verbinclude matrixfree_cg.out
+
+*/
+
+}
\ No newline at end of file
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 76ce2eb99..7cde1a36f 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -106,6 +106,7 @@ following macros are supported; none of them are defined by default.
 - \b EIGEN_MATRIX_PLUGIN - filename of plugin for extending the Matrix class.
 - \b EIGEN_MATRIXBASE_PLUGIN - filename of plugin for extending the MatrixBase class.
 - \b EIGEN_PLAINOBJECTBASE_PLUGIN - filename of plugin for extending the PlainObjectBase class.
+ - \b EIGEN_MAPBASE_PLUGIN - filename of plugin for extending the MapBase class.
 - \b EIGEN_QUATERNION_PLUGIN - filename of plugin for extending the Quaternion class.
 - \b EIGEN_QUATERNIONBASE_PLUGIN - filename of plugin for extending the QuaternionBase class.
 - \b EIGEN_SPARSEMATRIX_PLUGIN - filename of plugin for extending the SparseMatrix class.
diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox
index 48c18f46f..9fb3282e7 100644
--- a/doc/SparseLinearSystems.dox
+++ b/doc/SparseLinearSystems.dox
@@ -4,36 +4,63 @@ In Eigen, there are several methods available to solve linear systems when the c
 \eigenAutoToc
 
-\section TutorialSparseDirectSolvers Sparse solvers
+\section TutorialSparseSolverList List of sparse solvers
 
-%Eigen currently provides a limited set of built-in solvers, as well as wrappers to external solver libraries.
-They are summarized in the following table:
+%Eigen currently provides a wide set of built-in solvers, as well as wrappers to external solver libraries.
+They are summarized in the following tables:
+
+\subsection TutorialSparseSolverList_Direct Built-in direct solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
+    <th>License</th><th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>SimplicialLLT \n <tt>\#include <Eigen/SparseCholesky></tt></td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>LGPL</td>
+    <td>SimplicialLDLT is often preferable</td></tr>
+
+<tr><td>SimplicialLDLT \n <tt>\#include <Eigen/SparseCholesky></tt></td><td>Direct LDLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
+    <td>LGPL</td>
+    <td>Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)</td></tr>
+
+<tr><td>SparseLU \n <tt>\#include <Eigen/SparseLU></tt></td><td>LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
+    <td>MPL2</td>
+    <td>optimized for small and large problems with irregular patterns</td></tr>
+
+<tr><td>SparseQR \n <tt>\#include <Eigen/SparseQR></tt></td><td>QR factorization</td><td>Any, rectangular</td><td>Fill-in reducing</td>
+    <td>MPL2</td>
+    <td>recommended for least-square problems, has a basic rank-revealing feature</td></tr>
+</table>
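[Editor's note: to make the direct-solver workflow implied by this table concrete, here is a minimal, self-contained usage sketch. It is not part of the patch; the 3x3 system and its values are invented for the illustration.]

\code
#include <Eigen/Sparse>
#include <Eigen/SparseLU>
#include <iostream>

int main()
{
  // Build a small, made-up sparse system A*x = b.
  Eigen::SparseMatrix<double> A(3,3);
  A.insert(0,0) = 4.0;  A.insert(1,1) = 3.0;  A.insert(2,2) = 5.0;
  A.insert(0,1) = 1.0;  A.insert(1,0) = 1.0;
  A.makeCompressed();

  Eigen::VectorXd b(3);
  b << 1.0, 2.0, 3.0;

  // SparseLU follows the compute()/solve() concept described later on this page.
  Eigen::SparseLU<Eigen::SparseMatrix<double> > solver;
  solver.compute(A);
  if(solver.info() != Eigen::Success)
    return 1;                       // factorization failed
  Eigen::VectorXd x = solver.solve(b);
  std::cout << "solution:\n" << x << std::endl;
  return 0;
}
\endcode

Any solver from the table above can be swapped in for SparseLU without changing the surrounding code; that interchangeability is the point of the common solver concept.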
+
+\subsection TutorialSparseSolverList_Iterative Built-in iterative solvers
+
+<table class="manual">
+<tr><th>Class</th><th>Solver kind</th><th>Matrix kind</th><th>Supported preconditioners, [default]</th>
+    <th>License</th><th class="width20em"><p>Notes</p></th></tr>
+
+<tr><td>ConjugateGradient \n <tt>\#include <Eigen/IterativeLinearSolvers></tt></td><td>Classic iterative CG</td><td>SPD</td><td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky</td>
+    <td>MPL2</td>
+    <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
+
+<tr><td>LeastSquaresConjugateGradient \n <tt>\#include <Eigen/IterativeLinearSolvers></tt></td><td>CG for rectangular least-square problem</td><td>Rectangular</td><td>IdentityPreconditioner, [LeastSquareDiagonalPreconditioner]</td>
+    <td>MPL2</td>
+    <td>Solve for min |Ax-b|^2 without forming A'A</td></tr>
+
+<tr><td>BiCGSTAB \n <tt>\#include <Eigen/IterativeLinearSolvers></tt></td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td><td>IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT</td>
+    <td>MPL2</td>
+    <td>To speed up the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
+</table>
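[Editor's note: likewise, a minimal sketch of the iterative-solver interface, not part of the patch; the tiny SPD system is invented. setTolerance(), setMaxIterations(), iterations() and error() are the standard controls and statistics of Eigen's iterative solvers.]

\code
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <iostream>

int main()
{
  // Arbitrary small SPD system for demonstration purposes.
  Eigen::SparseMatrix<double> A(2,2);
  A.insert(0,0) = 2.0;  A.insert(1,1) = 2.0;
  A.makeCompressed();
  Eigen::VectorXd b(2);
  b << 1.0, 1.0;

  Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower|Eigen::Upper> cg;
  cg.setTolerance(1e-8);      // stop once the relative residual is small enough
  cg.setMaxIterations(100);
  cg.compute(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations()
            << ", estimated error: " << cg.error() << std::endl;
  return 0;
}
\endcode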
+
+\subsection TutorialSparseSolverList_Wrapper Wrappers to external solvers
 
 <table class="manual">
 <tr><th>Class</th><th>Module</th><th>Solver kind</th><th>Matrix kind</th><th>Features related to performance</th>
     <th>Dependencies,License</th><th class="width20em"><p>Notes</p></th></tr>
-<tr><td>SimplicialLLT </td><td>\link SparseCholesky_Module SparseCholesky \endlink</td><td>Direct LLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
-    <td>built-in, LGPL</td>
-    <td>SimplicialLDLT is often preferable</td></tr>
-<tr><td>SimplicialLDLT </td><td>\link SparseCholesky_Module SparseCholesky \endlink</td><td>Direct LDLt factorization</td><td>SPD</td><td>Fill-in reducing</td>
-    <td>built-in, LGPL</td>
-    <td>Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)</td></tr>
-<tr><td>ConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Classic iterative CG</td><td>SPD</td><td>Preconditionning</td>
-    <td>built-in, MPL2</td>
-    <td>Recommended for large symmetric problems (e.g., 3D Poisson eq.)</td></tr>
-<tr><td>LeastSquaresConjugateGradient</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>CG for rectangular least-square problem</td><td>Rectangular</td><td>Preconditionning</td>
-    <td>built-in, MPL2</td>
-    <td>Solve for min |A'Ax-b|^2 without forming A'A</td></tr>
-<tr><td>BiCGSTAB</td><td>\link IterativeLinearSolvers_Module IterativeLinearSolvers \endlink</td><td>Iterative stabilized bi-conjugate gradient</td><td>Square</td><td>Preconditionning</td>
-    <td>built-in, MPL2</td>
-    <td>To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.</td></tr>
-<tr><td>SparseLU</td><td>\link SparseLU_Module SparseLU \endlink</td><td>LU factorization</td><td>Square</td><td>Fill-in reducing, Leverage fast dense algebra</td>
-    <td>built-in, MPL2</td>
-    <td>optimized for small and large problems with irregular patterns</td></tr>
-<tr><td>SparseQR</td><td>\link SparseQR_Module SparseQR \endlink</td><td>QR factorization</td><td>Any, rectangular</td><td>Fill-in reducing</td>
-    <td>built-in, MPL2</td>
-    <td>recommended for least-square problems, has a basic rank-revealing feature</td></tr>
-<tr><td colspan="7">Wrappers to external solvers</td></tr>
 <tr><td>PastixLLT \n PastixLDLT \n PastixLU</td><td>\link PaStiXSupport_Module PaStiXSupport \endlink</td><td>Direct LLt, LDLt, LU factorizations</td><td>SPD \n SPD \n Square</td><td>Fill-in reducing, Leverage fast dense algebra, Multithreading</td>
     <td>Requires the PaStiX package, \b CeCILL-C</td>
     <td>optimized for tough problems and symmetric patterns</td></tr>
@@ -53,6 +80,8 @@ They are summarized in the following table:
 
 Here \c SPD means symmetric positive definite.
 
+\section TutorialSparseSolverConcept Sparse solver concept
+
 All these solvers follow the same general concept.
 Here is a typical and general example:
 \code
@@ -104,9 +133,11 @@ x2 = solver.solve(b2);
 \endcode
 The compute() method is equivalent to calling both analyzePattern() and factorize().
 
-Finally, each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on.
+Each solver provides some specific features, such as determinant, access to the factors, controls of the iterations, and so on.
 More details are available in the documentations of the respective classes.
 
+Finally, most of the iterative solvers can also be used in a \b matrix-free context; see the following \link MatrixfreeSolverExample example \endlink.
+
 \section TheSparseCompute The Compute Step
 In the compute() function, the matrix is generally factorized: LLT for self-adjoint matrices, LDLT for general hermitian matrices, LU for non hermitian matrices and QR for rectangular matrices. These are the results of using direct solvers. For this class of solvers precisely, the compute step is further subdivided into analyzePattern() and factorize().
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox
index 95f6bf287..47c9b261f 100644
--- a/doc/TopicMultithreading.dox
+++ b/doc/TopicMultithreading.dox
@@ -43,6 +43,8 @@ int main(int argc, char** argv)
 }
 \endcode
 
+\note With Eigen 3.3 and a fully C++11 compliant compiler (i.e., thread-safe static local variable initialization), calling \c initParallel() is optional.
+
 \warning Note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For a thread-safe random generator, we recommend the use of boost::random or the C++11 random feature.
 
 In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section.
diff --git a/doc/TutorialReductionsVisitorsBroadcasting.dox b/doc/TutorialReductionsVisitorsBroadcasting.dox
index eb6787dbc..908a1b4b2 100644
--- a/doc/TutorialReductionsVisitorsBroadcasting.dox
+++ b/doc/TutorialReductionsVisitorsBroadcasting.dox
@@ -32,7 +32,7 @@ Eigen also provides the \link MatrixBase::norm() norm() \endlink method, which r
 These operations can also operate on matrices; in that case, an n-by-p matrix is seen as a vector of size (n*p), so for example the \link MatrixBase::norm() norm() \endlink method returns the "Frobenius" or "Hilbert-Schmidt" norm. We refrain from speaking of the \f$\ell^2\f$ norm of a matrix because that can mean different things.
 
-If you want other \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNorm<p>() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients.
+If you want other coefficient-wise \f$\ell^p\f$ norms, use the \link MatrixBase::lpNorm() lpNorm<p>() \endlink method. The template parameter \a p can take the special value \a Infinity if you want the \f$\ell^\infty\f$ norm, which is the maximum of the absolute values of the coefficients.
 
 The following example demonstrates these methods.
 
@@ -45,6 +45,17 @@ The following example demonstrates these methods.
 \verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.out
 </td></tr></table>
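[Editor's note: a tiny self-contained illustration of the lpNorm<p>() method discussed above, not part of the patch.]

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::VectorXd v(3);
  v << 1.0, -2.0, 3.0;
  std::cout << v.lpNorm<1>() << std::endl;                // 6 (sum of absolute values)
  std::cout << v.lpNorm<Eigen::Infinity>() << std::endl;  // 3 (max absolute value)
  std::cout << v.norm() << std::endl;                     // sqrt(14), same as lpNorm<2>()
  return 0;
}
\endcode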

+\b Operator \b norm: The 1-norm and \f$\infty\f$-norm matrix operator norms can easily be computed as follows:
+<table class="example">
+<tr><th>Example:</th><th>Output:</th></tr>
+<tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
+</td>
+<td>
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.out
+</td></tr></table>
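[Editor's note: for reference, the identities this example relies on are the standard operator-norm formulas; this note is not part of the patch.]

\f[
  \|A\|_1 = \max_{j}\sum_{i} |a_{ij}|, \qquad
  \|A\|_\infty = \max_{i}\sum_{j} |a_{ij}|,
\f]

so the 1-norm is a column-wise absolute sum followed by a maximum over columns, and the \f$\infty\f$-norm is the row-wise analogue, exactly as coded in the example above.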
+See below for more explanations on the syntax of these expressions.
+
 \subsection TutorialReductionsVisitorsBroadcastingReductionsBool Boolean reductions
 
 The following reductions operate on boolean values:
diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox
index 835c59354..fb07adaa2 100644
--- a/doc/TutorialSparse.dox
+++ b/doc/TutorialSparse.dox
@@ -83,7 +83,7 @@ There is no notion of compressed/uncompressed mode for a SparseVector.
 
 \section TutorialSparseExample First example
 
-Before describing each individual class, let's start with the following typical example: solving the Laplace equation \f$ \nabla u = 0 \f$ on a regular 2D grid using a finite difference scheme and Dirichlet boundary conditions.
+Before describing each individual class, let's start with the following typical example: solving the Laplace equation \f$ \Delta u = 0 \f$ on a regular 2D grid using a finite difference scheme and Dirichlet boundary conditions.
 Such problem can be mathematically expressed as a linear problem of the form \f$ Ax=b \f$ where \f$ x \f$ is the vector of \c m unknowns (in our case, the values of the pixels), \f$ b \f$ is the right hand side vector resulting from the boundary conditions, and \f$ A \f$ is an \f$ m \times m \f$ matrix containing only a few non-zero elements resulting from the discretization of the Laplacian operator.
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
new file mode 100644
index 000000000..62e28fc31
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_operatornorm.cpp
@@ -0,0 +1,18 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  MatrixXf m(2,2);
+  m << 1,-2,
+       -3,4;
+
+  cout << "1-norm(m)     = " << m.cwiseAbs().colwise().sum().maxCoeff()
+       << " == "             << m.colwise().lpNorm<1>().maxCoeff() << endl;
+
+  cout << "infty-norm(m) = " << m.cwiseAbs().rowwise().sum().maxCoeff()
+       << " == "             << m.rowwise().lpNorm<1>().maxCoeff() << endl;
+}
diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp
new file mode 100644
index 000000000..6a205aea3
--- /dev/null
+++ b/doc/examples/matrixfree_cg.cpp
@@ -0,0 +1,128 @@
+#include <iostream>
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <Eigen/IterativeLinearSolvers>
+#include <unsupported/Eigen/IterativeSolvers>
+
+class MatrixReplacement;
+using Eigen::SparseMatrix;
+
+namespace Eigen {
+namespace internal {
+  // MatrixReplacement looks like a SparseMatrix, so let's inherit its traits:
+  template<>
+  struct traits<MatrixReplacement> :  public Eigen::internal::traits<Eigen::SparseMatrix<double> >
+  {};
+}
+}
+
+// Example of a matrix-free wrapper from a user type to Eigen's compatible type.
+// For the sake of simplicity, this example simply wraps an Eigen::SparseMatrix.
+class MatrixReplacement : public Eigen::EigenBase<MatrixReplacement> {
+public:
+  // Required typedefs, constants, and method:
+  typedef double Scalar;
+  typedef double RealScalar;
+  typedef int StorageIndex;
+  enum {
+    ColsAtCompileTime = Eigen::Dynamic,
+    MaxColsAtCompileTime = Eigen::Dynamic,
+    IsRowMajor = false
+  };
+
+  Index rows() const { return mp_mat->rows(); }
+  Index cols() const { return mp_mat->cols(); }
+
+  template<typename Rhs>
+  Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct> operator*(const Eigen::MatrixBase<Rhs>& x) const {
+    return Eigen::Product<MatrixReplacement,Rhs,Eigen::AliasFreeProduct>(*this, x.derived());
+  }
+
+  // Custom API:
+  MatrixReplacement() : mp_mat(0) {}
+
+  void attachMyMatrix(const SparseMatrix<double> &mat) {
+    mp_mat = &mat;
+  }
+  const SparseMatrix<double> my_matrix() const { return *mp_mat; }
+
+private:
+  const SparseMatrix<double> *mp_mat;
+};
+
+
+// Implementation of MatrixReplacement * Eigen::DenseVector though a specialization of internal::generic_product_impl:
+namespace Eigen {
+namespace internal {
+
+  template<typename Rhs>
+  struct generic_product_impl<MatrixReplacement, Rhs, SparseShape, DenseShape, GemvProduct> // GEMV stands for matrix-vector
+  : generic_product_impl_base<MatrixReplacement,Rhs,generic_product_impl<MatrixReplacement,Rhs> >
+  {
+    typedef typename Product<MatrixReplacement,Rhs>::Scalar Scalar;
+
+    template<typename Dest>
+    static void scaleAndAddTo(Dest& dst, const MatrixReplacement& lhs, const Rhs& rhs, const Scalar& alpha)
+    {
+      // This method should implement "dst += alpha * lhs * rhs" inplace,
+      // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it.
+      assert(alpha==Scalar(1) && "scaling is not implemented");
+
+      // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs,
+      // but let's do something fancier (and less efficient):
+      for(Index i=0; i<lhs.cols(); ++i)
+        dst += rhs(i) * lhs.my_matrix().col(i);
+    }
+  };
+
+}
+}
+
+int main()
+{
+  int n = 10;
+  Eigen::SparseMatrix<double> S = Eigen::MatrixXd::Random(n,n).sparseView(0.5,1);
+  S = S.transpose()*S;
+
+  MatrixReplacement A;
+  A.attachMyMatrix(S);
+
+  Eigen::VectorXd b(n), x;
+  b.setRandom();
+
+  // Solve Ax = b using various iterative solvers with the matrix-free version:
+  {
+    Eigen::ConjugateGradient<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "CG:       #iterations: " << cg.iterations() << ", estimated error: " << cg.error() << std::endl;
+  }
+
+  {
+    Eigen::BiCGSTAB<MatrixReplacement, Eigen::IdentityPreconditioner> bicg;
+    bicg.compute(A);
+    x = bicg.solve(b);
+    std::cout << "BiCGSTAB: #iterations: " << bicg.iterations() << ", estimated error: " << bicg.error() << std::endl;
+  }
+
+  {
+    Eigen::GMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "GMRES:    #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::DGMRES<MatrixReplacement, Eigen::IdentityPreconditioner> gmres;
+    gmres.compute(A);
+    x = gmres.solve(b);
+    std::cout << "DGMRES:   #iterations: " << gmres.iterations() << ", estimated error: " << gmres.error() << std::endl;
+  }
+
+  {
+    Eigen::MINRES<MatrixReplacement, Eigen::Lower|Eigen::Upper, Eigen::IdentityPreconditioner> minres;
+    minres.compute(A);
+    x = minres.solve(b);
+    std::cout << "MINRES:   #iterations: " << minres.iterations() << ", estimated error: " << minres.error() << std::endl;
+  }
+}
diff --git a/doc/special_examples/random_cpp11.cpp b/doc/special_examples/random_cpp11.cpp
index ccd7c77d0..adc3c110c 100644
--- a/doc/special_examples/random_cpp11.cpp
+++ b/doc/special_examples/random_cpp11.cpp
@@ -7,7 +7,7 @@ using namespace Eigen;
 int main() {
   std::default_random_engine generator;
   std::poisson_distribution<int> distribution(4.1);
-  auto poisson = [&] (int) {return distribution(generator);};
+  auto poisson = [&] (Eigen::Index) {return distribution(generator);};
 
   RowVectorXi v = RowVectorXi::NullaryExpr(10, poisson );
   std::cout << v << "\n";
diff --git a/eigen3.pc.in b/eigen3.pc.in
index c5855de33..3368a3aa1 100644
--- a/eigen3.pc.in
+++ b/eigen3.pc.in
@@ -1,6 +1,9 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+
 Name: Eigen3
 Description: A
C++ template library for linear algebra: vectors, matrices, and related algorithms Requires: -Version: ${EIGEN_VERSION_NUMBER} +Version: @EIGEN_VERSION_NUMBER@ Libs: -Cflags: -I${INCLUDE_INSTALL_DIR} +Cflags: -I${prefix}/@INCLUDE_INSTALL_DIR@ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 767e82f21..bbebf29cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,7 +68,7 @@ else() ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ") endif() -find_package(SuperLU) +find_package(SuperLU 4.0) if(SUPERLU_FOUND) add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) @@ -236,6 +236,7 @@ ei_add_test(sparse_solvers) ei_add_test(sparse_permutations) ei_add_test(simplicial_cholesky) ei_add_test(conjugate_gradient) +ei_add_test(incomplete_cholesky) ei_add_test(bicgstab) ei_add_test(lscg) ei_add_test(sparselu) @@ -254,6 +255,19 @@ ei_add_test(special_numbers) ei_add_test(rvalue_types) ei_add_test(dense_storage) ei_add_test(ctorleak) +ei_add_test(mpl2only) + +check_cxx_compiler_flag("-ffast-math" COMPILER_SUPPORT_FASTMATH) +if(COMPILER_SUPPORT_FASTMATH) + set(EIGEN_FASTMATH_FLAGS "-ffast-math") +else() + check_cxx_compiler_flag("/fp:fast" COMPILER_SUPPORT_FPFAST) + if(COMPILER_SUPPORT_FPFAST) + set(EIGEN_FASTMATH_FLAGS "/fp:fast") + endif() +endif() + +ei_add_test(fastmath " ${EIGEN_FASTMATH_FLAGS} ") # # ei_add_test(denseLM) diff --git a/test/array.cpp b/test/array.cpp index 9f61c4b26..6adedfb06 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -202,7 +202,7 @@ template void array_real(const ArrayType& m) m2 = ArrayType::Random(rows, cols), m3(rows, cols), m4 = m1; - + m4 = (m4.abs()==Scalar(0)).select(1,m4); Scalar s1 = internal::random(); @@ -217,6 +217,11 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.sinh(), sinh(m1)); VERIFY_IS_APPROX(m1.cosh(), cosh(m1)); VERIFY_IS_APPROX(m1.tanh(), tanh(m1)); +#ifdef EIGEN_HAS_C99_MATH + VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.erf(), erf(m1)); + VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); +#endif // EIGEN_HAS_C99_MATH VERIFY_IS_APPROX(m1.arg(), arg(m1)); VERIFY_IS_APPROX(m1.round(), round(m1)); VERIFY_IS_APPROX(m1.floor(), floor(m1)); @@ -230,11 +235,13 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m1.square(), square(m1)); VERIFY_IS_APPROX(m1.cube(), cube(m1)); VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval())); + VERIFY_IS_APPROX(m1.sign(), sign(m1)); // avoid NaNs with abs() so verification doesn't fail m3 = m1.abs(); VERIFY_IS_APPROX(m3.sqrt(), sqrt(abs(m1))); + VERIFY_IS_APPROX(m3.rsqrt(), Scalar(1)/sqrt(abs(m1))); VERIFY_IS_APPROX(m3.log(), log(m3)); VERIFY_IS_APPROX(m3.log10(), log10(m3)); @@ -247,7 +254,7 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1))); VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1))); VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1)))); - VERIFY_IS_APPROX(arg(m1), ((ArrayType)(m1<0))*std::acos(-1.0)); + VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast())*std::acos(-1.0)); VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all()); VERIFY((Eigen::isnan)((m1*0.0)/0.0).all()); VERIFY((Eigen::isinf)(m4/0.0).all()); @@ -255,6 +262,9 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(inverse(inverse(m1)),m1); VERIFY((abs(m1) == m1 || abs(m1) == -m1).all()); VERIFY_IS_APPROX(m3, sqrt(abs2(m1))); + VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() ); + VERIFY_IS_APPROX( m1*m1.sign(),m1.abs()); + 
VERIFY_IS_APPROX(m1.sign() * m1.abs(), m1); VERIFY_IS_APPROX(numext::abs2(numext::real(m1)) + numext::abs2(numext::imag(m1)), numext::abs2(m1)); VERIFY_IS_APPROX(numext::abs2(real(m1)) + numext::abs2(imag(m1)), numext::abs2(m1)); @@ -288,6 +298,10 @@ template void array_real(const ArrayType& m) VERIFY_IS_APPROX(m3.pow(RealScalar(0.5)), m3.sqrt()); VERIFY_IS_APPROX(pow(m3,RealScalar(0.5)), m3.sqrt()); + + VERIFY_IS_APPROX(m3.pow(RealScalar(-0.5)), m3.rsqrt()); + VERIFY_IS_APPROX(pow(m3,RealScalar(-0.5)), m3.rsqrt()); + VERIFY_IS_APPROX(log10(m3), log(m3)/log(10)); // scalar by array division @@ -348,6 +362,7 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(m1.square(), square(m1)); VERIFY_IS_APPROX(m1.cube(), cube(m1)); VERIFY_IS_APPROX(cos(m1+RealScalar(3)*m2), cos((m1+RealScalar(3)*m2).eval())); + VERIFY_IS_APPROX(m1.sign(), sign(m1)); VERIFY_IS_APPROX(m1.exp() * m2.exp(), exp(m1+m2)); @@ -365,11 +380,15 @@ template void array_complex(const ArrayType& m) std::complex zero(0.0,0.0); VERIFY((Eigen::isnan)(m1*zero/zero).all()); +#if EIGEN_COMP_MSVC + // msvc complex division is not robust + VERIFY((Eigen::isinf)(m4/RealScalar(0)).all()); +#else #if EIGEN_COMP_CLANG - // clang's complex division is notoriously broken + // clang's complex division is notoriously broken too if((numext::isinf)(m4(0,0)/RealScalar(0))) { #endif - VERIFY((Eigen::isinf)(m4/zero).all()); + VERIFY((Eigen::isinf)(m4/zero).all()); #if EIGEN_COMP_CLANG } else @@ -377,6 +396,8 @@ template void array_complex(const ArrayType& m) VERIFY((Eigen::isinf)(m4.real()/zero.real()).all()); } #endif +#endif // MSVC + VERIFY(((Eigen::isfinite)(m1) && (!(Eigen::isfinite)(m1*zero/zero)) && (!(Eigen::isfinite)(m1/zero))).all()); VERIFY_IS_APPROX(inverse(inverse(m1)),m1); @@ -385,6 +406,9 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(abs(m1), sqrt(abs2(m1))); VERIFY_IS_APPROX(log10(m1), log(m1)/log(10)); + VERIFY_IS_APPROX( m1.sign(), -(-m1).sign() ); + VERIFY_IS_APPROX( m1.sign() * m1.abs(), m1); + // scalar by array division const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 3d895f2e0..6f22e1ab4 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -129,13 +129,6 @@ void test_dynalloc() for (int i=0; i() ); - CALL_SUBTEST(check_dynaligned() ); - CALL_SUBTEST(check_dynaligned() ); - CALL_SUBTEST(check_dynaligned() ); - CALL_SUBTEST(check_dynaligned() ); - CALL_SUBTEST(check_dynaligned() ); - CALL_SUBTEST( check_custom_new_delete() ); CALL_SUBTEST( check_custom_new_delete() ); CALL_SUBTEST( check_custom_new_delete() ); @@ -144,6 +137,16 @@ void test_dynalloc() // check static allocation, who knows ? 
#if EIGEN_MAX_STATIC_ALIGN_BYTES + for (int i=0; i() ); + CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); + CALL_SUBTEST(check_dynaligned() ); + } + { MyStruct foo0; VERIFY(size_t(foo0.avec.data())%ALIGNMENT==0); MyClassA fooA; VERIFY(size_t(fooA.avec.data())%ALIGNMENT==0); diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp index 0d4e2dc87..8e2bb9ef0 100644 --- a/test/eigensolver_complex.cpp +++ b/test/eigensolver_complex.cpp @@ -118,13 +118,19 @@ template void eigensolver(const MatrixType& m) MatrixType id = MatrixType::Identity(rows, cols); VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1)); - if (rows > 1) + if (rows > 1 && rows < 20) { // Test matrix with NaN a(0,0) = std::numeric_limits::quiet_NaN(); ComplexEigenSolver eiNaN(a); VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence); } + + // regression test for bug 1098 + { + ComplexEigenSolver eig(a.adjoint() * a); + eig.compute(a.adjoint() * a); + } } template void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index 566a4bdc6..a46a2e50e 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -39,6 +39,14 @@ template void generalized_eigensolver_real(const MatrixType VectorType realEigenvalues = eig.eigenvalues().real(); std::sort(realEigenvalues.data(), realEigenvalues.data()+realEigenvalues.size()); VERIFY_IS_APPROX(realEigenvalues, symmEig.eigenvalues()); + + // regression test for bug 1098 + { + GeneralizedSelfAdjointEigenSolver eig1(a.adjoint() * a,b.adjoint() * b); + eig1.compute(a.adjoint() * a,b.adjoint() * b); + GeneralizedEigenSolver eig2(a.adjoint() * a,b.adjoint() * b); + eig2.compute(a.adjoint() * a,b.adjoint() * b); + } } void test_eigensolver_generalized_real() diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index c5441ac4e..566546310 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -63,13 +63,19 @@ template void eigensolver(const MatrixType& m) MatrixType id = MatrixType::Identity(rows, cols); VERIFY_IS_APPROX(id.operatorNorm(), RealScalar(1)); - if (rows > 2) + if (rows > 2 && rows < 20) { // Test matrix with NaN a(0,0) = std::numeric_limits::quiet_NaN(); EigenSolver eiNaN(a); VERIFY_IS_EQUAL(eiNaN.info(), NoConvergence); } + + // regression test for bug 1098 + { + EigenSolver eig(a.adjoint() * a); + eig.compute(a.adjoint() * a); + } } template void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 41b6d99ab..f909761a1 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -130,13 +130,13 @@ template void selfadjointeigensolver(const MatrixType& m) Tridiagonalization tridiag(symmC); VERIFY_IS_APPROX(tridiag.diagonal(), tridiag.matrixT().diagonal()); VERIFY_IS_APPROX(tridiag.subDiagonal(), tridiag.matrixT().template diagonal<-1>()); - MatrixType T = tridiag.matrixT(); + Matrix T = tridiag.matrixT(); if(rows>1 && cols>1) { // FIXME check that upper and lower part are 0: //VERIFY(T.topRightCorner(rows-2, cols-2).template triangularView().isZero()); } - VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal().real()); - VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>().real()); + VERIFY_IS_APPROX(tridiag.diagonal(), T.diagonal()); + VERIFY_IS_APPROX(tridiag.subDiagonal(), T.template diagonal<1>()); 
VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView()), tridiag.matrixQ() * tridiag.matrixT().eval() * MatrixType(tridiag.matrixQ()).adjoint()); VERIFY_IS_APPROX(MatrixType(symmC.template selfadjointView()), tridiag.matrixQ() * tridiag.matrixT() * tridiag.matrixQ().adjoint()); @@ -149,13 +149,19 @@ template void selfadjointeigensolver(const MatrixType& m) VERIFY_IS_APPROX(tridiag.matrixT(), eiSymmTridiag.eigenvectors().real() * eiSymmTridiag.eigenvalues().asDiagonal() * eiSymmTridiag.eigenvectors().real().transpose()); } - if (rows > 1) + if (rows > 1 && rows < 20) { // Test matrix with NaN symmC(0,0) = std::numeric_limits::quiet_NaN(); SelfAdjointEigenSolver eiSymmNaN(symmC); VERIFY_IS_EQUAL(eiSymmNaN.info(), NoConvergence); } + + // regression test for bug 1098 + { + SelfAdjointEigenSolver eig(a.adjoint() * a); + eig.compute(a.adjoint() * a); + } } void bug_854() diff --git a/test/evaluators.cpp b/test/evaluators.cpp index f41968da8..876dffe22 100644 --- a/test/evaluators.cpp +++ b/test/evaluators.cpp @@ -2,6 +2,20 @@ #include "main.h" namespace Eigen { + + template + const Product + prod(const Lhs& lhs, const Rhs& rhs) + { + return Product(lhs,rhs); + } + + template + const Product + lazyprod(const Lhs& lhs, const Rhs& rhs) + { + return Product(lhs,rhs); + } template EIGEN_STRONG_INLINE @@ -69,9 +83,18 @@ namespace Eigen { typedef typename DstXprType::Scalar Scalar; call_assignment(dst.const_cast_derived(), src.const_cast_derived(), internal::swap_assign_op()); } + + namespace internal { + template class StorageBase, typename Src, typename Func> + EIGEN_DEVICE_FUNC void call_assignment(const NoAlias& dst, const Src& src, const Func& func) + { + call_assignment_no_alias(dst.expression(), src, func); + } + } } +template long get_cost(const XprType& ) { return Eigen::internal::evaluator::CoeffReadCost; } using namespace std; @@ -448,7 +471,6 @@ void test_evaluators() VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.triangularView(),A), MatrixXd(A.triangularView()*A)); VERIFY_IS_APPROX_EVALUATOR2(B, prod(A.selfadjointView(),A), MatrixXd(A.selfadjointView()*A)); - } { @@ -459,6 +481,19 @@ void test_evaluators() VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(d.asDiagonal(),A), MatrixXd(d.asDiagonal()*A)); VERIFY_IS_APPROX_EVALUATOR2(B, lazyprod(A,d.asDiagonal()), MatrixXd(A*d.asDiagonal())); - + } + + { + // test CoeffReadCost + Matrix4d a, b; + VERIFY_IS_EQUAL( get_cost(a), 1 ); + VERIFY_IS_EQUAL( get_cost(a+b), 3); + VERIFY_IS_EQUAL( get_cost(2*a+b), 4); + VERIFY_IS_EQUAL( get_cost(a*b), 1); + VERIFY_IS_EQUAL( get_cost(a.lazyProduct(b)), 15); + VERIFY_IS_EQUAL( get_cost(a*(a*b)), 1); + VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a*b)), 15); + VERIFY_IS_EQUAL( get_cost(a*(a+b)), 1); + VERIFY_IS_EQUAL( get_cost(a.lazyProduct(a+b)), 15); } } diff --git a/test/fastmath.cpp b/test/fastmath.cpp new file mode 100644 index 000000000..efdd5b313 --- /dev/null +++ b/test/fastmath.cpp @@ -0,0 +1,98 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +void check(bool b, bool ref) +{ + std::cout << b; + if(b==ref) + std::cout << " OK "; + else + std::cout << " BAD "; +} + +#if EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1800 +namespace std { + template bool (isfinite)(T x) { return _finite(x); } + template bool (isnan)(T x) { return _isnan(x); } + template bool (isinf)(T x) { return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF; } +} +#endif + +template +void check_inf_nan(bool dryrun) { + Matrix m(10); + m.setRandom(); + m(3) = std::numeric_limits::quiet_NaN(); + + if(dryrun) + { + std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(3)), false); std::cout << "\n"; + std::cout << "std::isinf(" << m(3) << ") = "; check((std::isinf)(m(3)),false); std::cout << " ; numext::isinf = "; check((numext::isinf)(m(3)), false); std::cout << "\n"; + std::cout << "std::isnan(" << m(3) << ") = "; check((std::isnan)(m(3)),true); std::cout << " ; numext::isnan = "; check((numext::isnan)(m(3)), true); std::cout << "\n"; + std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n"; + std::cout << "hasNaN: "; check(m.hasNaN(), 1); std::cout << "\n"; + std::cout << "\n"; + } + else + { + VERIFY( !(numext::isfinite)(m(3)) ); + VERIFY( !(numext::isinf)(m(3)) ); + VERIFY( (numext::isnan)(m(3)) ); + VERIFY( !m.allFinite() ); + VERIFY( m.hasNaN() ); + } + m(4) /= 0.0; + if(dryrun) + { + std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n"; + std::cout << "std::isinf(" << m(4) << ") = "; check((std::isinf)(m(4)),true); std::cout << " ; numext::isinf = "; check((numext::isinf)(m(4)), true); std::cout << "\n"; + std::cout << "std::isnan(" << m(4) << ") = "; check((std::isnan)(m(4)),false); std::cout << " ; numext::isnan = "; check((numext::isnan)(m(4)), false); std::cout << "\n"; + std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n"; + std::cout << "hasNaN: "; check(m.hasNaN(), 1); std::cout << "\n"; + std::cout << "\n"; + } + else + { + VERIFY( !(numext::isfinite)(m(4)) ); + VERIFY( (numext::isinf)(m(4)) ); + VERIFY( !(numext::isnan)(m(4)) ); + VERIFY( !m.allFinite() ); + VERIFY( m.hasNaN() ); + } + m(3) = 0; + if(dryrun) + { + std::cout << "std::isfinite(" << m(3) << ") = "; check((std::isfinite)(m(3)),true); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(3)), true); std::cout << "\n"; + std::cout << "std::isinf(" << m(3) << ") = "; check((std::isinf)(m(3)),false); std::cout << " ; numext::isinf = "; check((numext::isinf)(m(3)), false); std::cout << "\n"; + std::cout << "std::isnan(" << m(3) << ") = "; check((std::isnan)(m(3)),false); std::cout << " ; numext::isnan = "; check((numext::isnan)(m(3)), false); std::cout << "\n"; + std::cout << "allFinite: "; check(m.allFinite(), 0); std::cout << "\n"; + std::cout << "hasNaN: "; check(m.hasNaN(), 0); std::cout << "\n"; + std::cout << "\n\n"; + } + else + { + VERIFY( (numext::isfinite)(m(3)) ); + VERIFY( !(numext::isinf)(m(3)) ); + VERIFY( !(numext::isnan)(m(3)) ); + VERIFY( !m.allFinite() ); + VERIFY( !m.hasNaN() ); + } +} + +void test_fastmath() { + std::cout << "*** float *** \n\n"; check_inf_nan(true); + std::cout << "*** double ***\n\n"; check_inf_nan(true); + std::cout << "*** long double *** \n\n"; check_inf_nan(true); + + check_inf_nan(false); + check_inf_nan(false); + check_inf_nan(false); +} diff --git 
a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp index e2792ed18..2bdb4b7f2 100644 --- a/test/geo_alignedbox.cpp +++ b/test/geo_alignedbox.cpp @@ -16,7 +16,7 @@ using namespace std; template EIGEN_DONT_INLINE -void kill_extra_precision(T& x) { eigen_assert(&x != 0); } +void kill_extra_precision(T& x) { eigen_assert((void*)(&x) != (void*)0); } template void alignedbox(const BoxType& _box) @@ -179,6 +179,8 @@ void test_geo_alignedbox() CALL_SUBTEST_9( alignedbox(AlignedBox1i()) ); CALL_SUBTEST_10( alignedbox(AlignedBox2i()) ); CALL_SUBTEST_11( alignedbox(AlignedBox3i()) ); + + CALL_SUBTEST_14( alignedbox(AlignedBox(4)) ); } CALL_SUBTEST_12( specificTest1() ); CALL_SUBTEST_13( specificTest2() ); diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 17229be4a..761bb52b4 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -49,6 +49,7 @@ template void quaternion(void) */ using std::abs; typedef Matrix Vector3; + typedef Matrix Matrix3; typedef Matrix Vector4; typedef Quaternion Quaternionx; typedef AngleAxis AngleAxisx; @@ -101,6 +102,11 @@ template void quaternion(void) q2 = q1.toRotationMatrix(); VERIFY_IS_APPROX(q1*v1,q2*v1); + Matrix3 rot1(q1); + VERIFY_IS_APPROX(q1*v1,rot1*v1); + Quaternionx q3(rot1.transpose()*rot1); + VERIFY_IS_APPROX(q3*v1,v1); + // angle-axis conversion AngleAxisx aa = AngleAxisx(q1); diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index d50c7c76a..51f90036d 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -12,6 +12,12 @@ #include #include +template +Matrix angleToVec(T a) +{ + return Matrix(std::cos(a), std::sin(a)); +} + template void non_projective_only() { /* this test covers the following files: @@ -130,14 +136,16 @@ template void transformations() AngleAxisx aa = AngleAxisx(q1); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - if(abs(aa.angle()) > NumTraits::dummy_precision()) + // The following test is stable only if 2*angle != angle and v1 is not collinear with axis + if( (abs(aa.angle()) > test_precision()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision())) ) { VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); } aa.fromRotationMatrix(aa.toRotationMatrix()); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - if(abs(aa.angle()) > NumTraits::dummy_precision()) + // The following test is stable only if 2*angle != angle and v1 is not collinear with axis + if( (abs(aa.angle()) > test_precision()) && (abs(aa.axis().dot(v1.normalized()))<(Scalar(1)-Scalar(4)*test_precision())) ) { VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); } @@ -214,7 +222,9 @@ template void transformations() t4 *= aa3; VERIFY_IS_APPROX(t3.matrix(), t4.matrix()); - v3 = Vector3::Random(); + do { + v3 = Vector3::Random(); + } while (v3.cwiseAbs().minCoeff()::epsilon()); Translation3 tv3(v3); Transform3 t5(tv3); t4 = tv3; @@ -414,14 +424,16 @@ template void transformations() Scalar angle = internal::random(-100,100); Rotation2D rot2(angle); VERIFY( rot2.smallestPositiveAngle() >= 0 ); - VERIFY( rot2.smallestPositiveAngle() < Scalar(2)*Scalar(EIGEN_PI) ); - VERIFY_IS_APPROX( std::cos(rot2.smallestPositiveAngle()), std::cos(rot2.angle()) ); - VERIFY_IS_APPROX( std::sin(rot2.smallestPositiveAngle()), std::sin(rot2.angle()) ); + VERIFY( rot2.smallestPositiveAngle() <= Scalar(2)*Scalar(EIGEN_PI) ); + VERIFY_IS_APPROX( angleToVec(rot2.smallestPositiveAngle()), angleToVec(rot2.angle()) ); VERIFY( rot2.smallestAngle()
>= -Scalar(EIGEN_PI) ); VERIFY( rot2.smallestAngle() <= Scalar(EIGEN_PI) ); - VERIFY_IS_APPROX( std::cos(rot2.smallestAngle()), std::cos(rot2.angle()) ); - VERIFY_IS_APPROX( std::sin(rot2.smallestAngle()), std::sin(rot2.angle()) ); + VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()), angleToVec(rot2.angle()) ); + + Matrix rot2_as_mat(rot2); + Rotation2D rot3(rot2_as_mat); + VERIFY_IS_APPROX( angleToVec(rot2.smallestAngle()), angleToVec(rot3.angle()) ); } s0 = internal::random(-100,100); @@ -437,7 +449,7 @@ template void transformations() VERIFY_IS_APPROX(t20,t21); VERIFY_IS_APPROX(s0, (R0.slerp(0, R1)).angle()); - VERIFY_IS_APPROX(R1.smallestPositiveAngle(), (R0.slerp(1, R1)).smallestPositiveAngle()); + VERIFY_IS_APPROX( angleToVec(R1.smallestPositiveAngle()), angleToVec((R0.slerp(1, R1)).smallestPositiveAngle()) ); VERIFY_IS_APPROX(R0.smallestPositiveAngle(), (R0.slerp(0.5, R0)).smallestPositiveAngle()); if(std::cos(s0)>0) @@ -447,13 +459,14 @@ template void transformations() // Check path length Scalar l = 0; - for(int k=0; k<100; ++k) + int path_steps = 100; + for(int k=0; k::epsilon()*Scalar(path_steps/2))); // check basic features { diff --git a/unsupported/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp similarity index 96% rename from unsupported/test/incomplete_cholesky.cpp rename to test/incomplete_cholesky.cpp index cc2ed698e..435e2839a 100644 --- a/unsupported/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -18,7 +18,7 @@ template void test_incomplete_cholesky_T() ConjugateGradient > > cg_illt_lower_amd; ConjugateGradient > > cg_illt_lower_nat; ConjugateGradient > > cg_illt_upper_amd; - ConjugateGradient > > cg_illt_upper_nat; + ConjugateGradient > > cg_illt_upper_nat; CALL_SUBTEST( check_sparse_spd_solving(cg_illt_lower_amd) ); diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp index 318ba8717..6d7904bac 100644 --- a/test/is_same_dense.cpp +++ b/test/is_same_dense.cpp @@ -11,9 +11,10 @@ void test_is_same_dense() { - MatrixXd m1(10,10); - Ref ref_m1(m1); - Ref const_ref_m1(m1); + typedef Matrix ColMatrixXd; + ColMatrixXd m1(10,10); + Ref ref_m1(m1); + Ref const_ref_m1(m1); VERIFY(is_same_dense(m1,m1)); VERIFY(is_same_dense(m1,ref_m1)); VERIFY(is_same_dense(const_ref_m1,m1)); @@ -22,9 +23,9 @@ void test_is_same_dense() VERIFY(is_same_dense(m1.block(0,0,m1.rows(),m1.cols()),m1)); VERIFY(!is_same_dense(m1.row(0),m1.col(0))); - Ref const_ref_m1_row(m1.row(1)); + Ref const_ref_m1_row(m1.row(1)); VERIFY(!is_same_dense(m1.row(1),const_ref_m1_row)); - Ref const_ref_m1_col(m1.col(1)); + Ref const_ref_m1_col(m1.col(1)); VERIFY(is_same_dense(m1.col(1),const_ref_m1_col)); } diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp index 3c7cdbe41..292f33969 100644 --- a/test/linearstructure.cpp +++ b/test/linearstructure.cpp @@ -108,9 +108,11 @@ void test_linearstructure() CALL_SUBTEST_7( linearStructure(MatrixXi (internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_8( linearStructure(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_9( linearStructure(ArrayXXf (internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_10( linearStructure(ArrayXXcf (internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); - CALL_SUBTEST_10( real_complex() ); - CALL_SUBTEST_10( real_complex(10,10) ); + CALL_SUBTEST_11( real_complex() ); + CALL_SUBTEST_11( real_complex(10,10) ); + 
CALL_SUBTEST_11( real_complex(10,10) ); } #ifdef EIGEN_TEST_PART_4 diff --git a/test/lu.cpp b/test/lu.cpp index b90367438..f14435114 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -92,6 +92,26 @@ template void lu_non_invertible() // test that the code, which does resize(), may be applied to an xpr m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3); VERIFY_IS_APPROX(m3, m1*m2); + + // test solve with transposed + m3 = MatrixType::Random(rows,cols2); + m2 = m1.transpose()*m3; + m3 = MatrixType::Random(rows,cols2); + lu.template _solve_impl_transposed(m2, m3); + VERIFY_IS_APPROX(m2, m1.transpose()*m3); + m3 = MatrixType::Random(rows,cols2); + m3 = lu.transpose().solve(m2); + VERIFY_IS_APPROX(m2, m1.transpose()*m3); + + // test solve with conjugate transposed + m3 = MatrixType::Random(rows,cols2); + m2 = m1.adjoint()*m3; + m3 = MatrixType::Random(rows,cols2); + lu.template _solve_impl_transposed(m2, m3); + VERIFY_IS_APPROX(m2, m1.adjoint()*m3); + m3 = MatrixType::Random(rows,cols2); + m3 = lu.adjoint().solve(m2); + VERIFY_IS_APPROX(m2, m1.adjoint()*m3); } template void lu_invertible() @@ -125,6 +145,20 @@ template void lu_invertible() VERIFY_IS_APPROX(m3, m1*m2); VERIFY_IS_APPROX(m2, lu.inverse()*m3); + // test solve with transposed + lu.template _solve_impl_transposed(m3, m2); + VERIFY_IS_APPROX(m3, m1.transpose()*m2); + m3 = MatrixType::Random(size,size); + m3 = lu.transpose().solve(m2); + VERIFY_IS_APPROX(m2, m1.transpose()*m3); + + // test solve with conjugate transposed + lu.template _solve_impl_transposed(m3, m2); + VERIFY_IS_APPROX(m3, m1.adjoint()*m2); + m3 = MatrixType::Random(size,size); + m3 = lu.adjoint().solve(m2); + VERIFY_IS_APPROX(m2, m1.adjoint()*m3); + // Regression test for Bug 302 MatrixType m4 = MatrixType::Random(size,size); VERIFY_IS_APPROX(lu.solve(m3*m4), lu.solve(m3)*m4); @@ -136,14 +170,32 @@ template void lu_partial_piv() PartialPivLU.h */ typedef typename MatrixType::Index Index; - Index rows = internal::random(1,4); - Index cols = rows; + Index size = internal::random(1,4); - MatrixType m1(cols, rows); + MatrixType m1(size, size), m2(size, size), m3(size, size); m1.setRandom(); PartialPivLU plu(m1); VERIFY_IS_APPROX(m1, plu.reconstructedMatrix()); + + m3 = MatrixType::Random(size,size); + m2 = plu.solve(m3); + VERIFY_IS_APPROX(m3, m1*m2); + VERIFY_IS_APPROX(m2, plu.inverse()*m3); + + // test solve with transposed + plu.template _solve_impl_transposed(m3, m2); + VERIFY_IS_APPROX(m3, m1.transpose()*m2); + m3 = MatrixType::Random(size,size); + m3 = plu.transpose().solve(m2); + VERIFY_IS_APPROX(m2, m1.transpose()*m3); + + // test solve with conjugate transposed + plu.template _solve_impl_transposed(m3, m2); + VERIFY_IS_APPROX(m3, m1.adjoint()*m2); + m3 = MatrixType::Random(size,size); + m3 = plu.adjoint().solve(m2); + VERIFY_IS_APPROX(m2, m1.adjoint()*m3); } template void lu_verify_assert() diff --git a/test/metis_support.cpp b/test/metis_support.cpp index 932b04074..d87c56a13 100644 --- a/test/metis_support.cpp +++ b/test/metis_support.cpp @@ -3,24 +3,10 @@ // // Copyright (C) 2012 Désiré Nuentsa-Wakam // -// Eigen is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 3 of the License, or (at your option) any later version. 
-// -// Alternatively, you can redistribute it and/or -// modify it under the terms of the GNU General Public License as -// published by the Free Software Foundation; either version 2 of -// the License, or (at your option) any later version. -// -// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License and a copy of the GNU General Public License along with -// Eigen. If not, see . +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + #include "sparse_solver.h" #include #include diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 71f099bb8..32d9d0be9 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // Copyright (C) 2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -15,9 +15,13 @@ #define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them #endif -// #ifndef EIGEN_DONT_VECTORIZE -// #define EIGEN_DONT_VECTORIZE // SSE intrinsics aren't designed to allow mixing types -// #endif +#if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3) + +#ifndef EIGEN_DONT_VECTORIZE +#define EIGEN_DONT_VECTORIZE +#endif + +#endif #include "main.h" @@ -56,10 +60,12 @@ template void mixingtypes(int size = SizeAtCompileType) // this one does not even compile with C++11 VERIFY_RAISES_ASSERT(mf+mcf); #endif - // the following do not even compile since the introduction of evaluators -// VERIFY_RAISES_ASSERT(vf=vd); -// VERIFY_RAISES_ASSERT(vf+=vd); -// VERIFY_RAISES_ASSERT(mcd=md); + +#ifdef EIGEN_DONT_VECTORIZE + VERIFY_RAISES_ASSERT(vf=vd); + VERIFY_RAISES_ASSERT(vf+=vd); + VERIFY_RAISES_ASSERT(mcd=md); +#endif // check scalar products VERIFY_IS_APPROX(vcf * sf , vcf * complex(sf)); @@ -79,6 +85,7 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(vcd.asDiagonal() * md, vcd.asDiagonal() * md.template cast >()); VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast >().asDiagonal()); VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast >() * vcd.asDiagonal()); + // vd.asDiagonal() * mf; // does not even compile // vcd.asDiagonal() * mf; // does not even compile @@ -148,5 +155,9 @@ void test_mixingtypes() CALL_SUBTEST_1(mixingtypes<3>()); CALL_SUBTEST_2(mixingtypes<4>()); CALL_SUBTEST_3(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + + CALL_SUBTEST_4(mixingtypes<3>()); + CALL_SUBTEST_5(mixingtypes<4>()); + CALL_SUBTEST_6(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); } } diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp new file mode 100644 index 000000000..5ef0d2b2e --- /dev/null +++ b/test/mpl2only.cpp @@ -0,0 +1,20 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_MPL2_ONLY +#include +#include +#include +#include +#include + +int main() +{ + return 0; +} diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp index 6e772c70f..2f5025305 100644 --- a/test/nesting_ops.cpp +++ b/test/nesting_ops.cpp @@ -2,14 +2,35 @@ // for linear algebra. // // Copyright (C) 2010 Hauke Heibel +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define TEST_ENABLE_TEMPORARY_TRACKING + #include "main.h" -template void run_nesting_ops(const MatrixType& _m) +template +void use_n_times(const XprType &xpr) +{ + typename internal::nested_eval::type mat(xpr); + typename XprType::PlainObject res(mat.rows(), mat.cols()); + nb_temporaries--; // remove res + res.setZero(); + for(int i=0; i +bool verify_eval_type(const XprType &, const ReferenceType&) +{ + typedef typename internal::nested_eval::type EvalType; + return internal::is_same::type, typename internal::remove_all::type>::value; +} + +template void run_nesting_ops_1(const MatrixType& _m) { typename internal::nested_eval::type m(_m); @@ -24,10 +45,63 @@ template void run_nesting_ops(const MatrixType& _m) VERIFY_IS_APPROX( (m.transpose() * m).array().abs().sum(), (m.transpose() * m).array().abs().sum() ); } +template void run_nesting_ops_2(const MatrixType& _m) +{ + typedef typename MatrixType::Scalar Scalar; + Index rows = _m.rows(); + Index cols = _m.cols(); + MatrixType m1 = MatrixType::Random(rows,cols); + Matrix m2; + + if((MatrixType::SizeAtCompileTime==Dynamic)) + { + VERIFY_EVALUATION_COUNT( use_n_times<1>(m1 + m1*m1), 1 ); + VERIFY_EVALUATION_COUNT( use_n_times<10>(m1 + m1*m1), 1 ); + + VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.template triangularView().solve(m1.col(0))), 1 ); + VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.template triangularView().solve(m1.col(0))), 1 ); + + VERIFY_EVALUATION_COUNT( use_n_times<1>(Scalar(2)*m1.template triangularView().solve(m1.col(0))), 2 ); // FIXME could be one by applying the scaling in-place on the solve result + VERIFY_EVALUATION_COUNT( use_n_times<1>(m1.col(0)+m1.template triangularView().solve(m1.col(0))), 2 ); // FIXME could be one by adding m1.col() inplace + VERIFY_EVALUATION_COUNT( use_n_times<10>(m1.col(0)+m1.template triangularView().solve(m1.col(0))), 2 ); + } + + { + VERIFY( verify_eval_type<10>(m1, m1) ); + if(!NumTraits::IsComplex) + { + VERIFY( verify_eval_type<3>(2*m1, 2*m1) ); + VERIFY( verify_eval_type<4>(2*m1, m1) ); + } + else + { + VERIFY( verify_eval_type<1>(2*m1, 2*m1) ); + VERIFY( verify_eval_type<2>(2*m1, m1) ); + } + VERIFY( verify_eval_type<2>(m1+m1, m1+m1) ); + VERIFY( verify_eval_type<3>(m1+m1, m1) ); + VERIFY( verify_eval_type<1>(m1*m1.transpose(), m2) ); + VERIFY( verify_eval_type<1>(m1*(m1+m1).transpose(), m2) ); + VERIFY( verify_eval_type<2>(m1*m1.transpose(), m2) ); + VERIFY( verify_eval_type<1>(m1+m1*m1, m1) ); + + VERIFY( verify_eval_type<1>(m1.template triangularView().solve(m1), m1) ); + VERIFY( verify_eval_type<1>(m1+m1.template triangularView().solve(m1), m1) ); + } +} + + void test_nesting_ops() { - CALL_SUBTEST_1(run_nesting_ops(MatrixXf::Random(25,25))); - CALL_SUBTEST_2(run_nesting_ops(MatrixXd::Random(25,25))); - CALL_SUBTEST_3(run_nesting_ops(Matrix4f::Random())); - 
CALL_SUBTEST_4(run_nesting_ops(Matrix4d::Random())); + CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25))); + CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25))); + CALL_SUBTEST_3(run_nesting_ops_1(Matrix4f::Random())); + CALL_SUBTEST_4(run_nesting_ops_1(Matrix2d::Random())); + + Index s = internal::random(1,EIGEN_TEST_MAX_SIZE); + CALL_SUBTEST_1( run_nesting_ops_2(MatrixXf(s,s)) ); + CALL_SUBTEST_2( run_nesting_ops_2(MatrixXcd(s,s)) ); + CALL_SUBTEST_3( run_nesting_ops_2(Matrix4f()) ); + CALL_SUBTEST_4( run_nesting_ops_2(Matrix2d()) ); + TEST_SET_BUT_UNUSED_VARIABLE(s) } diff --git a/test/nullary.cpp b/test/nullary.cpp index 2c148e205..4844f2952 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -123,6 +123,8 @@ void test_nullary() CALL_SUBTEST_6( testVectorType(Vector3d()) ); CALL_SUBTEST_7( testVectorType(VectorXf(internal::random(1,300))) ); CALL_SUBTEST_8( testVectorType(Vector3f()) ); + CALL_SUBTEST_8( testVectorType(Vector4f()) ); + CALL_SUBTEST_8( testVectorType(Matrix()) ); CALL_SUBTEST_8( testVectorType(Matrix()) ); } } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index b2b1cadc9..e09a361bf 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -18,7 +18,9 @@ template T negate(const T& x) { return -x; } } } -template bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits::Real& refvalue) +// NOTE: we disable inlining for this function to work around a GCC issue when using -O3 and the i387 FPU. +template EIGEN_DONT_INLINE +bool isApproxAbs(const Scalar& a, const Scalar& b, const typename NumTraits::Real& refvalue) { return internal::isMuchSmallerThan(a-b, refvalue); } @@ -29,7 +31,7 @@ template bool areApproxAbs(const Scalar* a, const Scalar* b, in { if (!isApproxAbs(a[i],b[i],refvalue)) { - std::cout << "[" << Map >(a,size) << "]" << " != " << Map >(b,size) << "\n"; + std::cout << "ref: [" << Map >(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; return false; } } @@ -42,21 +44,13 @@ template bool areApprox(const Scalar* a, const Scalar* b, int s { if (a[i]!=b[i] && !internal::isApprox(a[i],b[i])) { - std::cout << "[" << Map >(a,size) << "]" << " != " << Map >(b,size) << "\n"; + std::cout << "ref: [" << Map >(a,size) << "]" << " != vec: [" << Map >(b,size) << "]\n"; return false; } } return true; } - -#define CHECK_CWISE2(REFOP, POP) { \ - for (int i=0; i(data1), internal::pload(data1+PacketSize))); \ - VERIFY(areApprox(ref, data2, PacketSize) && #POP); \ -} - #define CHECK_CWISE1(REFOP, POP) { \ for (int i=0; i VERIFY(areApprox(ref, data2, PacketSize) && #POP); \ } +#define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \ + packet_helper<COND,Packet> h; \ + for (int i=0; i<PacketSize; ++i) \ + ref[i] = REFOP(data1[i], data1[i+PacketSize]); \ + h.store(data2, POP(h.load(data1), h.load(data1+PacketSize))); \ + VERIFY(areApprox(ref, data2, PacketSize) && #POP); \ +} template void packetmath() { using std::abs; - typedef typename internal::packet_traits::type Packet; - const int PacketSize = internal::packet_traits::size; + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; + const int PacketSize = PacketTraits::size; typedef typename NumTraits::Real RealScalar; const int max_size = PacketSize > 4 ?
PacketSize : 4; @@ -153,13 +156,17 @@ template void packetmath() VERIFY(areApprox(ref, data2, PacketSize) && "internal::palign"); } - CHECK_CWISE2(REF_ADD, internal::padd); - CHECK_CWISE2(REF_SUB, internal::psub); - CHECK_CWISE2(REF_MUL, internal::pmul); - #if !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - if (!internal::is_same::value) - CHECK_CWISE2(REF_DIV, internal::pdiv); - #endif + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasAdd); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasSub); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMul); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasNegate); + VERIFY((internal::is_same::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv); + + CHECK_CWISE2_IF(PacketTraits::HasAdd, REF_ADD, internal::padd); + CHECK_CWISE2_IF(PacketTraits::HasSub, REF_SUB, internal::psub); + CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul); + CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv); + CHECK_CWISE1(internal::negate, internal::pnegate); CHECK_CWISE1(numext::conj, internal::pconj); @@ -262,7 +269,7 @@ template void packetmath() } } - if (internal::packet_traits::HasBlend) { + if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); EIGEN_ALIGN_MAX internal::Selector selector; @@ -282,42 +289,47 @@ template void packetmath() template void packetmath_real() { using std::abs; - typedef typename internal::packet_traits::type Packet; - const int PacketSize = internal::packet_traits::size; + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; + const int PacketSize = PacketTraits::size; const int size = PacketSize*4; - EIGEN_ALIGN_MAX Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN_MAX Scalar data2[internal::packet_traits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; for (int i=0; i(-1,1) * std::pow(Scalar(10), internal::random(-3,3)); data2[i] = internal::random(-1,1) * std::pow(Scalar(10), internal::random(-3,3)); } - CHECK_CWISE1_IF(internal::packet_traits::HasSin, std::sin, internal::psin); - CHECK_CWISE1_IF(internal::packet_traits::HasCos, std::cos, internal::pcos); - CHECK_CWISE1_IF(internal::packet_traits::HasTan, std::tan, internal::ptan); + CHECK_CWISE1_IF(PacketTraits::HasSin, std::sin, internal::psin); + CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos); + CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan); + + CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround); + CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); + CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); for (int i=0; i(-1,1); data2[i] = internal::random(-1,1); } - CHECK_CWISE1_IF(internal::packet_traits::HasASin, std::asin, internal::pasin); - CHECK_CWISE1_IF(internal::packet_traits::HasACos, std::acos, internal::pacos); + CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin); + CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos); for (int i=0; i(-87,88); data2[i] = internal::random(-87,88); } - CHECK_CWISE1_IF(internal::packet_traits::HasExp, std::exp, internal::pexp); - if(internal::packet_traits::HasExp && internal::packet_traits::size>=2) + 
CHECK_CWISE1_IF(PacketTraits::HasExp, std::exp, internal::pexp); + if(PacketTraits::HasExp && PacketTraits::size>=2) { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); - packet_helper::HasExp,Packet> h; + packet_helper h; h.store(data2, internal::pexp(h.load(data1))); VERIFY((numext::isnan)(data2[0])); VERIFY_IS_EQUAL(std::exp(std::numeric_limits::epsilon()), data2[1]); @@ -326,7 +338,7 @@ template void packetmath_real() data1[1] = 0; h.store(data2, internal::pexp(h.load(data1))); VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::epsilon()), data2[0]); - VERIFY_IS_EQUAL(std::exp(0), data2[1]); + VERIFY_IS_EQUAL(std::exp(Scalar(0)), data2[1]); data1[0] = (std::numeric_limits::min)(); data1[1] = -(std::numeric_limits::min)(); @@ -341,20 +353,48 @@ template void packetmath_real() VERIFY_IS_EQUAL(std::exp(-std::numeric_limits::denorm_min()), data2[1]); } +#ifdef EIGEN_HAS_C99_MATH + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasLGamma,Packet> h; + h.store(data2, internal::plgamma(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasErf,Packet> h; + h.store(data2, internal::perf(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + data1[0] = std::numeric_limits::quiet_NaN(); + packet_helper::HasErfc,Packet> h; + h.store(data2, internal::perfc(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } +#endif // EIGEN_HAS_C99_MATH + for (int i=0; i(0,1) * std::pow(Scalar(10), internal::random(-6,6)); data2[i] = internal::random(0,1) * std::pow(Scalar(10), internal::random(-6,6)); } + if(internal::random(0,1)<0.1) data1[internal::random(0, PacketSize)] = 0; - CHECK_CWISE1_IF(internal::packet_traits::HasSqrt, std::sqrt, internal::psqrt); - CHECK_CWISE1_IF(internal::packet_traits::HasLog, std::log, internal::plog); - if(internal::packet_traits::HasLog && internal::packet_traits::size>=2) + CHECK_CWISE1_IF(PacketTraits::HasSqrt, std::sqrt, internal::psqrt); + CHECK_CWISE1_IF(PacketTraits::HasLog, std::log, internal::plog); +#if defined(EIGEN_HAS_C99_MATH) && (__cplusplus > 199711L) + CHECK_CWISE1_IF(internal::packet_traits::HasLGamma, std::lgamma, internal::plgamma); + CHECK_CWISE1_IF(internal::packet_traits::HasErf, std::erf, internal::perf); + CHECK_CWISE1_IF(internal::packet_traits::HasErfc, std::erfc, internal::perfc); +#endif + + if(PacketTraits::HasLog && PacketTraits::size>=2) { data1[0] = std::numeric_limits::quiet_NaN(); data1[1] = std::numeric_limits::epsilon(); - packet_helper::HasLog,Packet> h; + packet_helper h; h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isnan)(data2[0])); VERIFY_IS_EQUAL(std::log(std::numeric_limits::epsilon()), data2[1]); @@ -363,7 +403,7 @@ template void packetmath_real() data1[1] = 0; h.store(data2, internal::plog(h.load(data1))); VERIFY((numext::isnan)(data2[0])); - VERIFY_IS_EQUAL(std::log(0), data2[1]); + VERIFY_IS_EQUAL(std::log(Scalar(0)), data2[1]); data1[0] = (std::numeric_limits::min)(); data1[1] = -(std::numeric_limits::min)(); @@ -391,22 +431,26 @@ template void packetmath_real() template void packetmath_notcomplex() { using std::abs; - typedef typename internal::packet_traits::type Packet; - const int PacketSize = internal::packet_traits::size; + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; + const int PacketSize = PacketTraits::size; - EIGEN_ALIGN_MAX Scalar data1[internal::packet_traits::size*4]; - EIGEN_ALIGN_MAX Scalar 
data2[internal::packet_traits::size*4]; - EIGEN_ALIGN_MAX Scalar ref[internal::packet_traits::size*4]; + EIGEN_ALIGN_MAX Scalar data1[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketTraits::size*4]; + EIGEN_ALIGN_MAX Scalar ref[PacketTraits::size*4]; - Array::Map(data1, internal::packet_traits::size*4).setRandom(); + Array::Map(data1, PacketTraits::size*4).setRandom(); ref[0] = data1[0]; for (int i=0; i(data1))) && "internal::predux_min"); - CHECK_CWISE2((std::min), internal::pmin); - CHECK_CWISE2((std::max), internal::pmax); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMin); + VERIFY((!PacketTraits::Vectorizable) || PacketTraits::HasMax); + + CHECK_CWISE2_IF(PacketTraits::HasMin, (std::min), internal::pmin); + CHECK_CWISE2_IF(PacketTraits::HasMax, (std::max), internal::pmax); CHECK_CWISE1(abs, internal::pabs); ref[0] = data1[0]; @@ -422,8 +466,9 @@ template void packetmath_notcomplex() template void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { - typedef typename internal::packet_traits::type Packet; - const int PacketSize = internal::packet_traits::size; + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; + const int PacketSize = PacketTraits::size; internal::conj_if cj0; internal::conj_if cj1; @@ -450,8 +495,9 @@ template void test_conj_helper(Scalar template void packetmath_complex() { - typedef typename internal::packet_traits::type Packet; - const int PacketSize = internal::packet_traits::size; + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; + const int PacketSize = PacketTraits::size; const int size = PacketSize*4; EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; @@ -478,10 +524,12 @@ template void packetmath_complex() } } -template void packetmath_scatter_gather() { - typedef typename internal::packet_traits::type Packet; +template void packetmath_scatter_gather() +{ + typedef internal::packet_traits PacketTraits; + typedef typename PacketTraits::type Packet; typedef typename NumTraits::Real RealScalar; - const int PacketSize = internal::packet_traits::size; + const int PacketSize = PacketTraits::size; EIGEN_ALIGN_MAX Scalar data1[PacketSize]; RealScalar refvalue = 0; for (int i=0; i void product(const MatrixType& m) vcres.noalias() -= m1.transpose() * v1; VERIFY_IS_APPROX(vcres, vc2 - m1.transpose() * v1); + // test d ?= a+b*c rules + res.noalias() = square + m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square + m1 * m2.transpose()); + res.noalias() += square + m1 * m2.transpose(); + VERIFY_IS_APPROX(res, 2*(square + m1 * m2.transpose())); + res.noalias() -= square + m1 * m2.transpose(); + VERIFY_IS_APPROX(res, square + m1 * m2.transpose()); + + tm1 = m1; VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1); VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1); diff --git a/test/product_large.cpp b/test/product_large.cpp index 84c489580..7207973c2 100644 --- a/test/product_large.cpp +++ b/test/product_large.cpp @@ -61,6 +61,17 @@ void test_product_large() MatrixXf r2 = mat1.row(2)*mat2; VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval()); } + + { + Eigen::MatrixXd A(10,10), B, C; + A.setRandom(); + C = A; + for(int k=0; k<79; ++k) + C = C * A; + B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))) + * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * 
((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); + VERIFY_IS_APPROX(B,C); + } #endif // Regression test for bug 714: diff --git a/test/product_notemporary.cpp b/test/product_notemporary.cpp index 9fa69d901..ff93cb881 100644 --- a/test/product_notemporary.cpp +++ b/test/product_notemporary.cpp @@ -47,6 +47,10 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * (m1 * m2.transpose()), 0); + VERIFY_EVALUATION_COUNT( m3.noalias() = m3 + m1 * m2.transpose(), 0); + VERIFY_EVALUATION_COUNT( m3.noalias() += m3 + m1 * m2.transpose(), 0); + VERIFY_EVALUATION_COUNT( m3.noalias() -= m3 + m1 * m2.transpose(), 0); + VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * m2.adjoint(), 0); VERIFY_EVALUATION_COUNT( m3.noalias() = s1 * m1 * s2 * (m1*s3+m2*s2).adjoint(), 1); VERIFY_EVALUATION_COUNT( m3.noalias() = (s1 * m1).adjoint() * s2 * m2, 0); @@ -62,7 +66,7 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( m3.noalias() -= (s1 * m1).template triangularView() * m2, 0); VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView() * (m2+m2), 1); VERIFY_EVALUATION_COUNT( rm3.noalias() = (s1 * m1.adjoint()).template triangularView() * m2.adjoint(), 0); - + VERIFY_EVALUATION_COUNT( m3.template triangularView() = (m1 * m2.adjoint()), 0); VERIFY_EVALUATION_COUNT( m3.template triangularView() -= (m1 * m2.adjoint()), 0); @@ -107,6 +111,22 @@ template void product_notemporary(const MatrixType& m) VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.col(0), 0 ); VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * rv1.adjoint(), 0 ); VERIFY_EVALUATION_COUNT( cvres.noalias() -= m1 * m2.row(0).transpose(), 0 ); + + VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * cv1, 0 ); + VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * cv1, 0 ); + VERIFY_EVALUATION_COUNT( cvres.noalias() = (m1+m1) * (m1*cv1), 1 ); + VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 ); + + // Check outer products + m3 = cv1 * rv1; + VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 ); + VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 ); + VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 ); + VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 ); + VERIFY_EVALUATION_COUNT( rm3.noalias() += (m1*cv1) * (rv1 * m1), 2 ); } void test_product_notemporary() diff --git a/test/product_small.cpp b/test/product_small.cpp index 091955a0f..c35db6f65 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -29,6 +29,153 @@ void product1x1() matAdynamic.cwiseProduct(matBdynamic.transpose()).sum() ); } +template +const TC& ref_prod(TC &C, const TA &A, const TB &B) +{ + for(Index i=0;i +typename internal::enable_if::type +test_lazy_single(int rows, int cols, int depth) +{ + Matrix A(rows,depth); A.setRandom(); + Matrix B(depth,cols); B.setRandom(); + Matrix C(rows,cols); C.setRandom(); + Matrix D(C); + VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B)); +} + +template +typename internal::enable_if< ( (Rows ==1&&Depth!=1&&OA==ColMajor) + || (Depth==1&&Rows !=1&&OA==RowMajor) + || (Cols ==1&&Depth!=1&&OB==RowMajor) + || (Depth==1&&Cols !=1&&OB==ColMajor) + || (Rows ==1&&Cols !=1&&OC==ColMajor) + || (Cols ==1&&Rows !=1&&OC==RowMajor)),void>::type 
+test_lazy_single(int, int, int) +{ +} + +template +void test_lazy_all_layout(int rows=Rows, int cols=Cols, int depth=Depth) +{ + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); + CALL_SUBTEST(( test_lazy_single(rows,cols,depth) )); +} + +template +void test_lazy_l1() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + int depth = internal::random(1,12); + + // Inner + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(1,1,depth) )); + + // Outer + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(4,cols) )); + CALL_SUBTEST(( test_lazy_all_layout(7,cols) )); + CALL_SUBTEST(( test_lazy_all_layout(rows) )); + CALL_SUBTEST(( test_lazy_all_layout(rows) )); + CALL_SUBTEST(( test_lazy_all_layout(rows,cols) )); +} + +template +void test_lazy_l2() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + int depth = internal::random(1,12); + + // mat-vec + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(rows) )); + CALL_SUBTEST(( test_lazy_all_layout(4,1,depth) )); + CALL_SUBTEST(( test_lazy_all_layout(rows,1,depth) )); + + // vec-mat + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(1,cols) )); + CALL_SUBTEST(( test_lazy_all_layout(1,4,depth) )); + CALL_SUBTEST(( test_lazy_all_layout(1,cols,depth) )); +} + +template +void test_lazy_l3() +{ + int rows = internal::random(1,12); + int cols = internal::random(1,12); + int depth = internal::random(1,12); + // mat-mat + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(rows) )); + CALL_SUBTEST(( test_lazy_all_layout(4,3,depth) )); + CALL_SUBTEST(( test_lazy_all_layout(rows,6,depth) )); + CALL_SUBTEST(( 
test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout() )); + CALL_SUBTEST(( test_lazy_all_layout(8,cols) )); + CALL_SUBTEST(( test_lazy_all_layout(3,4,depth) )); + CALL_SUBTEST(( test_lazy_all_layout(4,cols,depth) )); +} void test_product_small() { @@ -39,6 +186,22 @@ void test_product_small() CALL_SUBTEST_4( product(Matrix4d()) ); CALL_SUBTEST_5( product(Matrix4f()) ); CALL_SUBTEST_6( product1x1() ); + + CALL_SUBTEST_11( test_lazy_l1() ); + CALL_SUBTEST_12( test_lazy_l2() ); + CALL_SUBTEST_13( test_lazy_l3() ); + + CALL_SUBTEST_21( test_lazy_l1() ); + CALL_SUBTEST_22( test_lazy_l2() ); + CALL_SUBTEST_23( test_lazy_l3() ); + + CALL_SUBTEST_31( test_lazy_l1 >() ); + CALL_SUBTEST_32( test_lazy_l2 >() ); + CALL_SUBTEST_33( test_lazy_l3 >() ); + + CALL_SUBTEST_41( test_lazy_l1 >() ); + CALL_SUBTEST_42( test_lazy_l2 >() ); + CALL_SUBTEST_43( test_lazy_l3 >() ); } #ifdef EIGEN_TEST_PART_6 @@ -56,5 +219,16 @@ void test_product_small() VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); } + + { + Eigen::Matrix A, B, C; + A.setRandom(); + C = A; + for(int k=0; k<79; ++k) + C = C * A; + B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))) + * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); + VERIFY_IS_APPROX(B,C); + } #endif } diff --git a/test/product_trmm.cpp b/test/product_trmm.cpp index d715b9a36..12e554410 100644 --- a/test/product_trmm.cpp +++ b/test/product_trmm.cpp @@ -9,10 +9,18 @@ #include "main.h" +template +int get_random_size() +{ + const int factor = NumTraits::ReadCost; + const int max_test_size = EIGEN_TEST_MAX_SIZE>2*factor ? 
EIGEN_TEST_MAX_SIZE/factor : EIGEN_TEST_MAX_SIZE; + return internal::random(1,max_test_size); +} + template -void trmm(int rows=internal::random(1,EIGEN_TEST_MAX_SIZE), - int cols=internal::random(1,EIGEN_TEST_MAX_SIZE), - int otherCols = OtherCols==Dynamic?internal::random(1,EIGEN_TEST_MAX_SIZE):OtherCols) +void trmm(int rows=get_random_size(), + int cols=get_random_size(), + int otherCols = OtherCols==Dynamic?get_random_size():OtherCols) { typedef Matrix TriMatrix; typedef Matrix OnTheRight; @@ -42,13 +50,13 @@ void trmm(int rows=internal::random(1,EIGEN_TEST_MAX_SIZE), VERIFY_IS_APPROX( ge_xs.noalias() = mat.template triangularView() * ge_right, tri * ge_right); VERIFY_IS_APPROX( ge_sx.noalias() = ge_left * mat.template triangularView(), ge_left * tri); - + VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView() * (s2*ge_left.transpose()), s1*triTr.conjugate() * (s2*ge_left.transpose())); VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.transpose() * mat.adjoint().template triangularView(), ge_right.transpose() * triTr.conjugate()); VERIFY_IS_APPROX( ge_xs.noalias() = (s1*mat.adjoint()).template triangularView() * (s2*ge_left.adjoint()), s1*triTr.conjugate() * (s2*ge_left.adjoint())); VERIFY_IS_APPROX( ge_sx.noalias() = ge_right.adjoint() * mat.adjoint().template triangularView(), ge_right.adjoint() * triTr.conjugate()); - + ge_xs_save = ge_xs; VERIFY_IS_APPROX( (ge_xs_save + s1*triTr.conjugate() * (s2*ge_left.adjoint())).eval(), ge_xs.noalias() += (s1*mat.adjoint()).template triangularView() * (s2*ge_left.adjoint()) ); ge_sx.setRandom(); @@ -61,13 +69,13 @@ void trmm(int rows=internal::random(1,EIGEN_TEST_MAX_SIZE), } template -void trmv(int rows=internal::random(1,EIGEN_TEST_MAX_SIZE), int cols=internal::random(1,EIGEN_TEST_MAX_SIZE)) +void trmv(int rows=get_random_size(), int cols=get_random_size()) { trmm(rows,cols,1); } template -void trmm(int rows=internal::random(1,EIGEN_TEST_MAX_SIZE), int cols=internal::random(1,EIGEN_TEST_MAX_SIZE), int otherCols = internal::random(1,EIGEN_TEST_MAX_SIZE)) +void trmm(int rows=get_random_size(), int cols=get_random_size(), int otherCols = get_random_size()) { trmm(rows,cols,otherCols); } diff --git a/test/rand.cpp b/test/rand.cpp index 7c8068a3b..6790acf15 100644 --- a/test/rand.cpp +++ b/test/rand.cpp @@ -35,8 +35,8 @@ template void check_all_in_range(Scalar x, Scalar y) void test_rand() { long long_ref = NumTraits::highest()/10; - char char_offset = (std::min)(g_repeat,64); - char short_offset = (std::min)(g_repeat,16000); + signed char char_offset = (std::min)(g_repeat,64); + signed char short_offset = (std::min)(g_repeat,16000); for(int i = 0; i < g_repeat*10; i++) { CALL_SUBTEST(check_in_range(10,11)); @@ -57,13 +57,13 @@ void test_rand() CALL_SUBTEST(check_in_range(-long_ref,long_ref)); } - CALL_SUBTEST(check_all_in_range(11,11)); - CALL_SUBTEST(check_all_in_range(11,11+char_offset)); - CALL_SUBTEST(check_all_in_range(-5,5)); - CALL_SUBTEST(check_all_in_range(-11-char_offset,-11)); - CALL_SUBTEST(check_all_in_range(-126,-126+char_offset)); - CALL_SUBTEST(check_all_in_range(126-char_offset,126)); - CALL_SUBTEST(check_all_in_range(-126,126)); + CALL_SUBTEST(check_all_in_range(11,11)); + CALL_SUBTEST(check_all_in_range(11,11+char_offset)); + CALL_SUBTEST(check_all_in_range(-5,5)); + CALL_SUBTEST(check_all_in_range(-11-char_offset,-11)); + CALL_SUBTEST(check_all_in_range(-126,-126+char_offset)); + CALL_SUBTEST(check_all_in_range(126-char_offset,126)); + CALL_SUBTEST(check_all_in_range(-126,126)); 
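An aside on the rand.cpp hunk above, which swaps char for signed char: whether plain char can represent negative values such as -126 is implementation-defined, and it is unsigned by default on common ARM and PowerPC ABIs. A standalone sketch (not part of the patch) to probe a toolchain:

#include <iostream>
#include <limits>

int main()
{
  // Typically 1 on x86 toolchains and 0 on ARM defaults, where a range
  // like [-126,126] silently collapses for plain char.
  std::cout << "char is signed: " << std::numeric_limits<char>::is_signed << '\n';
  std::cout << "char min: " << static_cast<int>(std::numeric_limits<char>::min()) << '\n';
}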
CALL_SUBTEST(check_all_in_range(11,11)); CALL_SUBTEST(check_all_in_range(11,11+short_offset)); diff --git a/test/redux.cpp b/test/redux.cpp index 0d176e500..6ddc59c18 100644 --- a/test/redux.cpp +++ b/test/redux.cpp @@ -2,11 +2,14 @@ // for linear algebra. // // Copyright (C) 2008 Benoit Jacob +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#define TEST_ENABLE_TEMPORARY_TRACKING + #include "main.h" template void matrixRedux(const MatrixType& m) @@ -21,7 +24,7 @@ template void matrixRedux(const MatrixType& m) MatrixType m1 = MatrixType::Random(rows, cols); // The entries of m1 are uniformly distributed in [0,1], so m1.prod() is very small. This may lead to test - // failures if we underflow into denormals. Thus, we scale so that entires are close to 1. + // failures if we underflow into denormals. Thus, we scale so that entries are close to 1. MatrixType m1_for_prod = MatrixType::Ones(rows, cols) + RealScalar(0.2) * m1; VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1)); @@ -53,10 +56,24 @@ template void matrixRedux(const MatrixType& m) VERIFY_IS_APPROX(m1_for_prod.block(r0,c0,r1,c1).prod(), m1_for_prod.block(r0,c0,r1,c1).eval().prod()); VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().minCoeff(), m1.block(r0,c0,r1,c1).real().eval().minCoeff()); VERIFY_IS_APPROX(m1.block(r0,c0,r1,c1).real().maxCoeff(), m1.block(r0,c0,r1,c1).real().eval().maxCoeff()); + + // regression for bug 1090 + const int R1 = MatrixType::RowsAtCompileTime>=2 ? MatrixType::RowsAtCompileTime/2 : 6; + const int C1 = MatrixType::ColsAtCompileTime>=2 ? MatrixType::ColsAtCompileTime/2 : 6; + if(R1<=rows-r0 && C1<=cols-c0) + { + VERIFY_IS_APPROX( (m1.template block(r0,c0).sum()), m1.block(r0,c0,R1,C1).sum() ); + } // test empty objects VERIFY_IS_APPROX(m1.block(r0,c0,0,0).sum(), Scalar(0)); VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(), Scalar(1)); + + // test nesting complex expression + VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) ); + Matrix m2(rows,rows); + m2.setRandom(); + VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 
1 : 0) ); } template void vectorRedux(const VectorType& w) diff --git a/test/ref.cpp b/test/ref.cpp index 1341dfef7..769db0414 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -18,6 +18,18 @@ // test Ref.h +// Deal with i387 extended precision +#if EIGEN_ARCH_i386 && !(EIGEN_ARCH_x86_64) + +#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(4,4) +#pragma GCC optimize ("-ffloat-store") +#else +#undef VERIFY_IS_EQUAL +#define VERIFY_IS_EQUAL(X,Y) VERIFY_IS_APPROX(X,Y) +#endif + +#endif + template void ref_matrix(const MatrixType& m) { typedef typename MatrixType::Index Index; @@ -55,7 +67,6 @@ template void ref_matrix(const MatrixType& m) rm2 = m2.block(i,j,brows,bcols); VERIFY_IS_EQUAL(m1, m2); - ConstRefDynMat rm3 = m1.block(i,j,brows,bcols); m1.block(i,j,brows,bcols) *= 2; m2.block(i,j,brows,bcols) *= 2; diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp index 5e869790f..deb78e44e 100644 --- a/test/schur_complex.cpp +++ b/test/schur_complex.cpp @@ -25,7 +25,7 @@ template void schur(int size = MatrixType::ColsAtCompileTim ComplexMatrixType T = schurOfA.matrixT(); for(int row = 1; row < size; ++row) { for(int col = 0; col < row; ++col) { - VERIFY(T(row,col) == (typename MatrixType::Scalar)0); + VERIFY(T(row,col) == (typename MatrixType::Scalar)0); } } VERIFY_IS_APPROX(A.template cast(), U * T * U.adjoint()); @@ -70,7 +70,7 @@ template void schur(int size = MatrixType::ColsAtCompileTim VERIFY_IS_EQUAL(cs1.matrixT(), csOnlyT.matrixT()); VERIFY_RAISES_ASSERT(csOnlyT.matrixU()); - if (size > 1) + if (size > 1 && size < 20) { // Test matrix with NaN A(0,0) = std::numeric_limits::quiet_NaN(); diff --git a/test/schur_real.cpp b/test/schur_real.cpp index 36b9c24d1..cfe4570d4 100644 --- a/test/schur_real.cpp +++ b/test/schur_real.cpp @@ -91,7 +91,7 @@ template void schur(int size = MatrixType::ColsAtCompileTim VERIFY_IS_EQUAL(rs1.matrixT(), rsOnlyT.matrixT()); VERIFY_RAISES_ASSERT(rsOnlyT.matrixU()); - if (size > 2) + if (size > 2 && size < 20) { // Test matrix with NaN A(0,0) = std::numeric_limits::quiet_NaN(); diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 492b3a4f2..d803e7dae 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -188,6 +188,8 @@ template void sparse_basic(const SparseMatrixType& re refM4.setRandom(); // sparse cwise* dense VERIFY_IS_APPROX(m3.cwiseProduct(refM4), refM3.cwiseProduct(refM4)); + // dense cwise* sparse + VERIFY_IS_APPROX(refM4.cwiseProduct(m3), refM4.cwiseProduct(refM3)); // VERIFY_IS_APPROX(m3.cwise()/refM4, refM3.cwise()/refM4); // test aliasing @@ -219,10 +221,10 @@ template void sparse_basic(const SparseMatrixType& re refM2.setZero(); int countFalseNonZero = 0; int countTrueNonZero = 0; - for (Index j=0; j(0,1); if (x<0.1) @@ -232,22 +234,21 @@ template void sparse_basic(const SparseMatrixType& re else if (x<0.5) { countFalseNonZero++; - m2.insertBackByOuterInner(j,i) = Scalar(0); + m2.insert(i,j) = Scalar(0); } else { countTrueNonZero++; - m2.insertBackByOuterInner(j,i) = Scalar(1); - if(SparseMatrixType::IsRowMajor) - refM2(j,i) = Scalar(1); - else - refM2(i,j) = Scalar(1); + m2.insert(i,j) = Scalar(1); + refM2(i,j) = Scalar(1); } } } - m2.finalize(); + if(internal::random()) + m2.makeCompressed(); VERIFY(countFalseNonZero+countTrueNonZero == m2.nonZeros()); - VERIFY_IS_APPROX(m2, refM2); + if(countTrueNonZero>0) + VERIFY_IS_APPROX(m2, refM2); m2.prune(Scalar(1)); VERIFY(countTrueNonZero==m2.nonZeros()); VERIFY_IS_APPROX(m2, refM2); @@ -259,19 +260,33 @@ template void sparse_basic(const SparseMatrixType& re std::vector 
triplets; Index ntriplets = rows*cols; triplets.reserve(ntriplets); - DenseMatrix refMat(rows,cols); - refMat.setZero(); + DenseMatrix refMat_sum = DenseMatrix::Zero(rows,cols); + DenseMatrix refMat_prod = DenseMatrix::Zero(rows,cols); + DenseMatrix refMat_last = DenseMatrix::Zero(rows,cols); + for(Index i=0;i<ntriplets;++i) { StorageIndex r = internal::random<StorageIndex>(0,StorageIndex(rows-1)); StorageIndex c = internal::random(0,StorageIndex(cols-1)); Scalar v = internal::random(); triplets.push_back(TripletType(r,c,v)); - refMat(r,c) += v; + refMat_sum(r,c) += v; + if(std::abs(refMat_prod(r,c))==0) + refMat_prod(r,c) = v; + else + refMat_prod(r,c) *= v; + refMat_last(r,c) = v; } SparseMatrixType m(rows,cols); m.setFromTriplets(triplets.begin(), triplets.end()); - VERIFY_IS_APPROX(m, refMat); + VERIFY_IS_APPROX(m, refMat_sum); + + m.setFromTriplets(triplets.begin(), triplets.end(), std::multiplies()); + VERIFY_IS_APPROX(m, refMat_prod); +#if (defined(__cplusplus) && __cplusplus >= 201103L) + m.setFromTriplets(triplets.begin(), triplets.end(), [] (Scalar,Scalar b) { return b; }); + VERIFY_IS_APPROX(m, refMat_last); +#endif } // test Map @@ -325,6 +340,10 @@ template void sparse_basic(const SparseMatrixType& re refMat3 = refMat2.template triangularView(); m3 = m2.template triangularView(); VERIFY_IS_APPROX(m3, refMat3); + + // check sparse-triangular to dense + refMat3 = m2.template triangularView(); + VERIFY_IS_APPROX(refMat3, DenseMatrix(refMat2.template triangularView())); } // test selfadjointView @@ -421,6 +440,20 @@ template void sparse_basic(const SparseMatrixType& re SparseMatrixType m1(rows, rows); m1.setIdentity(); VERIFY_IS_APPROX(m1, refMat1); + for(int k=0; k<rows*rows/4; ++k) + { + Index i = internal::random<Index>(0,rows-1); + Index j = internal::random(0,rows-1); + Scalar v = internal::random(); + m1.coeffRef(i,j) = v; + refMat1.coeffRef(i,j) = v; + VERIFY_IS_APPROX(m1, refMat1); + if(internal::random(0,10)<2) + m1.makeCompressed(); + } + m1.setIdentity(); + refMat1.setIdentity(); + VERIFY_IS_APPROX(m1, refMat1); } } @@ -480,4 +513,19 @@ void test_sparse_basic() // Regression test for bug 900: (manually insert higher values here, if you have enough RAM): CALL_SUBTEST_3((big_sparse_triplet >(10000, 10000, 0.125))); CALL_SUBTEST_4((big_sparse_triplet >(10000, 10000, 0.125))); + + // Regression test for bug 1105 +#ifdef EIGEN_TEST_PART_6 + { + int n = Eigen::internal::random(200,600); + SparseMatrix<std::complex<double>,0, long> mat(n, n); + std::complex<double> val; + + for(int i=0; i<n; ++i) + { + mat.coeffRef(0, i) = val; + VERIFY(mat.data().allocatedSize() < 20*n); + } + } +#endif } diff --git a/test/sparse_permutations.cpp b/test/sparse_permutations.cpp --- a/test/sparse_permutations.cpp +++ b/test/sparse_permutations.cpp @@ -2,11 +2,43 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2011 Gael Guennebaud +// Copyright (C) 2011-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
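The sparse_permutations.cpp additions that follow rely on a counting hook: the library expands a user-supplied plugin macro (EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN) at the instrumented code path, and a VERIFY macro resets and checks a global counter around the expression under test. The same pattern in miniature, with illustrative names (Tracked and ON_COPY_HOOK are ours, not Eigen's):

#include <iostream>

static long nb_copies = 0;
#define ON_COPY_HOOK { ++nb_copies; } // the library-side hook expands here

struct Tracked
{
  Tracked() {}
  Tracked(const Tracked&) ON_COPY_HOOK // instrumented copy path
};

#define VERIFY_COPY_COUNT(XPR, N) { \
  nb_copies = 0; \
  XPR; \
  if(nb_copies != (N)) std::cerr << "nb_copies == " << nb_copies << "\n"; \
}

int main()
{
  Tracked a;
  VERIFY_COPY_COUNT(Tracked b(a), 1); // exactly one copy expected
  return 0;
}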
+ +static long int nb_transposed_copies; +#define EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN {nb_transposed_copies++;} +#define VERIFY_TRANSPOSITION_COUNT(XPR,N) {\ + nb_transposed_copies = 0; \ + XPR; \ + if(nb_transposed_copies!=N) std::cerr << "nb_transposed_copies == " << nb_transposed_copies << "\n"; \ + VERIFY( (#XPR) && nb_transposed_copies==N ); \ + } + #include "sparse.h" +template +bool is_sorted(const T& mat) { + for(Index k = 0; k=it.index()) + return false; + prev = it.index(); + } + } + return true; +} + +template +typename internal::nested_eval::type eval(const T &xpr) +{ + VERIFY( int(internal::nested_eval::type::Flags&RowMajorBit) == int(internal::evaluator::Flags&RowMajorBit) ); + return xpr; +} + template void sparse_permutations(const SparseMatrixType& ref) { const Index rows = ref.rows(); @@ -18,6 +50,8 @@ template void sparse_permutations(c typedef SparseMatrix OtherSparseMatrixType; typedef Matrix DenseMatrix; typedef Matrix VectorI; +// bool IsRowMajor1 = SparseMatrixType::IsRowMajor; +// bool IsRowMajor2 = OtherSparseMatrixType::IsRowMajor; double density = (std::max)(8./(rows*cols), 0.01); @@ -42,58 +76,69 @@ template void sparse_permutations(c randomPermutationVector(pi, cols); p.indices() = pi; - res = mat*p; + VERIFY( is_sorted( ::eval(mat*p) )); + VERIFY( is_sorted( res = mat*p )); + VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p), 0); + //VERIFY_TRANSPOSITION_COUNT( res = mat*p, IsRowMajor ? 1 : 0 ); res_d = mat_d*p; VERIFY(res.isApprox(res_d) && "mat*p"); - res = p*mat; + VERIFY( is_sorted( ::eval(p*mat) )); + VERIFY( is_sorted( res = p*mat )); + VERIFY_TRANSPOSITION_COUNT( ::eval(p*mat), 0); res_d = p*mat_d; VERIFY(res.isApprox(res_d) && "p*mat"); - res = mat*p.inverse(); + VERIFY( is_sorted( (mat*p).eval() )); + VERIFY( is_sorted( res = mat*p.inverse() )); + VERIFY_TRANSPOSITION_COUNT( ::eval(mat*p.inverse()), 0); res_d = mat*p.inverse(); VERIFY(res.isApprox(res_d) && "mat*inv(p)"); - res = p.inverse()*mat; + VERIFY( is_sorted( (p*mat+p*mat).eval() )); + VERIFY( is_sorted( res = p.inverse()*mat )); + VERIFY_TRANSPOSITION_COUNT( ::eval(p.inverse()*mat), 0); res_d = p.inverse()*mat_d; VERIFY(res.isApprox(res_d) && "inv(p)*mat"); - res = mat.twistedBy(p); + VERIFY( is_sorted( (p * mat * p.inverse()).eval() )); + VERIFY( is_sorted( res = mat.twistedBy(p) )); + VERIFY_TRANSPOSITION_COUNT( ::eval(p * mat * p.inverse()), 0); res_d = (p * mat_d) * p.inverse(); VERIFY(res.isApprox(res_d) && "p*mat*inv(p)"); - res = mat.template selfadjointView().twistedBy(p_null); + VERIFY( is_sorted( res = mat.template selfadjointView().twistedBy(p_null) )); res_d = up_sym_d; VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full"); - res = mat.template selfadjointView().twistedBy(p_null); + VERIFY( is_sorted( res = mat.template selfadjointView().twistedBy(p_null) )); res_d = lo_sym_d; VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full"); - res = up.template selfadjointView().twistedBy(p_null); + VERIFY( is_sorted( res = up.template selfadjointView().twistedBy(p_null) )); res_d = up_sym_d; VERIFY(res.isApprox(res_d) && "upper selfadjoint to full"); - res = lo.template selfadjointView().twistedBy(p_null); + VERIFY( is_sorted( res = lo.template selfadjointView().twistedBy(p_null) )); res_d = lo_sym_d; VERIFY(res.isApprox(res_d) && "lower selfadjoint full"); - res = mat.template selfadjointView(); + VERIFY( is_sorted( res = mat.template selfadjointView() )); res_d = up_sym_d; VERIFY(res.isApprox(res_d) && "full selfadjoint upper to full"); - res = mat.template selfadjointView(); 
+ VERIFY( is_sorted( res = mat.template selfadjointView() )); res_d = lo_sym_d; VERIFY(res.isApprox(res_d) && "full selfadjoint lower to full"); - res = up.template selfadjointView(); + VERIFY( is_sorted( res = up.template selfadjointView() )); res_d = up_sym_d; VERIFY(res.isApprox(res_d) && "upper selfadjoint to full"); - res = lo.template selfadjointView(); + VERIFY( is_sorted( res = lo.template selfadjointView() )); res_d = lo_sym_d; VERIFY(res.isApprox(res_d) && "lower selfadjoint full"); @@ -150,19 +195,19 @@ template void sparse_permutations(c VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to lower"); - res = mat.template selfadjointView().twistedBy(p); + VERIFY( is_sorted( res = mat.template selfadjointView().twistedBy(p) )); res_d = (p * up_sym_d) * p.inverse(); VERIFY(res.isApprox(res_d) && "full selfadjoint upper twisted to full"); - res = mat.template selfadjointView().twistedBy(p); + VERIFY( is_sorted( res = mat.template selfadjointView().twistedBy(p) )); res_d = (p * lo_sym_d) * p.inverse(); VERIFY(res.isApprox(res_d) && "full selfadjoint lower twisted to full"); - res = up.template selfadjointView().twistedBy(p); + VERIFY( is_sorted( res = up.template selfadjointView().twistedBy(p) )); res_d = (p * up_sym_d) * p.inverse(); VERIFY(res.isApprox(res_d) && "upper selfadjoint twisted to full"); - res = lo.template selfadjointView().twistedBy(p); + VERIFY( is_sorted( res = lo.template selfadjointView().twistedBy(p) )); res_d = (p * lo_sym_d) * p.inverse(); VERIFY(res.isApprox(res_d) && "lower selfadjoint twisted to full"); } @@ -182,4 +227,10 @@ void test_sparse_permutations() CALL_SUBTEST_1(( sparse_permutations_all(s) )); CALL_SUBTEST_2(( sparse_permutations_all >(s) )); } + + VERIFY((internal::is_same,OnTheRight,false,SparseShape>::ReturnType, + internal::nested_eval,PermutationMatrix,AliasFreeProduct>,1>::type>::value)); + + VERIFY((internal::is_same,OnTheLeft,false,SparseShape>::ReturnType, + internal::nested_eval,SparseMatrix,AliasFreeProduct>,1>::type>::value)); } diff --git a/test/sparse_product.cpp b/test/sparse_product.cpp index f1e5b8e4c..7ec5270e8 100644 --- a/test/sparse_product.cpp +++ b/test/sparse_product.cpp @@ -76,6 +76,21 @@ template void sparse_product() VERIFY_IS_APPROX(m4=(m2t.transpose()*m3t.transpose()).pruned(0), refMat4=refMat2t.transpose()*refMat3t.transpose()); VERIFY_IS_APPROX(m4=(m2*m3t.transpose()).pruned(0), refMat4=refMat2*refMat3t.transpose()); + // dense ?= sparse * sparse + VERIFY_IS_APPROX(dm4 =m2*m3, refMat4 =refMat2*refMat3); + VERIFY_IS_APPROX(dm4+=m2*m3, refMat4+=refMat2*refMat3); + VERIFY_IS_APPROX(dm4-=m2*m3, refMat4-=refMat2*refMat3); + VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3, refMat4 =refMat2t.transpose()*refMat3); + VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3, refMat4+=refMat2t.transpose()*refMat3); + VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3, refMat4-=refMat2t.transpose()*refMat3); + VERIFY_IS_APPROX(dm4 =m2t.transpose()*m3t.transpose(), refMat4 =refMat2t.transpose()*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4+=m2t.transpose()*m3t.transpose(), refMat4+=refMat2t.transpose()*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4-=m2t.transpose()*m3t.transpose(), refMat4-=refMat2t.transpose()*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4 =m2*m3t.transpose(), refMat4 =refMat2*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4+=m2*m3t.transpose(), refMat4+=refMat2*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4-=m2*m3t.transpose(), refMat4-=refMat2*refMat3t.transpose()); + VERIFY_IS_APPROX(dm4 = m2*m3*s1, refMat4 = refMat2*refMat3*s1); + // test 
   m4 = m2; refMat4 = refMat2;
   VERIFY_IS_APPROX(m4=m4*m3, refMat4=refMat4*refMat3);
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index d173ee658..f4aefbb48 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -26,7 +26,7 @@ inline void on_temporary_creation() {
 #define VERIFY_EVALUATION_COUNT(XPR,N) {\
     nb_temporaries = 0; \
-    XPR; \
+    CALL_SUBTEST( XPR ); \
     if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \
     VERIFY( (#XPR) && nb_temporaries==N ); \
   }
@@ -53,10 +53,14 @@ EIGEN_DONT_INLINE void call_ref_3(const Ref<const SparseMatrix<float>, StandardCompressedFormat>& a, const B &b)
   VERIFY_IS_EQUAL(a.toDense(),b.toDense());
 }
 
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_4(Ref<SparseVector<float> > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
+template<typename B>
+EIGEN_DONT_INLINE void call_ref_5(const Ref<const SparseVector<float> >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); }
+
 void call_ref()
 {
-//   SparseVector<std::complex<float> > ca = VectorXcf::Random(10).sparseView();
-//   SparseVector<float> a = VectorXf::Random(10).sparseView();
   SparseMatrix<float>          A = MatrixXf::Random(10,10).sparseView(0.5,1);
   SparseMatrix<float,RowMajor> B = MatrixXf::Random(10,10).sparseView(0.5,1);
   SparseMatrix<float>          C = MatrixXf::Random(10,10).sparseView(0.5,1);
@@ -64,6 +68,9 @@ void call_ref()
   const SparseMatrix<float>& Ac(A);
   Block<SparseMatrix<float> > Ab(A,0,1, 3,3);
   const Block<const SparseMatrix<float> > Abc(A,0,1,3,3);
+  SparseVector<float>          vc = VectorXf::Random(10).sparseView(0.5,1);
+  SparseVector<float,RowMajor> vr = VectorXf::Random(10).sparseView(0.5,1);
+  SparseMatrix<float> AA = A*A;
 
   VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0);
@@ -80,8 +87,8 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_3(B, B), 1);
   VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0);
   VERIFY_EVALUATION_COUNT( call_ref_3(B.transpose(), B.transpose()), 0);
-  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1);
-  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, A*A), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_2(A*A, AA), 1);
+  VERIFY_EVALUATION_COUNT( call_ref_3(A*A, AA), 1);
   VERIFY(!C.isCompressed());
   VERIFY_EVALUATION_COUNT( call_ref_3(C, C), 1);
@@ -103,8 +110,20 @@ void call_ref()
   VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0);
   VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vc, vc), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr.transpose(), vr.transpose()), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_2(vr, vr.transpose()), 0);
   VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only)
+
+  VERIFY_EVALUATION_COUNT( call_ref_4(vc, vc), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(vr, vr.transpose()), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vc, vc), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(vr, vr.transpose()), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_4(A.col(2), A.col(2)), 0);
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.col(2), A.col(2)), 0);
+  // VERIFY_EVALUATION_COUNT( call_ref_4(A.row(2), A.row(2).transpose()), 1); // does not compile on purpose
+  VERIFY_EVALUATION_COUNT( call_ref_5(A.row(2), A.row(2).transpose()), 1);
 }
 
 void test_sparse_ref()
@@ -113,5 +132,8 @@ void test_sparse_ref()
   CALL_SUBTEST_1( check_const_correctness(SparseMatrix<float>()) );
   CALL_SUBTEST_1( check_const_correctness(SparseMatrix<double,RowMajor>()) );
   CALL_SUBTEST_2( call_ref() );
+
+  CALL_SUBTEST_3( check_const_correctness(SparseVector<float>()) );
+  CALL_SUBTEST_3( check_const_correctness(SparseVector<double,RowMajor>()) );
   }
 }
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index a0254ff1c..b67653496 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -63,32 +63,47 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, const Rhs& b, const DenseMat& dA, const DenseRhs& db)
     VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
   }
 
-  // test initialization ctor
+  // if not too large, do some extra checks:
+  if(A.rows()<2000)
   {
-    Rhs x(b.rows(), b.cols());
-    Solver solver2(A);
-    VERIFY(solver2.info() == Success);
-    x = solver2.solve(b);
-    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
-  }
-
-  // test dense Block as the result and rhs:
-  {
-    DenseRhs x(refX.rows(), refX.cols());
-    DenseRhs oldb(db);
-    x.setZero();
-    x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
-    VERIFY(oldb.isApprox(db) && "sparse solver testing: the rhs should not be modified!");
-    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
-  }
-
-  // test uncompressed inputs
-  {
-    Mat A2 = A;
-    A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<StorageIndex>().eval());
-    solver.compute(A2);
-    Rhs x = solver.solve(b);
-    VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    // test initialization ctor
+    {
+      Rhs x(b.rows(), b.cols());
+      Solver solver2(A);
+      VERIFY(solver2.info() == Success);
+      x = solver2.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test dense Block as the result and rhs:
+    {
+      DenseRhs x(refX.rows(), refX.cols());
+      DenseRhs oldb(db);
+      x.setZero();
+      x.block(0,0,x.rows(),x.cols()) = solver.solve(db.block(0,0,db.rows(),db.cols()));
+      VERIFY(oldb.isApprox(db) && "sparse solver testing: the rhs should not be modified!");
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test uncompressed inputs
+    {
+      Mat A2 = A;
+      A2.reserve((ArrayXf::Random(A.outerSize())+2).template cast<StorageIndex>().eval());
+      solver.compute(A2);
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+    }
+
+    // test expression as input
+    {
+      solver.compute(0.5*(A+A));
+      Rhs x = solver.solve(b);
+      VERIFY(x.isApprox(refX,test_precision<Scalar>()));
+
+      Solver solver2(0.5*(A+A));
+      Rhs x2 = solver2.solve(b);
+      VERIFY(x2.isApprox(refX,test_precision<Scalar>()));
+    }
   }
 }
diff --git a/test/sparselu.cpp b/test/sparselu.cpp
index 78615ff3b..bd000baf1 100644
--- a/test/sparselu.cpp
+++ b/test/sparselu.cpp
@@ -3,25 +3,9 @@
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 // SparseLU solve does not accept column major matrices for the destination.
// However, as expected, the generic check_sparse_square_solving routines produces row-major diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 014cc834b..e2f03ffca 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -157,7 +157,9 @@ void unalignedassert() VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + // Complexes are disabled because the compiler might aggressively vectorize + // the initialization of complex coeffs to 0 before we can check for alignedness + //VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); } for(int b=8; b(b)); if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); + //if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); } #endif } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 6ff38ed11..35fbb9781 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -1,45 +1,22 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#undef EIGEN_DEFAULT_TO_ROW_MAJOR +#endif #define EIGEN_DEBUG_ASSIGN #include "main.h" #include -std::string demangle_traversal(int t) -{ - if(t==DefaultTraversal) return "DefaultTraversal"; - if(t==LinearTraversal) return "LinearTraversal"; - if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal"; - if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal"; - if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal"; - return "?"; -} -std::string demangle_unrolling(int t) -{ - if(t==NoUnrolling) return "NoUnrolling"; - if(t==InnerUnrolling) return "InnerUnrolling"; - if(t==CompleteUnrolling) return "CompleteUnrolling"; - return "?"; -} -std::string demangle_flags(int f) -{ - std::string res; - if(f&RowMajorBit) res += " | RowMajor"; - if(f&PacketAccessBit) res += " | Packet"; - if(f&LinearAccessBit) res += " | Linear"; - if(f&LvalueBit) res += " | Lvalue"; - if(f&DirectAccessBit) res += " | Direct"; - if(f&NestByRefBit) res += " | NestByRef"; - if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; - - return res; -} +using internal::demangle_flags; +using internal::demangle_traversal; +using internal::demangle_unrolling; template bool test_assign(const Dst&, const Src&, int traversal, int unrolling) diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 03f50bb5a..87476f95b 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -2,11 +2,13 @@ // for linear algebra. // // Copyright (C) 2011 Benoit Jacob +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
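// Editor's note (annotation, not part of the patch): TEST_ENABLE_TEMPORARY_TRACKING,
// defined just below, enables temporary counting in the test harness (main.h), so
// that VERIFY_EVALUATION_COUNT(XPR,N) can assert an exact number of temporaries.
// For example, taken from the checks added later in this file:
//   VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1 );
// the product has to be evaluated into one temporary before the partial reduction
// when the matrix size is dynamic.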
+#define TEST_ENABLE_TEMPORARY_TRACKING #define EIGEN_NO_STATIC_ASSERT #include "main.h" @@ -156,16 +158,22 @@ template void vectorwiseop_matrix(const MatrixType& m) VERIFY_IS_APPROX(m2, m1.colwise() + colvec); VERIFY_IS_APPROX(m2.col(c), m1.col(c) + colvec); - VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose()); - VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose()); + if(rows>1) + { + VERIFY_RAISES_ASSERT(m2.colwise() += colvec.transpose()); + VERIFY_RAISES_ASSERT(m1.colwise() + colvec.transpose()); + } m2 = m1; m2.rowwise() += rowvec; VERIFY_IS_APPROX(m2, m1.rowwise() + rowvec); VERIFY_IS_APPROX(m2.row(r), m1.row(r) + rowvec); - VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose()); - VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose()); + if(cols>1) + { + VERIFY_RAISES_ASSERT(m2.rowwise() += rowvec.transpose()); + VERIFY_RAISES_ASSERT(m1.rowwise() + rowvec.transpose()); + } // test substraction @@ -174,16 +182,22 @@ template void vectorwiseop_matrix(const MatrixType& m) VERIFY_IS_APPROX(m2, m1.colwise() - colvec); VERIFY_IS_APPROX(m2.col(c), m1.col(c) - colvec); - VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose()); - VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose()); + if(rows>1) + { + VERIFY_RAISES_ASSERT(m2.colwise() -= colvec.transpose()); + VERIFY_RAISES_ASSERT(m1.colwise() - colvec.transpose()); + } m2 = m1; m2.rowwise() -= rowvec; VERIFY_IS_APPROX(m2, m1.rowwise() - rowvec); VERIFY_IS_APPROX(m2.row(r), m1.row(r) - rowvec); - VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose()); - VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose()); + if(cols>1) + { + VERIFY_RAISES_ASSERT(m2.rowwise() -= rowvec.transpose()); + VERIFY_RAISES_ASSERT(m1.rowwise() - rowvec.transpose()); + } // test norm rrres = m1.colwise().norm(); @@ -191,6 +205,11 @@ template void vectorwiseop_matrix(const MatrixType& m) rcres = m1.rowwise().norm(); VERIFY_IS_APPROX(rcres(r), m1.row(r).norm()); + VERIFY_IS_APPROX(m1.cwiseAbs().colwise().sum(), m1.colwise().template lpNorm<1>()); + VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().sum(), m1.rowwise().template lpNorm<1>()); + VERIFY_IS_APPROX(m1.cwiseAbs().colwise().maxCoeff(), m1.colwise().template lpNorm()); + VERIFY_IS_APPROX(m1.cwiseAbs().rowwise().maxCoeff(), m1.rowwise().template lpNorm()); + // test normalized m2 = m1.colwise().normalized(); VERIFY_IS_APPROX(m2.col(c), m1.col(c).normalized()); @@ -204,14 +223,27 @@ template void vectorwiseop_matrix(const MatrixType& m) m2 = m1; m2.rowwise().normalize(); VERIFY_IS_APPROX(m2.row(r), m1.row(r).normalized()); + + // test with partial reduction of products + Matrix m1m1 = m1 * m1.transpose(); + VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum()); + Matrix tmp(rows); + VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), (MatrixType::RowsAtCompileTime==Dynamic ? 1 : 0)); + + m2 = m1.rowwise() - (m1.colwise().sum()/m1.rows()).eval(); + m1 = m1.rowwise() - (m1.colwise().sum()/m1.rows()); + VERIFY_IS_APPROX( m1, m2 ); + VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/m1.rows()), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 
1 : 0) ); } void test_vectorwiseop() { - CALL_SUBTEST_1(vectorwiseop_array(Array22cd())); - CALL_SUBTEST_2(vectorwiseop_array(Array())); - CALL_SUBTEST_3(vectorwiseop_array(ArrayXXf(3, 4))); - CALL_SUBTEST_4(vectorwiseop_matrix(Matrix4cf())); - CALL_SUBTEST_5(vectorwiseop_matrix(Matrix())); - CALL_SUBTEST_6(vectorwiseop_matrix(MatrixXd(7,2))); + CALL_SUBTEST_1( vectorwiseop_array(Array22cd()) ); + CALL_SUBTEST_2( vectorwiseop_array(Array()) ); + CALL_SUBTEST_3( vectorwiseop_array(ArrayXXf(3, 4)) ); + CALL_SUBTEST_4( vectorwiseop_matrix(Matrix4cf()) ); + CALL_SUBTEST_5( vectorwiseop_matrix(Matrix()) ); + CALL_SUBTEST_6( vectorwiseop_matrix(MatrixXd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_7( vectorwiseop_matrix(VectorXd(internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_7( vectorwiseop_matrix(RowVectorXd(internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } diff --git a/test/visitor.cpp b/test/visitor.cpp index 39a5d6b5f..844170ec6 100644 --- a/test/visitor.cpp +++ b/test/visitor.cpp @@ -55,6 +55,11 @@ template void matrixVisitor(const MatrixType& p) VERIFY_IS_APPROX(maxc, eigen_maxc); VERIFY_IS_APPROX(minc, m.minCoeff()); VERIFY_IS_APPROX(maxc, m.maxCoeff()); + + eigen_maxc = (m.adjoint()*m).maxCoeff(&eigen_maxrow,&eigen_maxcol); + eigen_maxc = (m.adjoint()*m).eval().maxCoeff(&maxrow,&maxcol); + VERIFY(maxrow == eigen_maxrow); + VERIFY(maxcol == eigen_maxcol); } template void vectorVisitor(const VectorType& w) diff --git a/unsupported/Eigen/AdolcForward b/unsupported/Eigen/AdolcForward index 2627decd0..15f5f0731 100644 --- a/unsupported/Eigen/AdolcForward +++ b/unsupported/Eigen/AdolcForward @@ -25,7 +25,7 @@ #ifndef NUMBER_DIRECTIONS # define NUMBER_DIRECTIONS 2 #endif -#include +#include // adolc defines some very stupid macros: #if defined(malloc) diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt index 6faf4585d..6d0cf4f9d 100644 --- a/unsupported/Eigen/CMakeLists.txt +++ b/unsupported/Eigen/CMakeLists.txt @@ -1,7 +1,24 @@ -set(Eigen_HEADERS AdolcForward BVH IterativeSolvers MatrixFunctions MoreVectorization AutoDiff AlignedVector3 Polynomials - FFT NonLinearOptimization SparseExtra IterativeSolvers - NumericalDiff Skyline MPRealSupport OpenGLSupport KroneckerProduct Splines LevenbergMarquardt - ) +set(Eigen_HEADERS + AdolcForward + AlignedVector3 + ArpackSupport + AutoDiff + BVH + FFT + IterativeSolvers + KroneckerProduct + LevenbergMarquardt + MatrixFunctions + MoreVectorization + MPRealSupport + NonLinearOptimization + NumericalDiff + OpenGLSupport + Polynomials + Skyline + SparseExtra + Splines + ) install(FILES ${Eigen_HEADERS} diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index 292f09564..c8dcf7c16 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -32,11 +32,12 @@ #include +#include "src/Core/util/EmulateArray.h" + // Emulate the cxx11 functionality that we need if the compiler doesn't support it. #if __cplusplus <= 199711L #include "src/Core/util/EmulateCXX11Meta.h" #else -#include #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" #endif diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index cbe416602..7481a9ddb 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -8,8 +8,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_CXX11_TENSOR_MODULE -#define EIGEN_CXX11_TENSOR_MODULE +//#ifndef EIGEN_CXX11_TENSOR_MODULE +//#define EIGEN_CXX11_TENSOR_MODULE #include "Core" @@ -28,14 +28,22 @@ #include #include + +#ifdef _WIN32 +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else #include +#endif #if __cplusplus > 199711 #include #endif #ifdef _WIN32 -#include +#include #elif defined(__APPLE__) #include #else @@ -57,15 +65,19 @@ #endif +#include "src/Tensor/TensorMacros.h" #include "src/Tensor/TensorForwardDeclarations.h" #include "src/Tensor/TensorMeta.h" -#include "src/Tensor/TensorDeviceType.h" +#include "src/Tensor/TensorDeviceDefault.h" +#include "src/Tensor/TensorDeviceThreadPool.h" +#include "src/Tensor/TensorDeviceCuda.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" #include "src/Tensor/TensorDimensions.h" #include "src/Tensor/TensorInitializer.h" #include "src/Tensor/TensorTraits.h" #include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorUInt128.h" #include "src/Tensor/TensorIntDiv.h" #include "src/Tensor/TensorBase.h" @@ -73,6 +85,7 @@ #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" +#include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContraction.h" @@ -80,6 +93,7 @@ #include "src/Tensor/TensorContractionCuda.h" #include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConvolution.h" +#include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorImagePatch.h" #include "src/Tensor/TensorVolumePatch.h" @@ -111,4 +125,4 @@ #include -#endif // EIGEN_CXX11_TENSOR_MODULE +//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 3a08628be..3f149c6a3 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -112,7 +112,7 @@ template struct get<0, type_lis template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static int value = a; }; +template struct get<0, numeric_list> { constexpr static T value = a; }; template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ @@ -252,6 +252,13 @@ template< typename... 
Ts > struct reduce; +template< + typename Reducer +> struct reduce +{ + constexpr static inline int run() { return Reducer::Identity; } +}; + template< typename Reducer, typename A, @@ -275,8 +282,14 @@ template< /* generic binary operations */ -struct sum_op { template constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } }; -struct product_op { template constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } }; +struct sum_op { + template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } + static constexpr int Identity = 0; +}; +struct product_op { + template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } + static constexpr int Identity = 1; +}; struct logical_and_op { template constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; struct logical_or_op { template constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; @@ -321,11 +334,12 @@ constexpr inline Array h_array_reverse(Array arr, numeric_list) } template -constexpr inline std::array array_reverse(std::array arr) +constexpr inline array array_reverse(array arr) { return h_array_reverse(arr, typename gen_numeric_list::type()); } + /* generic array reductions */ // can't reuse standard reduce() interface above because Intel's Compiler @@ -335,39 +349,48 @@ constexpr inline std::array array_reverse(std::array arr) // an infinite loop) template struct h_array_reduce { - constexpr static inline auto run(std::array arr) -> decltype(Reducer::run(h_array_reduce::run(arr), array_get(arr))) + EIGEN_DEVICE_FUNC constexpr static inline auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) { - return Reducer::run(h_array_reduce::run(arr), array_get(arr)); + return Reducer::run(h_array_reduce::run(arr, identity), array_get(arr)); } }; template struct h_array_reduce { - constexpr static inline T run(std::array arr) + EIGEN_DEVICE_FUNC constexpr static inline T run(const array& arr, T) { return array_get<0>(arr); } }; -template -constexpr inline auto array_reduce(std::array arr) -> decltype(h_array_reduce::run(arr)) +template +struct h_array_reduce { - return h_array_reduce::run(arr); + EIGEN_DEVICE_FUNC constexpr static inline T run(const array&, T identity) + { + return identity; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) +{ + return h_array_reduce::run(arr, identity); } /* standard array reductions */ template -constexpr inline auto array_sum(std::array arr) -> decltype(array_reduce(arr)) +EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) { - return array_reduce(arr); + return array_reduce(arr, static_cast(0)); } template -constexpr inline auto array_prod(std::array arr) -> decltype(array_reduce(arr)) +EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) { - return array_reduce(arr); + return array_reduce(arr, static_cast(1)); } template @@ -381,13 +404,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { /* zip an array */ template -constexpr inline std::array h_array_zip(std::array a, std::array b, numeric_list) +constexpr inline array h_array_zip(array a, array b, numeric_list) { - return std::array{{ 
Op::run(array_get(a), array_get(b))... }}; + return array{{ Op::run(array_get(a), array_get(b))... }}; } template -constexpr inline std::array array_zip(std::array a, std::array b) +constexpr inline array array_zip(array a, array b) { return h_array_zip(a, b, typename gen_numeric_list::type()); } @@ -395,13 +418,13 @@ constexpr inline std::array array_zip(std::array< /* zip an array and reduce the result */ template -constexpr inline auto h_array_zip_and_reduce(std::array a, std::array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) +constexpr inline auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) { return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); } template -constexpr inline auto array_zip_and_reduce(std::array a, std::array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) +constexpr inline auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) { return h_array_zip_and_reduce(a, b, typename gen_numeric_list::type()); } @@ -409,13 +432,13 @@ constexpr inline auto array_zip_and_reduce(std::array a, std::array /* apply stuff to an array */ template -constexpr inline std::array h_array_apply(std::array a, numeric_list) +constexpr inline array h_array_apply(array a, numeric_list) { - return std::array{{ Op::run(array_get(a))... }}; + return array{{ Op::run(array_get(a))... }}; } template -constexpr inline std::array array_apply(std::array a) +constexpr inline array array_apply(array a) { return h_array_apply(a, typename gen_numeric_list::type()); } @@ -423,34 +446,34 @@ constexpr inline std::array array_apply(std::array -constexpr inline auto h_array_apply_and_reduce(std::array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) +constexpr inline auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) { return reduce::type...>::run(Op::run(array_get(arr))...); } template -constexpr inline auto array_apply_and_reduce(std::array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) +constexpr inline auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) { return h_array_apply_and_reduce(a, typename gen_numeric_list::type()); } /* repeat a value n times (and make an array out of it * usage: - * std::array = repeat<16>(42); + * array = repeat<16>(42); */ template struct h_repeat { template - constexpr static inline std::array run(t v, numeric_list) + constexpr static inline array run(t v, numeric_list) { return {{ typename id_numeric::type(v)... 
}}; } }; template -constexpr std::array repeat(t v) { return h_repeat::run(v, typename gen_numeric_list::type()); } +constexpr array repeat(t v) { return h_repeat::run(v, typename gen_numeric_list::type()); } /* instantiate a class by a C-style array */ template diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index a590cf4e1..b1528aa66 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -39,46 +39,16 @@ namespace Eigen { -// Use std::array as Eigen array -template using array = std::array; - namespace internal { /* std::get is only constexpr in C++14, not yet in C++11 - * - libstdc++ from version 4.7 onwards has it nevertheless, - * so use that - * - libstdc++ older versions: use _M_instance directly - * - libc++ all versions so far: use __elems_ directly - * - all other libs: use std::get to be portable, but - * this may not be constexpr */ -#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 -#define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) -#define STD_GET_ARR_HACK a.__elems_[I] -#else -#define STD_GET_ARR_HACK std::template get(a) -#endif -template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } -template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } -template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } template constexpr inline T& array_get(std::vector& a) { return a[I]; } template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } -#undef STD_GET_ARR_HACK - -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; - /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h new file mode 100644 index 000000000..ab9c2ec3e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateArray.h @@ -0,0 +1,225 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_ARRAY_H +#define EIGEN_EMULATE_ARRAY_H + + + +// The array class is only available starting with cxx11. Emulate our own here +// if needed. +// Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
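// Editor's note (illustrative sketch, not part of the patch), valid when this
// emulated class is selected instead of std::array (pre-C++11, CUDA, or
// EIGEN_AVOID_STL_ARRAY):
//   Eigen::array<Eigen::DenseIndex, 3> dims(2, 3, 4);            // 3-argument ctor below
//   Eigen::DenseIndex total = Eigen::internal::array_prod(dims); // 2*3*4 = 24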
+#if __cplusplus <= 199711L || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY) + +namespace Eigen { +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + static std::size_t size() { return n; } + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + explicit EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(std::initializer_list l) { + eigen_assert(l.size() == n); + internal::smart_copy(l.begin(), l.end(), values); + } +#endif +}; + + +// Specialize array for zero size +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t) { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast(NULL); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t) const { + eigen_assert(false && "Can't index a zero size array"); + return *static_cast(NULL); + } + + static EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list l) { + eigen_assert(l.size() == 0); + } +#endif +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const T& array_get(const array& a) { + return a[I]; +} + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen array +#include +namespace Eigen { + +template using array = std::array; + +namespace internal { +/* std::get is only constexpr in C++14, not yet in C++11 + * - libstdc++ from version 4.7 onwards has it nevertheless, + * so use that + * - libstdc++ older versions: use _M_instance directly + * - libc++ all versions so far: use __elems_ directly + * - all other libs: use std::get to be portable, but + * this may not be constexpr + */ +#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 +#define STD_GET_ARR_HACK a._M_instance[I] +#elif defined(_LIBCPP_VERSION) +#define STD_GET_ARR_HACK a.__elems_[I] +#else +#define STD_GET_ARR_HACK std::template get(a) +#endif + +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } + +#undef STD_GET_ARR_HACK + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +} // end namespace internal +} // end namespace Eigen + +#endif + + + + + +#endif // EIGEN_EMULATE_ARRAY_H diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 0ae638fb9..d685d4f9d 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -14,105 +14,6 @@ namespace Eigen { -// The array class is only available starting with cxx11. 
Emulate our own here -// if needed -template class array { - public: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } - - static const std::size_t size() { return n; } - - T values[n]; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array() { } - explicit EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v) { - EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { - EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { - EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, - const T& v4) { - EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5) { - EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6) { - EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7) { - EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array( - const T& v1, const T& v2, const T& v3, const T& v4, - const T& v5, const T& v6, const T& v7, const T& v8) { - EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) - values[0] = v1; - values[1] = v2; - values[2] = v3; - values[3] = v4; - values[4] = v5; - values[5] = v6; - values[6] = v7; - values[7] = v8; - } - -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES - array(std::initializer_list l) { - eigen_assert(l.size() == n); - internal::smart_copy(l.begin(), l.end(), values); - } -#endif -}; - - namespace internal { /** \internal @@ -279,7 +180,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod( return arg_prod::value; } -template +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; for (size_t i = 0; i < n; ++i) { prod *= a[i]; } @@ -298,14 +199,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { return prod; } -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { - return a[I]; -} -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { - return a[I]; -} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { @@ -316,23 +209,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a return a[I]; } -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct 
array_size; -template struct array_size& > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size > { - static const size_t value = N; -}; -template struct array_size; -template struct array_size& > { - static const size_t value = N; -}; - struct sum_op { template static inline bool run(A a, B b) { return a + b; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 87e57cebb..407485090 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1149,6 +1149,19 @@ are the smallest of the reduced values. Reduce a tensor using the prod() operator. The resulting values are the product of the reduced values. +### <Operation> all(const Dimensions& new_dims) +### <Operation> all() +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### <Operation> any(const Dimensions& new_dims) +### <Operation> any() +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + + ### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer) Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 6c16e0faa..ad525bac8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -59,7 +59,7 @@ namespace Eigen { * \ref TopicStorageOrders */ -template +template class Tensor : public TensorBase > { public: @@ -78,16 +78,25 @@ class Tensor : public TensorBase0) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, - CoordAccess = true, + CoordAccess = true }; static const int Options = Options_; - static const std::size_t NumIndices = NumIndices_; + static const int NumIndices = NumIndices_; typedef DSizes Dimensions; protected: TensorStorage m_storage; +#ifdef EIGEN_HAS_SFINAE + template + struct isOfNormalIndex{ + static const bool is_array = internal::is_base_of, CustomIndices>::value; + static const bool is_int = NumTraits::IsInteger; + static const bool value = is_array | is_int; + }; +#endif + public: // Metadata EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } @@ -113,12 +122,30 @@ class Tensor : public TensorBase& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { eigen_internal_assert(index >= 0 && index < size()); @@ -135,12 +162,30 @@ class Tensor : public TensorBase& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_internal_assert(index >= 0 && index < size()); @@ -178,9 +223,20 @@ class Tensor : public TensorBase::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + // normal indices EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { - eigen_assert(checkIndexRange(indices)); return coeff(indices); } @@ -190,6 +246,12 @@ class Tensor : public TensorBase& indices) { - eigen_assert(checkIndexRange(indices)); return coeffRef(indices); } + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { // The bracket operator is only for vectors, use the parenthesis operator instead @@ -261,41 +340,42 @@ class Tensor : public TensorBase - inline Tensor(Index firstDimension, IndexTypes... otherDimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... 
otherDimensions) : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #else - inline explicit Tensor(Index dim1) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) : m_storage(dim1, array(dim1)) { EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2) : m_storage(dim1*dim2, array(dim1, dim2)) { EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3) : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) { EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) { EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) - : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) { EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - inline explicit Tensor(const array& dimensions) + /** Normal Dimension */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED @@ -341,7 +421,7 @@ class Tensor : public TensorBase EIGEN_DEVICE_FUNC + template EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) { // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. 
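// Editor's note (illustrative sketch, not part of the patch): resize() keeps the
// rank fixed, only the extent of each dimension may change:
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.resize(4, 3, 2);    // OK: still rank 3, storage is reallocated
//   // t.resize(4, 3);    // rejected by the static assertion above (rank mismatch)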
@@ -350,9 +430,10 @@ class Tensor : public TensorBase& dimensions) { - std::size_t i; + int i; Index size = Index(1); for (i = 0; i < NumIndices; i++) { internal::check_rows_cols_for_overflow::run(size, dimensions[i]); @@ -367,20 +448,39 @@ class Tensor : public TensorBase& dimensions) { array dims; - for (std::size_t i = 0; i < NumIndices; ++i) { + for (int i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; } resize(dims); } + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + + /** Custom Dimension */ +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) + { + resize(internal::customIndices2Array(dimensions)); + } +#endif + #ifndef EIGEN_EMULATE_CXX11_META_H template EIGEN_DEVICE_FUNC void resize(const Sizes& dimensions) { array dims; - for (std::size_t i = 0; i < NumIndices; ++i) { + for (int i = 0; i < NumIndices; ++i) { dims[i] = static_cast(dimensions[i]); } resize(dims); @@ -390,7 +490,7 @@ class Tensor : public TensorBase& dimensions) { array dims; - for (std::size_t i = 0; i < NumIndices; ++i) { + for (int i = 0; i < NumIndices; ++i) { dims[i] = static_cast(dimensions[i]); } resize(dims); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index ee3bf7fe3..c783aab97 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -215,10 +215,18 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()), - m_strides(gen_strides(m_orig_impl.dimensions())), - m_stride_mod(gen_stride_mod(m_orig_impl.dimensions())), - m_stride_div(gen_stride_div()) { } + m_return_dim(op.return_dim()) { + + gen_strides(m_orig_impl.dimensions(), m_strides); + if (Layout == static_cast(ColMajor)) { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; + } else { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; + } + m_stride_div = m_strides[m_return_dim]; + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); @@ -240,9 +248,10 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: - EIGEN_DEVICE_FUNC StrideDims gen_strides(const InputDimensions& dims) { - StrideDims strides; - if (m_return_dim < 0) return strides; // Won't be using these. + EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { + if (m_return_dim < 0) { + return; // Won't be using the strides. + } eigen_assert(m_return_dim < NumDims && "Asking to convert index to a dimension outside of the rank"); @@ -259,28 +268,15 @@ struct TensorEvaluator, Devi strides[i] = strides[i+1] * dims[i+1]; } } - return strides; - } - - EIGEN_DEVICE_FUNC Index gen_stride_mod(const InputDimensions& dims) { - if (Layout == static_cast(ColMajor)) { - return (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : dims.TotalSize(); - } else { - return (m_return_dim > 0) ? 
m_strides[m_return_dim - 1] : dims.TotalSize(); - } - } - - EIGEN_DEVICE_FUNC Index gen_stride_div() { - return m_strides[m_return_dim]; } protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; const int m_return_dim; - const StrideDims m_strides; - const Index m_stride_mod; - const Index m_stride_div; + StrideDims m_strides; + Index m_stride_mod; + Index m_stride_div; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 477e4a174..392acf302 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -86,6 +86,12 @@ class TensorBase return unaryExpr(internal::scalar_sqrt_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> rsqrt() const { @@ -116,6 +122,24 @@ class TensorBase return unaryExpr(internal::scalar_tanh_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> sigmoid() const { @@ -155,7 +179,7 @@ class TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> operator- (Scalar rhs) const { - EIGEN_STATIC_ASSERT((std::numeric_limits::is_signed || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); return unaryExpr(internal::scalar_sub_op(rhs)); } @@ -168,10 +192,16 @@ class TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> operator/ (Scalar rhs) const { - // EIGEN_STATIC_ASSERT(!std::numeric_limits::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); return unaryExpr(internal::scalar_quotient1_op(rhs)); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > cwiseMax(Scalar threshold) const { @@ -248,35 +278,67 @@ class TensorBase // Comparisons and tests. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::less()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::less_equal()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::greater()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::greater_equal()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::equal_to()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> + const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), std::not_equal_to()); + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); } // Coefficient-wise ternary operators. 
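// Editor's note (illustrative sketch, not part of the patch): with the scalar
// overloads added above, comparing a tensor against a constant no longer requires
// spelling out constant():
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setRandom();
//   Eigen::Tensor<bool, 2> mask = t > 0.5f;   // same as t > t.constant(0.5f)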
@@ -302,6 +364,13 @@ class TensorBase return TensorConvolutionOp(derived(), kernel.derived(), dims); } + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& fft) const { + return TensorFFTOp(derived(), fft); + } + // Reductions. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReductionOp, const Dims, const Derived> @@ -363,6 +432,32 @@ class TensorBase return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const TensorConversionOp > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorTupleReducerOp< internal::ArgMaxTupleReducer >, @@ -561,8 +656,8 @@ class TensorBase } protected: - template friend class Tensor; - template friend class TensorFixedSize; + template friend class Tensor; + template friend class TensorFixedSize; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } @@ -578,7 +673,7 @@ class TensorBase : public TensorBase::type PacketReturnType; static const int NumDimensions = DerivedTraits::NumDimensions; - template friend class Tensor; + template friend class Tensor; template friend class TensorFixedSize; template friend class TensorBase; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 24a0df820..dc64959e1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -99,6 +99,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // tensor with N >= 1 of 1 element first and then broadcast. 
+ EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { @@ -152,11 +156,11 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; @@ -164,11 +168,11 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq()(0, 1)) { + if (internal::index_statically_eq(0, 1)) { eigen_assert(index < m_impl.dimensions()[0]); inputIndex += index; } else { - if (internal::index_statically_eq()(0, 1)) { + if (internal::index_statically_eq(0, 1)) { eigen_assert(index % m_impl.dimensions()[0] == 0); } else { inputIndex += (index % m_impl.dimensions()[0]); @@ -182,11 +186,11 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; @@ -194,11 +198,11 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq()(NumDims-1, 1)) { + if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index < m_impl.dimensions()[NumDims-1]); inputIndex += index; } else { - if (internal::index_statically_eq()(NumDims-1, 1)) { + if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); } else { inputIndex += (index % m_impl.dimensions()[NumDims-1]); @@ -231,11 +235,11 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; @@ -244,11 +248,11 @@ struct TensorEvaluator, Device> index -= idx * m_outputStrides[i]; } Index innermostLoc; - if (internal::index_statically_eq()(0, 1)) { + if (internal::index_statically_eq(0, 1)) { eigen_assert(index < m_impl.dimensions()[0]); innermostLoc = index; } else { - if (internal::index_statically_eq()(0, 1)) { + if (internal::index_statically_eq(0, 1)) { eigen_assert(index % m_impl.dimensions()[0] == 0); innermostLoc = 0; } else { @@ -284,11 +288,11 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { + if 
(internal::index_statically_eq(i, 1)) { eigen_assert(idx < m_impl.dimensions()[i]); inputIndex += idx * m_inputStrides[i]; } else { - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { eigen_assert(idx % m_impl.dimensions()[i] == 0); } else { inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; @@ -297,11 +301,11 @@ struct TensorEvaluator, Device> index -= idx * m_outputStrides[i]; } Index innermostLoc; - if (internal::index_statically_eq()(NumDims-1, 1)) { + if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index < m_impl.dimensions()[NumDims-1]); innermostLoc = index; } else { - if (internal::index_statically_eq()(NumDims-1, 1)) { + if (internal::index_statically_eq(NumDims-1, 1)) { eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); innermostLoc = 0; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c9fa39e51..abc3c92ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,7 +50,7 @@ struct nested, 1, typename eval struct DimensionId { - DimensionId(DenseIndex dim) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -60,7 +60,7 @@ struct DimensionId template <> struct DimensionId { - DimensionId(DenseIndex dim) : actual_dim(dim) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index fa05cab30..3d153bb94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -131,7 +131,9 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); const Dimensions& rhs_dims = m_rightImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e60fab713..eda93a1de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -33,14 +33,14 @@ template -class BaseTensorContractionMapper { +class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : m_tensor(tensor), m_nocontract_strides(nocontract_strides), m_ij_strides(ij_strides), @@ -160,104 +160,23 @@ class BaseTensorContractionMapper { }; - template -class TensorContractionInputMapper; - -template -class 
TensorContractionSubMapper { + int packet_size, bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> + class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - - typedef TensorContractionInputMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); - return loadPacket(i); - } - - template - bool aligned(Index /*i*/) const { - return false; - } - - private: - const ParentMapper& m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template::size : 1), - bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; @@ -322,35 +241,23 @@ class 
TensorContractionInputMapper }; - - template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - + bool inner_dim_contiguous, + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ public: - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } typedef typename packet_traits::type Packet; EIGEN_DEVICE_FUNC @@ -365,6 +272,106 @@ class TensorContractionInputMapper +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template + EIGEN_DEVICE_FUNC bool aligned(Index) const { + return false; + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template +class 
TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } +}; + + template struct traits > @@ -488,7 +495,7 @@ struct TensorContractionEvaluatorBase internal::array_size::Dimensions>::value; static const int RDims = internal::array_size::Dimensions>::value; - static const unsigned int ContractDims = internal::array_size::value; + static const int ContractDims = internal::array_size::value; static const int NumDims = max_n_1::size; typedef array left_dim_mapper_t; @@ -524,7 +531,7 @@ struct TensorContractionEvaluatorBase eval_right_dims[i] = m_rightImpl.dimensions()[i]; } // We keep the pairs of contracting indices. - for (unsigned int i = 0; i < ContractDims; i++) { + for (int i = 0; i < ContractDims; i++) { eval_op_indices[i].first = op.indices()[i].first; eval_op_indices[i].second = op.indices()[i].second; } @@ -538,7 +545,7 @@ struct TensorContractionEvaluatorBase } // We need to flip all the pairs of contracting indices as well as // reversing the dimensions. - for (unsigned int i = 0; i < ContractDims; i++) { + for (int i = 0; i < ContractDims; i++) { eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; } @@ -577,7 +584,7 @@ struct TensorContractionEvaluatorBase for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor bool contracting = false; - for (unsigned int j = 0; j < ContractDims; j++) { + for (int j = 0; j < ContractDims; j++) { if (eval_op_indices[j].first == i) { contracting = true; break; @@ -605,7 +612,7 @@ struct TensorContractionEvaluatorBase for (int i = 0; i < RDims; i++) { bool contracting = false; // find if we are contracting on index i of right tensor - for (unsigned int j = 0; j < ContractDims; j++) { + for (int j = 0; j < ContractDims; j++) { if (eval_op_indices[j].second == i) { contracting = true; break; @@ -632,7 +639,7 @@ struct TensorContractionEvaluatorBase // each tensor, we'll only look at the first tensor here. 
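Note: the mapper reshuffling above (SimpleTensorContractionMapper as the new base, with BaseTensorContractionMapper and the sub/input mappers layered on top) is internal; user-facing contractions are unchanged. For reference, a sketch of the API this evaluator serves (illustrative code, not part of the patch):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

void contraction_demo() {
  Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
  a.setRandom();
  b.setRandom();

  // Contract dimension 1 of `a` with dimension 0 of `b`: an ordinary matrix product.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
  Eigen::Tensor<float, 2> c = a.contract(b, dims);  // c is 3 x 5
}
```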
m_rhs_inner_dim_contiguous = true; m_rhs_inner_dim_reordered = false; - for (unsigned int i = 0; i < ContractDims; i++) { + for (int i = 0; i < ContractDims; i++) { Index left = eval_op_indices[i].first; Index right = eval_op_indices[i].second; @@ -640,7 +647,7 @@ struct TensorContractionEvaluatorBase eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size"); - if (i+1 < internal::array_size::value) { + if (i+1 < static_cast(internal::array_size::value)) { m_k_strides[i+1] = m_k_strides[i] * size; } else { m_k_size = m_k_strides[i] * size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f6bd949bd..90ee50678 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1147,7 +1147,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, bool check_rhs = (base_n + 63) >= n_size; bool check_lhs128 = (base_m + 127) >= m_size; - bool check_lhs64 = (base_m + 63) >= m_size; if (!check_rhs) { if (!check_lhs128) { @@ -1227,9 +1226,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 4ca978ab4..3ca7daf32 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -52,6 +52,7 @@ struct nested, 1, typename eval struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} @@ -67,6 +68,7 @@ struct PacketConverter { template struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {} @@ -87,6 +89,7 @@ struct PacketConverter { template struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} @@ -124,8 +127,8 @@ class TensorConversionOp : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::nested::type Nested; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef Scalar CoeffReturnType; + typedef Packet PacketReturnType; typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 17f10c07b..29e50a3b2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -63,92 +63,6 @@ template class TensorDevice { ExpressionType& m_expression; }; - -#ifdef EIGEN_USE_THREADS -template class TensorDevice { - public: - TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} - - template - EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& 
other) { - typedef TensorAssignOp Assign; - Assign assign(m_expression, other); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; - Sum sum(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, sum); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; - Difference difference(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, difference); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - protected: - const ThreadPoolDevice& m_device; - ExpressionType& m_expression; -}; -#endif - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template class TensorDevice -{ - public: - TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} - - template - EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp Assign; - Assign assign(m_expression, other); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; - Sum sum(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, sum); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; - Difference difference(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, difference); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - protected: - const GpuDevice& m_device; - ExpressionType& m_expression; -}; -#endif - - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h similarity index 51% rename from unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h rename to unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 2ff7d471d..4d7570077 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -7,272 +7,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
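The rename to TensorDeviceCuda.h pairs with the guard change just below: the header now compiles away entirely unless EIGEN_USE_GPU is defined. On the consumer side that means opting in before including the module (a short sketch):

```cpp
// User code opting in to the CUDA device support split out here.
// EIGEN_USE_GPU must be defined before the Tensor module is included,
// which in turn pulls in TensorDeviceCuda.h.
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>
```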
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H namespace Eigen { -// Default device for the machine (typically a single cpu core) -struct DefaultDevice { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ - // Running on the host CPU - return 1; -#else - // Running on a CUDA device - return 32; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ - // Running single threaded on the host CPU - // Should return an enum that encodes the ISA supported by the CPU - return 1; -#else - // Running on a CUDA device - return __CUDA_ARCH__ / 100; -#endif - } -}; - - -// Multiple cpu cores -// We should really use a thread pool here but first we need to find a portable thread pool library. -#ifdef EIGEN_USE_THREADS - -// This defines an interface that ThreadPoolDevice can take to use -// custom thread pools underneath. -class ThreadPoolInterface { - public: - virtual void Schedule(std::function fn) = 0; - - virtual ~ThreadPoolInterface() {} -}; - -// The implementation of the ThreadPool type ensures that the Schedule method -// runs the functions it is provided in FIFO order when the scheduling is done -// by a single thread. -class ThreadPool : public ThreadPoolInterface { - public: - // Construct a pool that contains "num_threads" threads. - explicit ThreadPool(int num_threads) { - for (int i = 0; i < num_threads; i++) { - threads_.push_back(new std::thread([this]() { WorkerLoop(); })); - } - } - - // Wait until all scheduled work has finished and then destroy the - // set of threads. - ~ThreadPool() - { - { - // Wait for all work to get done. - std::unique_lock l(mu_); - empty_.wait(l, [this]() { return pending_.empty(); }); - exiting_ = true; - - // Wakeup all waiters. - for (auto w : waiters_) { - w->ready = true; - w->work = nullptr; - w->cv.notify_one(); - } - } - - // Wait for threads to finish. - for (auto t : threads_) { - t->join(); - delete t; - } - } - - // Schedule fn() for execution in the pool of threads. The functions are - // executed in the order in which they are scheduled. 
- void Schedule(std::function fn) { - std::unique_lock l(mu_); - if (waiters_.empty()) { - pending_.push_back(fn); - } else { - Waiter* w = waiters_.back(); - waiters_.pop_back(); - w->ready = true; - w->work = fn; - w->cv.notify_one(); - } - } - - protected: - void WorkerLoop() { - std::unique_lock l(mu_); - Waiter w; - while (!exiting_) { - std::function fn; - if (pending_.empty()) { - // Wait for work to be assigned to me - w.ready = false; - waiters_.push_back(&w); - w.cv.wait(l, [&w]() { return w.ready; }); - fn = w.work; - w.work = nullptr; - } else { - // Pick up pending work - fn = pending_.front(); - pending_.pop_front(); - if (pending_.empty()) { - empty_.notify_all(); - } - } - if (fn) { - mu_.unlock(); - fn(); - mu_.lock(); - } - } - } - - private: - struct Waiter { - std::condition_variable cv; - std::function work; - bool ready; - }; - - std::mutex mu_; - std::vector threads_; // All threads - std::vector waiters_; // Stack of waiting threads. - std::deque> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() - bool exiting_ = false; -}; - - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object. -// but only one caller must call Notify() on the object. -class Notification { - public: - Notification() : notified_(false) {} - ~Notification() {} - - void Notify() { - std::unique_lock l(mu_); - eigen_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void WaitForNotification() { - std::unique_lock l(mu_); - cv_.wait(l, [this]() { return notified_; } ); - } - - private: - std::mutex mu_; - std::condition_variable cv_; - bool notified_; -}; - -// Runs an arbitrary function and then calls Notify() on the passed in -// Notification. -template struct FunctionWrapper -{ - static void run(Notification* n, Function f, Args... args) { - f(args...); - n->Notify(); - } -}; - -static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { - if (n) { - n->WaitForNotification(); - } -} - - -// Build a thread pool device on top the an existing pool of threads. -struct ThreadPoolDevice { - // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - return num_threads_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - // Should return an enum that encodes the ISA supported by the CPU - return 1; - } - - template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... 
args) const { - Notification* n = new Notification(); - std::function func = - std::bind(&FunctionWrapper::run, n, f, args...); - pool_->Schedule(func); - return n; - } - template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - std::function func = std::bind(f, args...); - pool_->Schedule(func); - } - - private: - ThreadPoolInterface* pool_; - size_t num_threads_; -}; - -#endif - - -// GPU offloading -#ifdef EIGEN_USE_GPU - // This defines an interface that GPUDevice can take to use // CUDA streams underneath. class StreamInterface { @@ -295,6 +35,7 @@ static void initializeDeviceProp() { if (!m_devicePropInitialized) { int num_devices; cudaError_t status = cudaGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); m_deviceProperties = new cudaDeviceProp[num_devices]; for (int i = 0; i < num_devices; ++i) { @@ -330,6 +71,7 @@ class CudaStreamDevice : public StreamInterface { } else { int num_devices; cudaError_t err = cudaGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); assert(device < num_devices); device_ = device; @@ -343,6 +85,7 @@ class CudaStreamDevice : public StreamInterface { } virtual void* allocate(size_t num_bytes) const { cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); void* result; err = cudaMalloc(&result, num_bytes); @@ -352,6 +95,7 @@ class CudaStreamDevice : public StreamInterface { } virtual void deallocate(void* buffer) const { cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); assert(buffer != NULL); err = cudaFree(buffer); @@ -363,7 +107,6 @@ class CudaStreamDevice : public StreamInterface { int device_; }; - struct GpuDevice { // The StreamInterface is not owned: the caller is // responsible for its initialization and eventual destruction. 
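For reviewers: with the stream interface above, offloading an expression stays a one-liner through device(). A sketch, assuming nvcc compilation, EIGEN_USE_GPU, and buffers already resident on the GPU (names illustrative):

```cpp
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void gpu_demo(float* gpu_in, float* gpu_out, int n) {
  Eigen::CudaStreamDevice stream;    // wraps the default CUDA stream
  Eigen::GpuDevice device(&stream);  // the stream's lifetime belongs to the caller

  Eigen::TensorMap<Eigen::Tensor<float, 1> > in(gpu_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, n);

  out.device(device) = in * 2.0f;    // evaluated on the GPU
}
```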
@@ -398,6 +141,7 @@ struct GpuDevice { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else eigen_assert(false && "The default device should be used instead to generate kernel code"); @@ -408,6 +152,7 @@ struct GpuDevice { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else eigen_assert(false && "The default device should be used instead to generate kernel code"); @@ -418,6 +163,7 @@ struct GpuDevice { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else eigen_assert(false && "The default device should be used instead to generate kernel code"); @@ -427,6 +173,7 @@ struct GpuDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { #ifndef __CUDA_ARCH__ cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else eigen_assert(false && "The default device should be used instead to generate kernel code"); @@ -450,8 +197,9 @@ struct GpuDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#ifndef __CUDA_ARCH__ +#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) cudaError_t err = cudaStreamSynchronize(stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else assert(false && "The default device should be used instead to generate kernel code"); @@ -477,8 +225,12 @@ struct GpuDevice { // This function checks if the CUDA runtime recorded an error for the // underlying stream device. inline bool ok() const { +#ifdef __CUDACC__ cudaError_t error = cudaStreamQuery(stream_->stream()); return (error == cudaSuccess) || (error == cudaErrorNotReady); +#else + return false; +#endif } private: @@ -486,18 +238,22 @@ struct GpuDevice { }; - +#ifndef __CUDA_ARCH__ #define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ assert(cudaGetLastError() == cudaSuccess); - +#else +#define LAUNCH_CUDA_KERNEL(...) \ + eigen_assert(false && "Cannot launch a kernel from another kernel"); +#endif // FIXME: Should be device and kernel specific. +#ifdef __CUDACC__ static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { cudaError_t status = cudaDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) assert(status == cudaSuccess); } - #endif } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 000000000..267f6f8e3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
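The new TensorDeviceDefault.h below carries the DefaultDevice split out of TensorDeviceType.h. It is the device every plain tensor assignment uses implicitly, so no setup is required (a sketch):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

void default_device_demo() {
  Eigen::Tensor<float, 1> a(8), b(8);
  a.setRandom();
  // Runs through TensorExecutor with DefaultDevice: single-threaded on
  // the host, or one thread per lane when executed inside a CUDA kernel.
  b = a + a;
}
```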
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#ifndef __CUDA_ARCH__ + // Running on the host CPU + return 1; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#ifndef __CUDA_ARCH__ + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#else + // Running on a CUDA device + return __CUDA_ARCH__ / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h new file mode 100644 index 000000000..dcbef5b03 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -0,0 +1,224 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H + +namespace Eigen { + +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + virtual void Schedule(std::function fn) = 0; + + virtual ~ThreadPoolInterface() {} +}; + +// The implementation of the ThreadPool type ensures that the Schedule method +// runs the functions it is provided in FIFO order when the scheduling is done +// by a single thread. +class ThreadPool : public ThreadPoolInterface { + public: + // Construct a pool that contains "num_threads" threads. + explicit ThreadPool(int num_threads) { + for (int i = 0; i < num_threads; i++) { + threads_.push_back(new std::thread([this]() { WorkerLoop(); })); + } + } + + // Wait until all scheduled work has finished and then destroy the + // set of threads. + ~ThreadPool() + { + { + // Wait for all work to get done. + std::unique_lock l(mu_); + empty_.wait(l, [this]() { return pending_.empty(); }); + exiting_ = true; + + // Wakeup all waiters. + for (auto w : waiters_) { + w->ready = true; + w->work = nullptr; + w->cv.notify_one(); + } + } + + // Wait for threads to finish. 
+ for (auto t : threads_) { + t->join(); + delete t; + } + } + + // Schedule fn() for execution in the pool of threads. The functions are + // executed in the order in which they are scheduled. + void Schedule(std::function fn) { + std::unique_lock l(mu_); + if (waiters_.empty()) { + pending_.push_back(fn); + } else { + Waiter* w = waiters_.back(); + waiters_.pop_back(); + w->ready = true; + w->work = fn; + w->cv.notify_one(); + } + } + + protected: + void WorkerLoop() { + std::unique_lock l(mu_); + Waiter w; + while (!exiting_) { + std::function fn; + if (pending_.empty()) { + // Wait for work to be assigned to me + w.ready = false; + waiters_.push_back(&w); + w.cv.wait(l, [&w]() { return w.ready; }); + fn = w.work; + w.work = nullptr; + } else { + // Pick up pending work + fn = pending_.front(); + pending_.pop_front(); + if (pending_.empty()) { + empty_.notify_all(); + } + } + if (fn) { + mu_.unlock(); + fn(); + mu_.lock(); + } + } + } + + private: + struct Waiter { + std::condition_variable cv; + std::function work; + bool ready; + }; + + std::mutex mu_; + std::vector threads_; // All threads + std::vector waiters_; // Stack of waiting threads. + std::deque> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() + bool exiting_ = false; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object. +// but only one caller must call Notify() on the object. +class Notification { + public: + Notification() : notified_(false) {} + ~Notification() {} + + void Notify() { + std::unique_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void WaitForNotification() { + std::unique_lock l(mu_); + cv_.wait(l, [this]() { return notified_; } ); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template struct FunctionWrapper +{ + static void run(Notification* n, Function f, Args... args) { + f(args...); + n->Notify(); + } +}; + +static EIGEN_STRONG_INLINE void wait_until_ready(Notification* n) { + if (n) { + n->WaitForNotification(); + } +} + + +// Build a thread pool device on top the an existing pool of threads. +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. 
+ ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { + Notification* n = new Notification(); + std::function func = + std::bind(&FunctionWrapper::run, n, f, args...); + pool_->Schedule(func); + return n; + } + template + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { + std::function func = std::bind(f, args...); + pool_->Schedule(func); + } + + private: + ThreadPoolInterface* pool_; + size_t num_threads_; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h index 9773afccf..ca9ac79df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -23,6 +23,7 @@ namespace Eigen { */ template struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index operator[] (const Index i) const { return i; } }; @@ -45,184 +46,184 @@ template const Index array_get(c #if defined(EIGEN_HAS_CONSTEXPR) template -struct index_known_statically > { - constexpr bool operator() (const DenseIndex) const { +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } }; template -struct index_known_statically > { - constexpr bool operator() (const DenseIndex) const { +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; } }; template -struct all_indices_known_statically > { - constexpr bool operator() () const { +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } }; template -struct all_indices_known_statically > { - constexpr bool operator() () const { +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } }; template -struct indices_statically_known_to_increase > { - constexpr bool operator() () const { +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } }; template -struct indices_statically_known_to_increase > { - constexpr bool operator() () const { +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { return true; } }; template -struct index_statically_eq > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) 
const { +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } }; template -struct index_statically_eq > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; } }; template -struct index_statically_ne > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } }; template -struct index_statically_ne > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; } }; template -struct index_statically_gt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } }; template -struct index_statically_gt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; } }; template -struct index_statically_lt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } }; template -struct index_statically_lt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; } }; #else template -struct index_known_statically > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { return true; } }; template -struct index_known_statically > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex) const { +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { return true; } }; template -struct all_indices_known_statically > { - EIGEN_ALWAYS_INLINE bool operator() () const { +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { return true; } }; template -struct all_indices_known_statically > { - EIGEN_ALWAYS_INLINE bool operator() () const { +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { return true; } }; template -struct indices_statically_known_to_increase > { - EIGEN_ALWAYS_INLINE bool operator() () const { +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return true; } }; template -struct indices_statically_known_to_increase > { - EIGEN_ALWAYS_INLINE bool operator() () const { +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return true; } }; template -struct index_statically_eq > { - EIGEN_ALWAYS_INLINE bool operator() (const 
DenseIndex, const DenseIndex) const { +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_eq > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_ne > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ return false; } }; template -struct index_statically_ne > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_gt > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_gt > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_lt > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; template -struct index_statically_lt > { - EIGEN_ALWAYS_INLINE bool operator() (const DenseIndex, const DenseIndex) const { +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { return false; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d6ec62a74..f3c9a3148 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -52,8 +52,8 @@ struct fixed_size_tensor_index_linearization_helper static inline Index run(array const& indices, const Dimensions& dimensions) { - return array_get(indices) + - dget::value * + return array_get(indices) + + dget::value * fixed_size_tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -62,10 +62,9 @@ template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, - const Dimensions&) + static inline Index run(array const&, const Dimensions&) { - return array_get(indices); + return 0; } }; @@ -76,8 +75,8 @@ struct fixed_size_tensor_index_extraction_helper static inline Index run(const Index index, const Dimensions& dimensions) { - const Index mult = (index == n) ? 1 : 0; - return array_get(dimensions) * mult + + const Index mult = (index == n-1) ? 
1 : 0; + return array_get(dimensions) * mult + fixed_size_tensor_index_extraction_helper::run(index, dimensions); } }; @@ -86,13 +85,12 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, - const Dimensions& dimensions) + static inline Index run(const Index, + const Dimensions&) { - const Index mult = (index == 0) ? 1 : 0; - return array_get<0>(dimensions) * mult; + return 0; } -}; + }; } // end namespace internal @@ -130,16 +128,16 @@ struct Sizes : internal::numeric_list { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; @@ -216,17 +214,17 @@ template ::value; default: eigen_assert(false && "index overflow"); - return static_cast(-1); + return static_cast(-1); } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } }; @@ -267,10 +265,10 @@ struct tensor_index_linearization_helper // Dynamic size -template +template struct DSizes : array { typedef array Base; - static const std::size_t count = NumDims; + static const int count = NumDims; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { return NumDims; @@ -280,8 +278,8 @@ struct DSizes : array { return internal::array_prod(*static_cast(this)); } - EIGEN_DEVICE_FUNC DSizes() { - for (std::size_t i = 0 ; i < NumDims; ++i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0 ; i < NumDims; ++i) { (*this)[i] = 0; } } @@ -371,10 +369,10 @@ struct tensor_vsize_index_linearization_helper namespace internal { -template struct array_size > { +template struct array_size > { static const size_t value = NumDims; }; -template struct array_size > { +template struct array_size > { static const size_t value = NumDims; }; #ifndef EIGEN_EMULATE_CXX11_META_H @@ -387,6 +385,10 @@ static const std::ptrdiff_t value = Sizes::count; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { return get >::value; } +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} #else template struct array_size > { static const size_t value = Sizes::count; @@ -402,22 +404,22 @@ 
template -struct sizes_match_up_to_dim { +struct sizes_match_below_dim { static inline bool run(Dims1&, Dims2&) { return false; } }; template -struct sizes_match_up_to_dim { +struct sizes_match_below_dim { static inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get(dims1) == array_get(dims2)) & - sizes_match_up_to_dim::run(dims1, dims2); + return (array_get(dims1) == array_get(dims2)) & + sizes_match_below_dim::run(dims1, dims2); } }; template -struct sizes_match_up_to_dim { - static inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get<0>(dims1) == array_get<0>(dims2)); +struct sizes_match_below_dim { + static inline bool run(Dims1&, Dims2&) { + return true; } }; @@ -426,7 +428,7 @@ struct sizes_match_up_to_dim { template bool dimensions_match(Dims1& dims1, Dims2& dims2) { - return internal::sizes_match_up_to_dim::value-1, internal::array_size::value-1>::run(dims1, dims2); + return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index be0b07cdf..902f25247 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -319,7 +319,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index b2800aefb..c28078882 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -50,6 +50,7 @@ class TensorExecutor { public: typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); @@ -57,7 +58,7 @@ class TensorExecutor if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); - static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int PacketSize = unpacket_traits::PacketReturnType>::size; const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { @@ -149,7 +150,24 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. 
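Before the GPU path below, note that the thread-pool path (moved into TensorDeviceThreadPool.h earlier in this patch) is driven through the same device() mechanism. A sketch using the ThreadPool and ThreadPoolDevice defined above, assuming EIGEN_USE_THREADS:

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

void thread_pool_demo() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();

  Eigen::ThreadPool pool(4);                 // 4 worker threads; the pool outlives the device
  Eigen::ThreadPoolDevice device(&pool, 4);  // the device only borrows the pool

  c.device(device) = a + b;                  // evaluated on the pool
}
```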
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(__CUDACC__) + template __global__ void __launch_bounds__(1024) @@ -193,48 +211,53 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) { } } - -template -class TensorExecutor +/*static*/ +template +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) { - public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const GpuDevice& device) +#ifndef __CUDA_ARCH__ + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock(); - const int block_size = device.maxCudaThreadsPerBlock(); - const Index size = array_prod(evaluator.dimensions()); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create at least one block to ensure we won't crash if we're called with tensors of size 0.
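+      // Grid-size choice, spelled out: ceil(size / block_size) blocks are enough to
+      // cover every coefficient; that count is capped at the number of blocks the
+      // device can keep resident (max_blocks) and floored at 1 so an empty tensor
+      // still produces a valid launch configuration.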
+ const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } -}; - -template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const GpuDevice& device) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock(); - const int block_size = device.maxCudaThreadsPerBlock(); - const Index size = array_prod(evaluator.dimensions()); - LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); - } -}; - + evaluator.cleanup(); +#else + eigen_assert(false && "Cannot launch a kernel from another kernel"); #endif +} + + +/*static*/ +template +EIGEN_DEVICE_FUNC inline void TensorExecutor::run(const Expression& expr, const GpuDevice& device) +{ +#ifndef __CUDA_ARCH__ + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create at least one block to ensure we won't crash if we're called with tensors of size 0. + const int num_blocks = numext::maxi(numext::mini(max_blocks, (size + block_size - 1) / block_size), 1); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +#else + eigen_assert(false && "Cannot launch a kernel from another kernel"); +#endif +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 000000000..215a4ebad --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,598 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +// NVCC fails to compile this code +#if !defined(__CUDACC__) + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class.
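+ *
+ * Performs the discrete Fourier transform along a user-chosen subset of the
+ * tensor's dimensions. A usage sketch, assuming the TensorBase::fft() entry
+ * point that exposes this op (the selectors are the FFTResultType and
+ * FFTDirection enums added in TensorForwardDeclarations.h):
+ *
+ *   Eigen::Tensor<float, 2> input(4, 8);
+ *   Eigen::array<int, 1> dims2fft;
+ *   dims2fft[0] = 1;  // transform along dimension 1 only
+ *   Eigen::Tensor<std::complex<float>, 2> spectrum =
+ *       input.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(dims2fft);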
+ * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template struct MakeComplex { + template + EIGEN_DEVICE_FUNC + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const T& val) const { return std::complex(val, 0); } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const std::complex& val) const { return val; } +}; + +template struct PartOf { + template T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.real(); } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.imag(); } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), 
m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); + evalToBuf(m_data); + return true; + } + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + int dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? 
NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + ComplexScalar pos_j_base = ComplexScalar(std::cos(M_PI/line_len), std::sin(M_PI/line_len)); + for (Index j = 0; j < line_len + 1; ++j) { + pos_j_base_powered[j] = std::pow(pos_j_base, j * j); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + for (Index j = 0; j < line_len; ++j) { + Index offset = getIndexFromOffset(base_offset, dim, j); + line_buf[j] = buf[offset]; + } + + // process the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + for (Index j = 0; j < line_len; ++j) { + const ComplexScalar div_factor = (FFTDir == FFT_FORWARD) ? ComplexScalar(1, 0) : ComplexScalar(line_len, 0); + Index offset = getIndexFromOffset(base_offset, dim, j); + buf[offset] = line_buf[j] / div_factor; + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call the Cooley-Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * std::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = std::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = std::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + 
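+      // a[] now holds the (bit-reverse scrambled) pointwise product of the two
+      // spectra; the reverse butterfly below, together with the 1/m scaling that
+      // follows it, is the inverse FFT turning that product back into the circular
+      // convolution at the heart of Bluestein's algorithm: an arbitrary-length DFT
+      // recast as a convolution of power-of-two length m >= 2*n - 1.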
compute_1D_Butterfly(a, m, log_len); + + //Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * std::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n == 1) { + return; + } + else if (n == 2) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + return; + } + else if (n == 4) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if(Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } + else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + return; + } + else if (n == 8) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } + else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } + else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; + // SQRT2DIV2 = sqrt(2)/2 + #define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } + else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + + return; + } + else { + compute_1D_Butterfly(data, n/2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n/2, n/2, n_power_of_2 - 1); + //Original code: + //RealScalar wtemp = std::sin(M_PI/n); + //RealScalar wpi = -std::sin(2 * M_PI/n); + RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + RealScalar wpi; + if (Dir == FFT_FORWARD) { + wpi = m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + else { + wpi = 0 - m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + } + + const ComplexScalar wp(wtemp, wpi); + ComplexScalar w(1.0, 0.0); + for(Index i = 0; i < n/2; i++) { + ComplexScalar temp(data[i + n/2] * w); + data[i + n/2] = data[i] - temp; + 
data[i] += temp; + w += w * wp; + } + return; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + Index m_size; + const FFT& m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + CoeffReturnType* m_data; + const Device& m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + RealScalar m_sin_PI_div_n_LUT[32] = { + 0.0, + -2, + -0.999999999999999, + -0.292893218813453, + -0.0761204674887130, + -0.0192147195967696, + -0.00481527332780311, + -0.00120454379482761, + -3.01181303795779e-04, + -7.52981608554592e-05, + -1.88247173988574e-05, + -4.70619042382852e-06, + -1.17654829809007e-06, + -2.94137117780840e-07, + -7.35342821488550e-08, + -1.83835707061916e-08, + -4.59589268710903e-09, + -1.14897317243732e-09, + -2.87243293150586e-10, + -7.18108232902250e-11, + -1.79527058227174e-11, + -4.48817645568941e-12, + -1.12204411392298e-12, + -2.80511028480785e-13, + -7.01277571201985e-14, + -1.75319392800498e-14, + -4.38298482001247e-15, + -1.09574620500312e-15, + -2.73936551250781e-16, + -6.84841378126949e-17, + -1.71210344531737e-17, + -4.28025861329343e-18 + }; + + // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + 0.0, + 0.0, + -1.00000000000000e+00, + -7.07106781186547e-01, + -3.82683432365090e-01, + -1.95090322016128e-01, + -9.80171403295606e-02, + -4.90676743274180e-02, + -2.45412285229123e-02, + -1.22715382857199e-02, + -6.13588464915448e-03, + -3.06795676296598e-03, + -1.53398018628477e-03, + -7.66990318742704e-04, + -3.83495187571396e-04, + -1.91747597310703e-04, + -9.58737990959773e-05, + -4.79368996030669e-05, + -2.39684498084182e-05, + -1.19842249050697e-05, + -5.99211245264243e-06, + -2.99605622633466e-06, + -1.49802811316901e-06, + -7.49014056584716e-07, + -3.74507028292384e-07, + -1.87253514146195e-07, + -9.36267570730981e-08, + -4.68133785365491e-08, + -2.34066892682746e-08, + -1.17033446341373e-08, + -5.85167231706864e-09, + -2.92583615853432e-09 + }; +}; + +} // end namespace Eigen + +#endif // __CUDACC__ + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 5f3e49e61..a4d6ce6b3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -68,7 +68,7 @@ class TensorFixedSize : public TensorBase - inline const Scalar& coeff(Index 
firstIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -90,9 +90,17 @@ class TensorFixedSize : public TensorBase - inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -114,9 +122,17 @@ class TensorFixedSize : public TensorBase - inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -138,6 +154,13 @@ class TensorFixedSize : public TensorBase - inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -170,6 +193,13 @@ class TensorFixedSize : public TensorBase, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); const Index numValues = m_impl.dimensions().TotalSize(); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); @@ -121,7 +121,7 @@ struct TensorEvaluator, Device> m_impl.cleanup(); return true; } - EIGEN_STRONG_INLINE void cleanup() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate(m_buffer); m_buffer = NULL; } @@ -132,7 +132,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_buffer + index); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index c22444e6f..a8bd8b888 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -12,7 +12,7 @@ namespace Eigen { -template class Tensor; +template class Tensor; template class TensorFixedSize; template class TensorMap; template class TensorRef; @@ -29,6 +29,7 @@ template class Tenso template class TensorContractionOp; template class TensorConversionOp; template class TensorConvolutionOp; +template class TensorFFTOp; template class TensorPatchOp; template class TensorImagePatchOp; template class TensorVolumePatchOp; @@ -58,6 +59,18 @@ struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; +enum FFTResultType { + RealPart = 
0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + + namespace internal { template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index ed259399b..34ba4e392 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -14,6 +14,20 @@ namespace Eigen { namespace internal { +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. + */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::MulCost, PacketAccess = false }; }; + + /** \internal * \brief Template functor to compute the sigmoid of a scalar * \sa class CwiseUnaryOp, ArrayBase::sigmoid() @@ -26,8 +40,8 @@ struct scalar_sigmoid_op { return one / (one + std::exp(-x)); } - template - inline Packet packetOp(const Packet& x) const { + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { const Packet one = pset1(1); return pdiv(one, padd(one, pexp(pnegate(x)))); } @@ -82,6 +96,7 @@ template struct MeanReducer static const bool PacketAccess = true; static const bool IsStateful = true; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { @@ -219,6 +234,33 @@ template struct ProdReducer }; +struct AndReducer +{ + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +struct OrReducer { + static const bool PacketAccess = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + // Argmin/Argmax reducers template struct ArgMaxTupleReducer { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index 3b6f2c730..38a833f82 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -33,7 +33,10 @@ std::ostream& operator << (std::ostream& os, const TensorBase::value == 1) { + static const int rank = internal::array_size::value; + if (rank == 0) { + os << tensor.coeff(0); + } else if (rank == 1) { Map > array(const_cast(tensor.data()), total_size); os << array; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 78e1d2bd1..74ce6d0ec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -77,38 +77,128 @@ struct is_compile_time_constant& > { static constexpr bool value = true; }; + + + +template +struct IndexTuple; + +template +struct IndexTuple { + constexpr IndexTuple() : head(), others() { } + constexpr IndexTuple(const T& v, const O... 
o) : head(v), others(o...) { } + + constexpr static int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template + struct IndexTuple { + constexpr IndexTuple() : head() { } + constexpr IndexTuple(const T& v) : head(v) { } + + constexpr static int count = 1; + T head; + typedef T Head; +}; + + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + + typedef typename IndexTupleExtractor::ValType ValType; + + static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } + +}; + + template + struct IndexTupleExtractor<0, T, O...> { + + typedef T ValType; + + static constexpr ValType& get_val(IndexTuple& val) { + return val.head; + } + static constexpr const ValType& get_val(const IndexTuple& val) { + return val.head; + } + template + static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + + + +template +constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; + + + + template struct tuple_coeff { template - static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { - return std::get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { + return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); } template - static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { if (i == Idx) { - update_value(std::get(t), value); + update_value(array_get(t), value); } else { tuple_coeff::set(i, t, value); } } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { - return ((i == Idx) & is_compile_time_constant >::type>::value) || + static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { + return ((i == Idx) & is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } template - static constexpr bool values_up_to_known_statically(const std::tuple& t) { - return is_compile_time_constant >::type>::value && + static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && tuple_coeff::values_up_to_known_statically(t); } template - static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { - return is_compile_time_constant >::type>::value && - is_compile_time_constant >::type>::value && - std::get(t) > std::get(t) && + static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && tuple_coeff::values_up_to_statically_known_to_increase(t); } }; @@ -116,62 +206,66 @@ 
struct tuple_coeff { template <> struct tuple_coeff<0> { template - static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { + static constexpr DenseIndex get(const DenseIndex i, const IndexTuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return std::get<0>(t) * (i == 0); + return array_get<0>(t) * (i == 0); } template - static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + static void set(const DenseIndex i, IndexTuple& t, const DenseIndex value) { eigen_assert (i == 0); - update_value(std::get<0>(t), value); + update_value(array_get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { - // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return is_compile_time_constant >::type>::value & (i == 0); + static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value & (i == 0); } template - static constexpr bool values_up_to_known_statically(const std::tuple&) { - return is_compile_time_constant >::type>::value; + static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; } template - static constexpr bool values_up_to_statically_known_to_increase(const std::tuple&) { + static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { return true; } }; } // namespace internal + template -struct IndexList : std::tuple { + struct IndexList : internal::IndexTuple { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1>::get(i, *this); + return internal::tuple_coeff >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { + return internal::tuple_coeff >::value-1>::get(i, *this); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff >::value-1>::set(i, *this, value); + return internal::tuple_coeff >::value-1>::set(i, *this, value); } - constexpr IndexList(const std::tuple& other) : std::tuple(other) { } - constexpr IndexList() : std::tuple() { } + constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } + constexpr IndexList() : internal::IndexTuple() { } constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); + return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); } constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1>::values_up_to_known_statically(*this); + return internal::tuple_coeff >::value-1>::values_up_to_known_statically(*this); } constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff >::value-1>::values_up_to_statically_known_to_increase(*this); } }; template constexpr IndexList make_index_list(FirstType val1, OtherTypes... 
other_vals) { - return std::make_tuple(val1, other_vals...); + return IndexList(val1, other_vals...); } @@ -186,172 +280,178 @@ template size_t array_prod(const Ind } template struct array_size > { - static const size_t value = std::tuple_size >::value; + static const size_t value = array_size >::value; }; template struct array_size > { - static const size_t value = std::tuple_size >::value; + static const size_t value = array_size >::value; }; -template constexpr DenseIndex array_get(IndexList& a) { - return std::get(a); +template constexpr DenseIndex array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); } -template constexpr DenseIndex array_get(const IndexList& a) { - return std::get(a); +template constexpr DenseIndex array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); } template -struct index_known_statically { - constexpr bool operator() (DenseIndex) const { +struct index_known_statically_impl { + static constexpr bool run(const DenseIndex) { return false; } }; template -struct index_known_statically > { - constexpr bool operator() (const DenseIndex i) const { +struct index_known_statically_impl > { + static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; template -struct index_known_statically > { - constexpr bool operator() (const DenseIndex i) const { +struct index_known_statically_impl > { + static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; + template -struct all_indices_known_statically { - constexpr bool operator() () const { +struct all_indices_known_statically_impl { + static constexpr bool run() { return false; } }; template -struct all_indices_known_statically > { - constexpr bool operator() () const { +struct all_indices_known_statically_impl > { + static constexpr bool run() { return IndexList().all_values_known_statically(); } }; template -struct all_indices_known_statically > { - constexpr bool operator() () const { +struct all_indices_known_statically_impl > { + static constexpr bool run() { return IndexList().all_values_known_statically(); } }; + template -struct indices_statically_known_to_increase { - constexpr bool operator() () const { +struct indices_statically_known_to_increase_impl { + static constexpr bool run() { return false; } }; template -struct indices_statically_known_to_increase > { - constexpr bool operator() () const { - return IndexList().values_statically_known_to_increase(); + struct indices_statically_known_to_increase_impl > { + static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); } }; template -struct indices_statically_known_to_increase > { - constexpr bool operator() () const { - return IndexList().values_statically_known_to_increase(); + struct indices_statically_known_to_increase_impl > { + static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); } }; + template -struct index_statically_eq { - constexpr bool operator() (DenseIndex, DenseIndex) const { +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template -struct index_statically_eq > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); + (IndexList().get(i) == value); } }; 
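// A usage sketch for the machinery above, assuming the type2index wrapper from
// the same header and that <unsupported/Eigen/CXX11/Tensor> is included:
// entries encoded in the type are statically known, runtime entries are not.
void index_list_demo() {
  Eigen::IndexList<Eigen::type2index<0>, Eigen::DenseIndex> dims;
  dims.set(1, 42);                                     // dynamic slot, set at runtime
  const bool known0 = dims.value_known_statically(0);  // true: fixed to 0 in the type
  const bool known1 = dims.value_known_statically(1);  // false: runtime-only value
  (void)known0; (void)known1;
}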
template -struct index_statically_eq > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); - } -}; - -template -struct index_statically_ne { - constexpr bool operator() (DenseIndex, DenseIndex) const { - return false; - } -}; - -template -struct index_statically_ne > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { - return IndexList().value_known_statically(i) & - (IndexList()[i] != value); - } -}; - -template -struct index_statically_ne > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { - return IndexList().value_known_statically(i) & - (IndexList()[i] != value); + (IndexList().get(i) == value); } }; template -struct index_statically_gt { - constexpr bool operator() (DenseIndex, DenseIndex) const { - return false; +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; } }; template -struct index_statically_gt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] > value); + (IndexList().get(i) != value); } }; template -struct index_statically_gt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] > value); + (IndexList().get(i) != value); } }; + template -struct index_statically_lt { - constexpr bool operator() (DenseIndex, DenseIndex) const { - return false; +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; } }; template -struct index_statically_lt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] < value); + (IndexList().get(i) > value); } }; template -struct index_statically_lt > { - constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & - (IndexList()[i] < value); + (IndexList().get(i) > value); + } +}; + + + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); } }; @@ -363,52 +463,51 @@ struct index_statically_lt > { namespace Eigen { namespace internal 
{ -// No C++11 support template -struct index_known_statically { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) { return false; } }; template -struct all_indices_known_statically { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { +struct all_indices_known_statically_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { return false; } }; template -struct indices_statically_known_to_increase { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { +struct indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { return false; } }; template -struct index_statically_eq { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { return false; } }; template -struct index_statically_ne { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { return false; } }; template -struct index_statically_gt { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { return false; } }; template -struct index_statically_lt { - EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { return false; } }; @@ -418,4 +517,46 @@ struct index_statically_lt { #endif + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { + return index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { + return index_statically_lt_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + + #endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 4303e3536..ad2a1e6ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -55,6 +55,18 @@ struct Initializer { } }; +template +struct Initializer { + typedef typename traits::Scalar InitList; + + 
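+  // Rank-0 specialization: the "initializer list" degenerates to a single
+  // scalar, written straight into coefficient 0 (the indices array is unused).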
static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*/* indices*/, + const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + + template void initialize_tensor(TensorEvaluator& tensor, const typename Initializer::NumDimensions>::InitList& vals) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index fd2441894..b58173e58 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -34,10 +34,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) { #ifdef __CUDA_ARCH__ - if (sizeof(T) == 8) { - return __clzll(val); - } - return __clz(val); + return (sizeof(T) == 8) ? __clzll(val) : __clz(val); #elif EIGEN_COMP_MSVC DWORD leading_zeros = 0; if (sizeof(T) == 8) { @@ -46,11 +43,11 @@ else { _BitScanReverse(&leading_zeros, val); } + return leading_zeros; #else - if (sizeof(T) == 8) { - return __builtin_clzl(static_cast(val)); - } - return __builtin_clz(static_cast(val)); + return (sizeof(T) == 8) ? + __builtin_clzl(static_cast(val)) : + __builtin_clz(static_cast(val)); #endif } @@ -61,13 +58,8 @@ namespace { template struct DividerTraits { -#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) typedef typename UnsignedTraits::type type; static const int N = sizeof(T) * 8; -#else - typedef uint32_t type; - static const int N = 32; -#endif }; template @@ -79,44 +71,42 @@ namespace { #endif } -#if defined(__CUDA_ARCH__) - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { - return __umul64hi(a, b); - } -#else template - EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(__CUDA_ARCH__) + return __umul64hi(a, b); +#elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast(v >> 64); #else - EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); - return (a * b) >> 32; + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); #endif } -#endif template struct DividerHelper { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); } }; -#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__) template struct DividerHelper<64, T> { - static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = (TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - TensorUInt128, static_val<0> >(1, 0) + TensorUInt128, static_val<1> >(1)); + return static_cast(result); +#endif } }; -#endif } -template +template struct TensorIntDivisor { public: EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorIntDivisor() { @@ -166,8 +156,9 @@ struct TensorIntDivisor { // Optimized version for signed 32 bit integers. // Derived from Hacker's Delight. +// Only works for divisors strictly greater than one template <> -class TensorIntDivisor { +class TensorIntDivisor { public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { magic = 0; @@ -226,8 +217,8 @@ private: }; -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 000000000..8ed71f838 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + + +/** use this macro in sfinae selection in templated functions + * + * template::value , int >::type = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +// SFINAE requires variadic templates +#ifndef __CUDACC__ +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + // SFINAE doesn't work for gcc <= 4.7 + #ifdef EIGEN_COMP_GNUC + #if EIGEN_GNUC_AT_LEAST(4,8) + #define EIGEN_HAS_SFINAE + #endif + #else + #define EIGEN_HAS_SFINAE + #endif +#endif +#endif + +#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ + typename internal::enable_if< ( __condition__ ) , int >::type = 0 + + +#if defined(EIGEN_HAS_CONSTEXPR) +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR +#endif + + +#endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 2cb2bc7a6..5c759af09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -49,9 +49,15 @@ template class TensorMap : public Tensor IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), Layout = PlainObjectType::Layout, - CoordAccess = true, + CoordAccess = true }; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) 
{ @@ -82,15 +88,19 @@ template class TensorMap : public Tensor } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } EIGEN_DEVICE_FUNC @@ -117,11 +127,18 @@ template class TensorMap : public Tensor } } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; @@ -141,7 +158,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[0]; + const Index index = i1 + i0 * m_dimensions[1]; return m_data[index]; } else { const Index index = i0 + i1 * m_dimensions[0]; @@ -152,7 +169,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); return m_data[index]; } else { const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); @@ -196,6 +213,13 @@ template class TensorMap : public Tensor } } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) @@ -221,7 +245,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[0]; + const Index index = i1 + i0 * m_dimensions[1]; return m_data[index]; } else { const Index index = i0 + i1 * m_dimensions[0]; @@ -232,7 +256,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); return m_data[index]; } else { const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 7dfa04760..f28a9699d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -32,14 +32,29 @@ template <> struct max_n_1<0> { }; +// Default packet types +template +struct PacketType { + typedef typename internal::packet_traits::type type; + static const int size = internal::unpacket_traits::size; +}; - -#if defined(EIGEN_HAS_CONSTEXPR) -#define EIGEN_CONSTEXPR constexpr -#else -#define EIGEN_CONSTEXPR +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template <> +struct PacketType { + typedef float4 type; + static const int size = 4; +}; +template <> +struct PacketType { + typedef double2 type; + static const int size = 2; +}; #endif + + // Tuple mimics std::pair but works on e.g. nvcc. template struct Tuple { public: @@ -83,7 +98,55 @@ bool operator!=(const Tuple& x, const Tuple& y) { return !(x == y); } -#undef EIGEN_CONSTEXPR + + +#ifdef EIGEN_HAS_SFINAE +namespace internal { + + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx, numeric_list) { + return { idx[Is]... 
+
+
+#ifdef EIGEN_HAS_SFINAE
+namespace internal {
+
+  template <typename IndexType, typename Index, Index... Is>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
+    return { idx[Is]... };
+  }
+  template <typename IndexType, typename Index>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
+    return array<Index, 0>();
+  }
+
+  /** Make an array (for index/dimensions) out of a custom index */
+  template <typename Index, std::size_t NumIndices, typename IndexType>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+    return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
+  }
+
+
+  template <typename B, typename D>
+  struct is_base_of
+  {
+
+    typedef char (&yes)[1];
+    typedef char (&no)[2];
+
+    template <typename BB, typename DD>
+    struct Host
+    {
+      operator BB*() const;
+      operator DD*();
+    };
+
+    template <typename T>
+    static yes check(D*, T);
+    static no check(B*, int);
+
+    static const bool value = sizeof(check(Host<B, D>(), int())) == sizeof(yes);
+  };
+
+}
+#endif
+
+

 } // namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index bdc86e0fa..d8c923d74 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -443,7 +443,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       return rslt;
     }
     else {
-      typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
       for (int i = 1; i < packetSize-1; ++i) {
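// --- Illustrative sketch, not part of the patch ---
// What the customIndices2Array/is_base_of machinery above enables when
// EIGEN_HAS_SFINAE is defined: tensor coefficients can be addressed with
// any custom container supporting operator[], e.g. a fixed-size Matrix of
// indices. This mirrors the cxx11_tensor_custom_index test added later in
// this patch; the tensor shape and indices below are made up.
//
// #include <unsupported/Eigen/CXX11/Tensor>
//
// int main() {
// #ifdef EIGEN_HAS_SFINAE
//   Eigen::Tensor<float, 4> t(2, 3, 5, 7);
//   t.setRandom();
//   Eigen::Matrix<unsigned int, 4, 1> idx;
//   idx << 1, 2, 4, 1;
//   // Same coefficient, addressed through a custom index container.
//   return t.coeff(idx) == t(1, 2, 4, 1) ? 0 : 1;
// #else
//   return 0;
// #endif
// }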
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 07a6e8d4c..91e32d200 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -98,6 +98,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_padding(op.padding())
   {
+    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
+    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
+    // of 1 element first and then pad.
+    EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
     // Compute dimensions
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 1d22843af..aaa877185 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -64,10 +64,10 @@ template <typename ReducedDims> struct DimInitializer {
   }
 };
-template <> struct DimInitializer<Sizes<1> > {
+template <> struct DimInitializer<Sizes<> > {
   template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
   static void run(const InputDims& input_dims, const array<bool, Rank>&,
-                  Sizes<1>*, array<Index, Rank>* reduced_dims) {
+                  Sizes<>*, array<Index, Rank>* reduced_dims) {
     const int NumInputDims = internal::array_size<InputDims>::value;
     for (int i = 0; i < NumInputDims; ++i) {
       (*reduced_dims)[i] = input_dims[i];
@@ -88,30 +88,30 @@ struct preserve_inner_most_dims {
 #if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
 template <typename ReducedDims, int NumTensorDims>
 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0);
-  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
   static const bool value = tmp1 & tmp2 & tmp3;
 };
 template <typename ReducedDims, int NumTensorDims>
 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value);
-  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
   static const bool value = tmp1 & tmp2 & tmp3;
 };
 template <typename ReducedDims, int NumTensorDims>
 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
   static const bool value = tmp1 & tmp2;
 };
 template <typename ReducedDims, int NumTensorDims>
 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
-  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
-  static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
   static const bool value = tmp1 & tmp2;
 };
 #endif
@@ -136,6 +136,12 @@ struct GenericDimReducer<0, Self, Op> {
     }
   }
 };
+template <typename Self, typename Op>
+struct GenericDimReducer<-1, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
+    reducer.reduce(self.m_impl.coeff(index), accum);
+  }
+};

 template <int DimIndex, typename Self, typename Op, bool vectorizable>
 struct InnerMostDimReducer {
@@ -192,6 +198,12 @@ struct InnerMostDimPreserver<0, Self, Op, true> {
     }
   }
 };
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<-1, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
+  }
+};

 // Default full reducer
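// --- Illustrative sketch, not part of the patch ---
// Two user-visible consequences of the rank-0 work above: the
// reshape-then-pad workaround suggested by the TensorPadding comment, and
// full reductions now yielding a rank-0 tensor (read via operator()())
// instead of a 1-d tensor of size 1. A sketch, assuming the unsupported
// Tensor module; values and variable names are made up.
//
// #include <unsupported/Eigen/CXX11/Tensor>
// #include <utility>
//
// int main() {
//   // "Padding a scalar": reshape the rank-0 tensor to one element first.
//   Eigen::Tensor<float, 0> s;
//   s.setConstant(7.0f);
//   Eigen::array<Eigen::DenseIndex, 1> one_dim;
//   one_dim[0] = 1;
//   Eigen::array<std::pair<Eigen::DenseIndex, Eigen::DenseIndex>, 1> pads;
//   pads[0] = std::make_pair(2, 2);   // two zeros on each side
//   Eigen::Tensor<float, 1> padded = s.reshape(one_dim).pad(pads); // [0,0,7,0,0]
//
//   // Reducing over every dimension now produces a rank-0 tensor.
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setConstant(1.0f);
//   Eigen::Tensor<float, 0> total = t.sum();
//   return (padded.dimension(0) == 5 && total() == 6.0f) ? 0 : 1;
// }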
template @@ -326,184 +338,9 @@ struct FullReducer { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple cuda thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread -// updated the content of the output address it will try again. -template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); +template +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); #endif -} - -template -__device__ inline void atomicReduce(T* output, T accum, SumReducer&) { -#if __CUDA_ARCH__ >= 300 - atomicAdd(output, accum); -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -template -__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output) { - const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - - if (first_index == 0) { - *output = reducer.initialize(); - } - - typename Self::CoeffReturnType accum = reducer.initialize(); - for (Index i = 0; i < NumPerThread; ++i) { - const Index index = first_index + i * BlockSize; - if (index >= num_coeffs) { - break; - } - typename Self::CoeffReturnType val = input.m_impl.coeff(index); - reducer.reduce(val, &accum); - } - - for (int offset = warpSize/2; offset > 0; offset /= 2) { - reducer.reduce(__shfl_down(accum, offset), &accum); - } - - if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(output, accum, reducer); - } -} - - -template -struct FullReducer { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same::value; - - template - static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(false && "Should only be called on floats"); - } - - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { - typedef typename Self::Index Index; - - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - const int block_size = 256; - const int num_per_thread = 128; - const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); - LAUNCH_CUDA_KERNEL((FullReductionKernel), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); - } -}; - -#endif - - -template -class BlockReducer { - public: - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::CoeffReturnType CoeffReturnType; - explicit BlockReducer(const Op& reducer) : op_(reducer) { - accum_ = op_.initialize(); - } - void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { - for (Index i = 0; i < num_values_to_reduce; ++i) { - op_.reduce(data[index + i], &accum_); - } - } - CoeffReturnType Finalize() { - return op_.finalize(accum_); - } - - private: - CoeffReturnType accum_; - Op op_; -}; - - -template -class BlockReducer { - public: - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::CoeffReturnType CoeffReturnType; - typedef typename Self::PacketReturnType PacketReturnType; - explicit BlockReducer(const Op& reducer) : op_(reducer) { - vaccum_ = op_.template initializePacket(); - accum_ = op_.initialize(); - } - void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { - const int packet_size = internal::unpacket_traits::size; - const typename Self::Index vectorized_size = (num_values_to_reduce / - packet_size) * packet_size; - for (typename Self::Index i = 0; i < vectorized_size; i += packet_size) { - op_.reducePacket(internal::ploadt( - &data[index + i]), &vaccum_); - } - - for (typename Self::Index i = vectorized_size; - i < num_values_to_reduce; ++i) { - op_.reduce(data[index + i], &accum_); - } - } - typename Self::CoeffReturnType Finalize() { - return op_.finalizeBoth(accum_, vaccum_); - } - - private: - typename Self::PacketReturnType vaccum_; - typename Self::CoeffReturnType accum_; - Op op_; -}; } // end namespace internal @@ -550,8 +387,8 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions InputDimensions; static const int NumInputDims = internal::array_size::value; static const int NumReducedDims = internal::array_size::value; - static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 
1 : NumInputDims - NumReducedDims; - typedef typename internal::conditional, DSizes >::type Dimensions; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename internal::conditional, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; typedef TensorEvaluator, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; @@ -565,7 +402,7 @@ struct TensorEvaluator, Device> static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; - static const bool RunningFullReduction = (NumInputDims==NumReducedDims); + static const bool RunningFullReduction = (NumOutputDims==0); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) @@ -589,47 +426,50 @@ struct TensorEvaluator, Device> internal::DimInitializer::run(input_dims, reduced, &m_dimensions, &m_reducedDims); // Precompute output strides. - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_outputStrides[NumOutputDims - 1] = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - } - - // Precompute input strides. - array input_strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i-1] * input_dims[i-1]; - } - } else { - input_strides[NumInputDims - 1] = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } - } - - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { - m_reducedStrides[reduceIndex] = input_strides[i]; - ++reduceIndex; + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } } else { - m_preservedStrides[outputIndex] = input_strides[i]; - ++outputIndex; + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Precompute input strides. 
+ if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } } } // Special case for full reductions - if (NumInputDims == NumReducedDims) { - eigen_assert(m_dimensions[0] == 1); + if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } } @@ -639,7 +479,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_const::type CoeffReturnType; typedef typename internal::remove_const::type PacketReturnType; - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. @@ -674,9 +514,9 @@ struct TensorEvaluator, Device> return *m_result; } Op reducer(m_reducer); - if (ReducingInnerMostDims) { + if (ReducingInnerMostDims || RunningFullReduction) { const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -697,7 +537,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < packetSize; ++i) { Op reducer(m_reducer); @@ -748,7 +588,7 @@ struct TensorEvaluator, Device> if (static_cast(Layout) == static_cast(ColMajor)) { return index * m_preservedStrides[0]; } else { - return index * m_preservedStrides[NumOutputDims - 1]; + return index * m_preservedStrides[NumPreservedStrides - 1]; } } // TBD: optimize the case where we preserve the innermost dimensions. @@ -774,10 +614,10 @@ struct TensorEvaluator, Device> index -= idx * m_outputStrides[i]; } if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[NumOutputDims - 1] == 1); + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); startInput += index; } else { - startInput += index * m_preservedStrides[NumOutputDims - 1]; + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; } } return startInput; @@ -789,7 +629,8 @@ struct TensorEvaluator, Device> array m_outputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. - array m_preservedStrides; + static const int NumPreservedStrides = max_n_1::size; + array m_preservedStrides; // Subset of strides of the input tensor for the reduced dimensions. 
// Indexed by reduced dimensions. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 000000000..af1b9432c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,140 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template +__device__ inline void atomicReduce(T* output, T accum, SumReducer&) { +#if __CUDA_ARCH__ >= 300 + atomicAdd(output, accum); +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +template +__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output) { + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + + if (first_index == 0) { + *output = reducer.initialize(); + } + + typename Self::CoeffReturnType accum = reducer.initialize(); + Index max_iter = numext::mini(num_coeffs - first_index, NumPerThread*BlockSize); + for (Index i = 0; i < max_iter; i+=BlockSize) { + const Index index = first_index + i; + eigen_assert(index < num_coeffs); + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reduce(__shfl_down(accum, offset), &accum); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } +} + + +template +struct FullReducer { + // 
Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same::value; + + template + EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(false && "Should only be called on floats"); + } + + EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + typedef typename Self::Index Index; + + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = std::ceil(static_cast(num_coeffs) / (block_size * num_per_thread)); + LAUNCH_CUDA_KERNEL((FullReductionKernel), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + } +}; + +#endif + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index e092c0e04..10328c61f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -66,7 +66,7 @@ class TensorReverseOp : public TensorBase, Device const Device& device) : m_impl(op.expression(), device), m_reverse(op.reverse()) { + // Reversing a scalar isn't supported yet. It would be a no-op anyway. + EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Compute strides m_dimensions = m_impl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 9e4cf039d..98631fc7f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -63,7 +63,7 @@ class TensorStorage // pure dynamic -template +template class TensorStorage, Options_> { public: @@ -71,7 +71,11 @@ class TensorStorage, Options_> typedef DSizes Dimensions; typedef TensorStorage, Options_> Self; - EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {} + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { + if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + } EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat(0)) {} EIGEN_DEVICE_FUNC TensorStorage(Index size, const array& dimensions) @@ -101,13 +105,17 @@ class TensorStorage, Options_> EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) { + eigen_assert(size >= 1); const Index currentSz = internal::array_prod(m_dimensions); if(size != currentSz) { internal::conditional_aligned_delete_auto(m_data, currentSz); if (size) m_data = internal::conditional_aligned_new_auto(size); - else + else if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + else m_data = 0; EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 8f1c02ea4..7a9568b36 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -44,7 +44,7 @@ class compute_tensor_flags }; -template +template struct traits > { typedef Scalar_ 
Scalar; @@ -107,13 +107,13 @@ struct traits > }; -template +template struct eval, Eigen::Dense> { typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; }; -template +template struct eval, Eigen::Dense> { typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type; @@ -161,13 +161,13 @@ template struct nested typedef typename ref_selector::type type; }; -template +template struct nested > { typedef const Tensor& type; }; -template +template struct nested > { typedef const Tensor& type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h new file mode 100644 index 000000000..f5cca0ad7 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -0,0 +1,233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H +#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H + +namespace Eigen { +namespace internal { + + +template +struct static_val { + static const uint64_t value = n; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + eigen_assert(v == n); + } +}; + + +template +struct TensorUInt128 +{ + HIGH high; + LOW low; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(int x) : high(0), low(x) { + eigen_assert(x >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(int64_t x) : high(0), low(x) { + eigen_assert(x >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(uint64_t x) : high(0), low(x) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { + return low; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { + return low; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { + return high; + } +}; + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static bool operator == (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + return (lhs.high == rhs.high) & (lhs.low == rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static bool operator != (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + return (lhs.high != rhs.high) | (lhs.low != rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static bool operator >= (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (lhs.high != rhs.high) { + return lhs.high > rhs.high; + } + return lhs.low >= rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static bool operator < (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (lhs.high != rhs.high) { + return lhs.high < rhs.high; + } + return lhs.low < rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static TensorUInt128 operator + (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + TensorUInt128 result(lhs.high + rhs.high, lhs.low + rhs.low); + if (result.low < rhs.low) { + result.high += 1; + } + return result; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +static TensorUInt128 operator - (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + 
TensorUInt128 result(lhs.high - rhs.high, lhs.low - rhs.low); + if (result.low > lhs.low) { + result.high -= 1; + } + return result; +} + + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +static TensorUInt128 operator * (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + // Split each 128-bit integer into 4 32-bit integers, and then do the + // multiplications by hand as follow: + // lhs a b c d + // rhs e f g h + // ----------- + // ah bh ch dh + // bg cg dg + // cf df + // de + // The result is stored in 2 64bit integers, high and low. + + const uint64_t LOW = 0x00000000FFFFFFFFLL; + const uint64_t HIGH = 0xFFFFFFFF00000000LL; + + uint64_t d = lhs.low & LOW; + uint64_t c = (lhs.low & HIGH) >> 32LL; + uint64_t b = lhs.high & LOW; + uint64_t a = (lhs.high & HIGH) >> 32LL; + + uint64_t h = rhs.low & LOW; + uint64_t g = (rhs.low & HIGH) >> 32LL; + uint64_t f = rhs.high & LOW; + uint64_t e = (rhs.high & HIGH) >> 32LL; + + // Compute the low 32 bits of low + uint64_t acc = d * h; + uint64_t low = acc & LOW; + // Compute the high 32 bits of low. Add a carry every time we wrap around + acc >>= 32LL; + uint64_t carry = 0; + uint64_t acc2 = acc + c * h; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * g; + if (acc < acc2) { + carry++; + } + low |= (acc << 32LL); + + // Carry forward the high bits of acc to initiate the computation of the + // low 32 bits of high + acc2 = (acc >> 32LL) | (carry << 32LL); + carry = 0; + + acc = acc2 + b * h; + if (acc < acc2) { + carry++; + } + acc2 = acc + c * g; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * f; + if (acc < acc2) { + carry++; + } + uint64_t high = acc & LOW; + + // Start to compute the high 32 bits of high. + acc2 = (acc >> 32LL) | (carry << 32LL); + + acc = acc2 + a * h; + acc2 = acc + b * g; + acc = acc2 + c * f; + acc2 = acc + d * e; + high |= (acc2 << 32LL); + + return TensorUInt128(high, low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +static TensorUInt128 operator / (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (rhs == TensorUInt128, static_val<1> >(1)) { + return TensorUInt128(lhs.high, lhs.low); + } else if (lhs < rhs) { + return TensorUInt128(0); + } else { + // calculate the biggest power of 2 times rhs that's less than or equal to lhs + TensorUInt128 power2(1); + TensorUInt128 d(rhs); + TensorUInt128 tmp(lhs - d); + while (lhs >= d) { + tmp = tmp - d; + d = d + d; + power2 = power2 + power2; + } + + tmp = TensorUInt128(lhs.high, lhs.low); + TensorUInt128 result(0); + while (power2 != TensorUInt128, static_val<0> >(0)) { + if (tmp >= d) { + tmp = tmp - d; + result = result + power2; + } + // Shift right + power2 = TensorUInt128(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); + d = TensorUInt128(d.high >> 1, (d.low >> 1) | (d.high << 63)); + } + + return result; + } +} + + +} // namespace internal +} // namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers index ff0d59b6e..31e880bdc 100644 --- a/unsupported/Eigen/IterativeSolvers +++ b/unsupported/Eigen/IterativeSolvers @@ -33,7 +33,7 @@ #include "../../Eigen/Jacobi" #include "../../Eigen/Householder" #include "src/IterativeSolvers/GMRES.h" -#include "src/IterativeSolvers/IncompleteCholesky.h" +#include "src/IterativeSolvers/DGMRES.h" //#include "src/IterativeSolvers/SSORPreconditioner.h" #include "src/IterativeSolvers/MINRES.h" diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h 
b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h old mode 100644 new mode 100755 index 8336c2644..e30ad5b6d --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -99,7 +99,11 @@ class AutoDiffScalar {} template - AutoDiffScalar(const AutoDiffScalar& other) + AutoDiffScalar(const AutoDiffScalar& other +#ifndef EIGEN_PARSED_BY_DOXYGEN + , typename internal::enable_if::value,void*>::type = 0 +#endif + ) : m_value(other.value()), m_derivatives(other.derivatives()) {} @@ -127,6 +131,14 @@ class AutoDiffScalar return *this; } + inline AutoDiffScalar& operator=(const Scalar& other) + { + m_value = other; + if(m_derivatives.size()>0) + m_derivatives.setZero(); + return *this; + } + // inline operator const Scalar& () const { return m_value; } // inline operator Scalar& () { return m_value; } @@ -626,9 +638,10 @@ EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, template struct NumTraits > : NumTraits< typename NumTraits::Real > { - typedef AutoDiffScalar::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime> > Real; + typedef AutoDiffScalar::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime, + DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real; typedef AutoDiffScalar NonInteger; - typedef AutoDiffScalar& Nested; + typedef AutoDiffScalar Nested; enum{ RequireInitialization = 1 }; diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt index fae1c5854..754953335 100644 --- a/unsupported/Eigen/src/CMakeLists.txt +++ b/unsupported/Eigen/src/CMakeLists.txt @@ -1,5 +1,6 @@ ADD_SUBDIRECTORY(AutoDiff) ADD_SUBDIRECTORY(BVH) +ADD_SUBDIRECTORY(Eigenvalues) ADD_SUBDIRECTORY(FFT) ADD_SUBDIRECTORY(IterativeSolvers) ADD_SUBDIRECTORY(LevenbergMarquardt) diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt new file mode 100644 index 000000000..1d4387c82 --- /dev/null +++ b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Eigenvalues_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Eigenvalues_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel + ) diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index 52eb65a2f..bae04fc30 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -40,7 +40,6 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType:: { eigen_assert(vec.size() == perm.size()); typedef typename IndexType::Scalar Index; - typedef typename VectorType::Scalar Scalar; bool flag; for (Index k = 0; k < ncut; k++) { @@ -84,6 +83,8 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType:: * x = solver.solve(b); * \endcode * + * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * * References : * [1] D. NUENTSA WAKAM and F. 
PACULL, Memory Efficient Hybrid * Algebraic Solvers for Linear Systems Arising from Compressible @@ -101,7 +102,7 @@ template< typename _MatrixType, typename _Preconditioner> class DGMRES : public IterativeSolverBase > { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -112,6 +113,7 @@ class DGMRES : public IterativeSolverBase > typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::Index Index; + typedef typename MatrixType::StorageIndex StorageIndex; typedef typename MatrixType::RealScalar RealScalar; typedef _Preconditioner Preconditioner; typedef Matrix DenseMatrix; @@ -134,8 +136,8 @@ class DGMRES : public IterativeSolverBase > * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - DGMRES(const MatrixType& A) : Base(A),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) - {} + template + explicit DGMRES(const EigenBase& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {} ~DGMRES() {} @@ -150,7 +152,7 @@ class DGMRES : public IterativeSolverBase > m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - dgmres(mp_matrix, b.col(j), xj, Base::m_preconditioner); + dgmres(matrix(), b.col(j), xj, Base::m_preconditioner); } m_info = failed ? NumericalIssue : m_error <= Base::m_tolerance ? Success @@ -202,7 +204,7 @@ class DGMRES : public IterativeSolverBase > template int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; // Compute data to use for deflation - int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const; + int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const; // Apply deflation to a vector template int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; @@ -218,7 +220,7 @@ class DGMRES : public IterativeSolverBase > mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles) mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */ mutable PartialPivLU m_luT; // LU factorization of m_T - mutable int m_neig; //Number of eigenvalues to extract at each restart + mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart mutable int m_r; // Current number of deflated eigenvalues, size of m_U mutable int m_maxNeig; // Maximum number of eigenvalues to deflate mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A @@ -338,7 +340,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con beta = std::abs(g(it+1)); m_error = beta/normRhs; - std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl; + // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl; it++; nbIts++; if (m_error < m_tolerance) @@ -416,7 +418,7 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr } template< typename _MatrixType, typename _Preconditioner> -int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const +int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const 
Preconditioner& precond, const Index& it, StorageIndex& neig) const { // First, find the Schur form of the Hessenberg matrix H typename internal::conditional::IsComplex, ComplexSchur, RealSchur >::type schurofH; @@ -426,7 +428,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); ComplexVector eig(it); - Matrixperm(it); + Matrixperm(it); eig = this->schurValues(schurofH); // Reorder the absolute values of Schur values diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 05e5862a5..fbe21fc7e 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -251,13 +251,15 @@ struct traits > * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * + * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> class GMRES : public IterativeSolverBase > { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -288,7 +290,8 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - GMRES(const MatrixType& A) : Base(A), m_restart(30) {} + template + explicit GMRES(const EigenBase& A) : Base(A.derived()), m_restart(30) {} ~GMRES() {} @@ -312,7 +315,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::gmres(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) + if(!internal::gmres(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index c393112a4..256990c1a 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -191,6 +191,8 @@ namespace Eigen { * By default the iterations start with x=0 as an initial guess of the solution. * One can control the start using the solveWithGuess() method. * + * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, int _UpLo, typename _Preconditioner> @@ -198,7 +200,7 @@ namespace Eigen { { typedef IterativeSolverBase Base; - using Base::mp_matrix; + using Base::matrix; using Base::m_error; using Base::m_iterations; using Base::m_info; @@ -227,7 +229,8 @@ namespace Eigen { * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - MINRES(const MatrixType& A) : Base(A) {} + template + explicit MINRES(const EigenBase& A) : Base(A.derived()) {} /** Destructor. 
*/ ~MINRES(){} @@ -236,21 +239,31 @@ namespace Eigen { template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename Base::MatrixWrapper MatrixWrapper; + typedef typename Base::ActualMatrixType ActualMatrixType; + enum { + TransposeInput = (!MatrixWrapper::MatrixFree) + && (UpLo==(Lower|Upper)) + && (!MatrixType::IsRowMajor) + && (!NumTraits::IsComplex) + }; + typedef typename internal::conditional, ActualMatrixType const&>::type RowMajorWrapper; + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY); typedef typename internal::conditional&, - SparseSelfAdjointView, UpLo> - >::type MatrixWrapperType; - + RowMajorWrapper, + typename MatrixWrapper::template ConstSelfAdjointViewReturnType::Type + >::type SelfAdjointWrapper; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - + RowMajorWrapper row_mat(matrix()); for(int j=0; j > Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) | EvalBeforeNestingBit | EvalBeforeAssigningBit, - CoeffReadCost = Dynamic + CoeffReadCost = HugeCost }; typedef SparseMatrix ReturnType; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index b37481cbe..14a8aef58 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -348,7 +348,7 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result) typedef typename NumTraits::Real RealScalar; typedef typename std::complex ComplexScalar; if (sizeof(RealScalar) > 14) { - result = arg.matrixFunction(StdStemFunctions::exp); + result = arg.matrixFunction(internal::stem_function_exp); return; } #endif diff --git a/unsupported/Eigen/src/Skyline/SkylineProduct.h b/unsupported/Eigen/src/Skyline/SkylineProduct.h index d218a7c25..d9eb814c1 100644 --- a/unsupported/Eigen/src/Skyline/SkylineProduct.h +++ b/unsupported/Eigen/src/Skyline/SkylineProduct.h @@ -49,7 +49,7 @@ struct internal::traits > { | EvalBeforeAssigningBit | EvalBeforeNestingBit, - CoeffReadCost = Dynamic + CoeffReadCost = HugeCost }; typedef typename internal::conditional class DynamicSparseMatrix : public SparseMatrixBase > { + typedef SparseMatrixBase Base; + using Base::convert_index; public: EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix) // FIXME: why are these operator already alvailable ??? 
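// --- Illustrative sketch, not part of the patch ---
// The DGMRES/GMRES/MINRES constructors above now accept any EigenBase
// expression (the first step toward the matrix-free usage their docs now
// mention). A minimal usage sketch with MINRES on a small self-adjoint
// sparse matrix; the matrix and right-hand side below are made up.
//
// #include <Eigen/Sparse>
// #include <unsupported/Eigen/IterativeSolvers>
//
// int main() {
//   typedef Eigen::SparseMatrix<double> SpMat;
//   SpMat A(3, 3);
//   A.insert(0, 0) = 4.0;   // a diagonal SPD matrix for simplicity
//   A.insert(1, 1) = 3.0;
//   A.insert(2, 2) = 2.0;
//   A.makeCompressed();
//
//   Eigen::VectorXd b = Eigen::VectorXd::Ones(3);
//   // Lower|Upper: use the full matrix, matching the RowMajorWrapper path
//   // in the hunk above.
//   Eigen::MINRES<SpMat, Eigen::Lower|Eigen::Upper> solver(A);
//   Eigen::VectorXd x = solver.solve(b);
//   return solver.info() == Eigen::Success ? 0 : 1;
// }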
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 653392e40..81a03f582 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -49,11 +49,11 @@ ei_add_test(EulerAngles) find_package(MPFR 2.3.0) find_package(GMP) -if(MPFR_FOUND) +if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() @@ -93,7 +93,6 @@ endif() ei_add_test(polynomialsolver) ei_add_test(polynomialutils) ei_add_test(splines) -ei_add_test(incomplete_cholesky) ei_add_test(gmres) ei_add_test(minres) ei_add_test(levenberg_marquardt) @@ -120,6 +119,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") + ei_add_test(cxx11_tensor_uint128 "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") @@ -145,6 +145,10 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_io "-std=c++0x") ei_add_test(cxx11_tensor_generator "-std=c++0x") ei_add_test(cxx11_tensor_custom_op "-std=c++0x") + ei_add_test(cxx11_tensor_custom_index "-std=c++0x") + ei_add_test(cxx11_tensor_sugar "-std=c++0x") + ei_add_test(cxx11_tensor_fft "-std=c++0x") + ei_add_test(cxx11_tensor_ifft "-std=c++0x") # These tests needs nvcc # ei_add_test(cxx11_tensor_device "-std=c++0x") diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index 087e7c542..1aa1b3d2d 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -129,6 +129,7 @@ template void forward_jacobian(const Func& f) // TODO also check actual derivatives! +template void test_autodiff_scalar() { Vector2f p = Vector2f::Random(); @@ -140,6 +141,7 @@ void test_autodiff_scalar() } // TODO also check actual derivatives! 
+template void test_autodiff_vector() { Vector2f p = Vector2f::Random(); @@ -153,6 +155,7 @@ void test_autodiff_vector() VERIFY_IS_APPROX(res.value(), foo(p)); } +template void test_autodiff_jacobian() { CALL_SUBTEST(( forward_jacobian(TestFunc1()) )); @@ -162,12 +165,56 @@ void test_autodiff_jacobian() CALL_SUBTEST(( forward_jacobian(TestFunc1(3,3)) )); } + +template +void test_autodiff_hessian() +{ + typedef AutoDiffScalar AD; + typedef Matrix VectorAD; + typedef AutoDiffScalar ADD; + typedef Matrix VectorADD; + VectorADD x(2); + double s1 = internal::random(), s2 = internal::random(), s3 = internal::random(), s4 = internal::random(); + x(0).value()=s1; + x(1).value()=s2; + + //set unit vectors for the derivative directions (partial derivatives of the input vector) + x(0).derivatives().resize(2); + x(0).derivatives().setZero(); + x(0).derivatives()(0)= 1; + x(1).derivatives().resize(2); + x(1).derivatives().setZero(); + x(1).derivatives()(1)=1; + + //repeat partial derivatives for the inner AutoDiffScalar + x(0).value().derivatives() = VectorXd::Unit(2,0); + x(1).value().derivatives() = VectorXd::Unit(2,1); + + //set the hessian matrix to zero + for(int idx=0; idx<2; idx++) { + x(0).derivatives()(idx).derivatives() = VectorXd::Zero(2); + x(1).derivatives()(idx).derivatives() = VectorXd::Zero(2); + } + + ADD y = sin(AD(s3)*x(0) + AD(s4)*x(1)); + + VERIFY_IS_APPROX(y.value().derivatives()(0), y.derivatives()(0).value()); + VERIFY_IS_APPROX(y.value().derivatives()(1), y.derivatives()(1).value()); + VERIFY_IS_APPROX(y.value().derivatives()(0), s3*std::cos(s1*s3+s2*s4)); + VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4)); + VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3)); + VERIFY_IS_APPROX(y.derivatives()(1).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4)); +} + + + void test_autodiff() { for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( test_autodiff_scalar() ); - CALL_SUBTEST_2( test_autodiff_vector() ); - CALL_SUBTEST_3( test_autodiff_jacobian() ); + CALL_SUBTEST_1( test_autodiff_scalar<1>() ); + CALL_SUBTEST_2( test_autodiff_vector<1>() ); + CALL_SUBTEST_3( test_autodiff_jacobian<1>() ); + CALL_SUBTEST_4( test_autodiff_hessian<1>() ); } } diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp index 6eeecb717..482dfa7de 100644 --- a/unsupported/test/cxx11_tensor_argmax.cpp +++ b/unsupported/test/cxx11_tensor_argmax.cpp @@ -61,14 +61,14 @@ static void test_argmax_tuple_reducer() Tensor, 4, DataLayout> index_tuples(2,3,5,7); index_tuples = tensor.index_tuples(); - Tensor, 1, DataLayout> reduced(1); + Tensor, 0, DataLayout> reduced; DimensionList dims; reduced = index_tuples.reduce( dims, internal::ArgMaxTupleReducer>()); - Tensor maxi = tensor.maximum(); + Tensor maxi = tensor.maximum(); - VERIFY_IS_EQUAL(maxi(0), reduced(0).second); + VERIFY_IS_EQUAL(maxi(), reduced(0).second); array reduce_dims; for (int d = 0; d < 3; ++d) reduce_dims[d] = d; @@ -93,14 +93,14 @@ static void test_argmin_tuple_reducer() Tensor, 4, DataLayout> index_tuples(2,3,5,7); index_tuples = tensor.index_tuples(); - Tensor, 1, DataLayout> reduced(1); + Tensor, 0, DataLayout> reduced; DimensionList dims; reduced = index_tuples.reduce( dims, internal::ArgMinTupleReducer>()); - Tensor mini = tensor.minimum(); + Tensor mini = tensor.minimum(); - VERIFY_IS_EQUAL(mini(0), reduced(0).second); + VERIFY_IS_EQUAL(mini(), reduced(0).second); array reduce_dims; for (int d = 0; d < 3; ++d) reduce_dims[d] 
= d; @@ -123,7 +123,7 @@ static void test_simple_argmax() tensor = (tensor + tensor.constant(0.5)).log(); tensor(0,0,0,0) = 10.0; - Tensor tensor_argmax(1); + Tensor tensor_argmax; tensor_argmax = tensor.argmax(); @@ -144,7 +144,7 @@ static void test_simple_argmin() tensor = (tensor + tensor.constant(0.5)).log(); tensor(0,0,0,0) = -10.0; - Tensor tensor_argmin(1); + Tensor tensor_argmin; tensor_argmin = tensor.argmin(); diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index d16aaf847..e5cf61fe1 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -29,8 +29,8 @@ static void test_1d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap> vec3(col_major, 6); - TensorMap> vec4(row_major, 6); + TensorMap > vec3(col_major, 6); + TensorMap > vec4(row_major, 6); vec3 = vec1; vec4 = vec2; @@ -92,8 +92,8 @@ static void test_2d() int row_major[6]; memset(col_major, 0, 6*sizeof(int)); memset(row_major, 0, 6*sizeof(int)); - TensorMap> mat3(row_major, 2, 3); - TensorMap> mat4(col_major, 2, 3); + TensorMap > mat3(row_major, 2, 3); + TensorMap > mat4(col_major, 2, 3); mat3 = mat1; mat4 = mat2; @@ -152,8 +152,8 @@ static void test_3d() int row_major[2*3*7]; memset(col_major, 0, 2*3*7*sizeof(int)); memset(row_major, 0, 2*3*7*sizeof(int)); - TensorMap> mat3(col_major, 2, 3, 7); - TensorMap> mat4(row_major, 2, 3, 7); + TensorMap > mat3(col_major, 2, 3, 7); + TensorMap > mat4(row_major, 2, 3, 7); mat3 = mat1; mat4 = mat2; diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 729e43327..3c6d0d2ff 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -24,12 +24,12 @@ static void test_simple_cast() cplextensor.setRandom(); chartensor = ftensor.cast(); - cplextensor = ftensor.cast>(); + cplextensor = ftensor.cast >(); for (int i = 0; i < 20; ++i) { for (int j = 0; j < 30; ++j) { VERIFY_IS_EQUAL(chartensor(i,j), static_cast(ftensor(i,j))); - VERIFY_IS_EQUAL(cplextensor(i,j), static_cast>(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast >(ftensor(i,j))); } } } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp index 5ff082a3a..49e1894ab 100644 --- a/unsupported/test/cxx11_tensor_cuda.cpp +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -507,6 +507,115 @@ static void test_cuda_convolution_3d() } } + +template +void test_cuda_lgamma(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.lgamma(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); + } + } +} + +template +void test_cuda_erf(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= 
in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erf(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); + } + } +} + +template +void test_cuda_erfc(const Scalar stddev) +{ + Tensor in(72,97); + in.setRandom(); + in *= in.constant(stddev); + Tensor out(72,97); + out.setZero(); + + std::size_t bytes = in.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + cudaMalloc((void**)(&d_in), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in(d_in, 72, 97); + Eigen::TensorMap > gpu_out(d_out, 72, 97); + + gpu_out.device(gpu_device) = gpu_in.erfc(); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); + } + } +} + void test_cxx11_tensor_cuda() { CALL_SUBTEST(test_cuda_elementwise_small()); @@ -522,4 +631,34 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST(test_cuda_convolution_2d()); CALL_SUBTEST(test_cuda_convolution_3d()); CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_lgamma(1.0f)); + CALL_SUBTEST(test_cuda_lgamma(100.0f)); + CALL_SUBTEST(test_cuda_lgamma(0.01f)); + CALL_SUBTEST(test_cuda_lgamma(0.001f)); + CALL_SUBTEST(test_cuda_erf(1.0f)); + CALL_SUBTEST(test_cuda_erf(100.0f)); + CALL_SUBTEST(test_cuda_erf(0.01f)); + CALL_SUBTEST(test_cuda_erf(0.001f)); + CALL_SUBTEST(test_cuda_erfc(1.0f)); + // CALL_SUBTEST(test_cuda_erfc(100.0f)); + CALL_SUBTEST(test_cuda_erfc(5.0f)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST(test_cuda_erfc(0.01f)); + CALL_SUBTEST(test_cuda_erfc(0.001f)); + CALL_SUBTEST(test_cuda_tanh(1.0)); + CALL_SUBTEST(test_cuda_tanh(100.0)); + CALL_SUBTEST(test_cuda_tanh(0.01)); + CALL_SUBTEST(test_cuda_tanh(0.001)); + CALL_SUBTEST(test_cuda_lgamma(1.0)); + CALL_SUBTEST(test_cuda_lgamma(100.0)); + CALL_SUBTEST(test_cuda_lgamma(0.01)); + CALL_SUBTEST(test_cuda_lgamma(0.001)); + CALL_SUBTEST(test_cuda_erf(1.0)); + CALL_SUBTEST(test_cuda_erf(100.0)); + CALL_SUBTEST(test_cuda_erf(0.01)); + CALL_SUBTEST(test_cuda_erf(0.001)); + CALL_SUBTEST(test_cuda_erfc(1.0)); + // CALL_SUBTEST(test_cuda_erfc(100.0)); + CALL_SUBTEST(test_cuda_erfc(5.0)); // CUDA erfc lacks precision for large inputs + CALL_SUBTEST(test_cuda_erfc(0.01)); + CALL_SUBTEST(test_cuda_erfc(0.001)); } diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp new file mode 100644 index 000000000..4528cc176 --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_index.cpp @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ 
template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" +#include +#include + +#include +#include + +using Eigen::Tensor; + + +template +static void test_map_as_index() +{ +#ifdef EIGEN_HAS_SFINAE + Tensor tensor(2, 3, 5, 7); + tensor.setRandom(); + + using NormalIndex = DSizes; + using CustomIndex = std::map; + CustomIndex coeffC; + coeffC[0] = 1; + coeffC[1] = 2; + coeffC[2] = 4; + coeffC[3] = 1; + NormalIndex coeff(1,2,4,1); + + VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff)); + VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff)); +#endif +} + + +template +static void test_matrix_as_index() +{ +#ifdef EIGEN_HAS_SFINAE + Tensor tensor(2, 3, 5, 7); + tensor.setRandom(); + + using NormalIndex = DSizes; + using CustomIndex = Matrix; + CustomIndex coeffC(1,2,4,1); + NormalIndex coeff(1,2,4,1); + + VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff)); + VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff)); +#endif +} + + +template +static void test_varlist_as_index() +{ +#ifdef EIGEN_HAS_SFINAE + Tensor tensor(2, 3, 5, 7); + tensor.setRandom(); + + DSizes coeff(1,2,4,1); + + VERIFY_IS_EQUAL(tensor.coeff({1,2,4,1}), tensor.coeff(coeff)); + VERIFY_IS_EQUAL(tensor.coeffRef({1,2,4,1}), tensor.coeffRef(coeff)); +#endif +} + + +template +static void test_sizes_as_index() +{ +#ifdef EIGEN_HAS_SFINAE + Tensor tensor(2, 3, 5, 7); + tensor.setRandom(); + + DSizes coeff(1,2,4,1); + Sizes<1,2,4,1> coeffC; + + VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff)); + VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff)); +#endif +} + + +void test_cxx11_tensor_custom_index() { + test_map_as_index(); + test_map_as_index(); + test_matrix_as_index(); + test_matrix_as_index(); + test_varlist_as_index(); + test_varlist_as_index(); + test_sizes_as_index(); + test_sizes_as_index(); +} diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 7e33c9580..8baa477cc 100644 --- a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -25,7 +25,9 @@ struct InsertZeros { template void eval(const Tensor& input, Output& output, const Device& device) const { - array strides{{2, 2}}; + array strides; + strides[0] = 2; + strides[1] = 2; output.stride(strides).device(device) = input; Eigen::DSizes offsets(1,1); @@ -70,7 +72,8 @@ struct BatchMatMul { Output& output, const Device& device) const { typedef Tensor::DimensionPair DimPair; - array dims({{DimPair(1, 0)}}); + array dims; + dims[0] = DimPair(1, 0); for (int i = 0; i < output.dimension(2); ++i) { output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims); } @@ -88,9 +91,10 @@ static void test_custom_binary_op() Tensor result = tensor1.customOp(tensor2, BatchMatMul()); for (int i = 0; i < 5; ++i) { typedef Tensor::DimensionPair DimPair; - array dims({{DimPair(1, 0)}}); + array dims; + dims[0] = DimPair(1, 0); Tensor reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims); - TensorRef> val = result.chip<2>(i); + TensorRef > val = result.chip<2>(i); for (int j = 0; j < 2; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(val(j, k), reference(j, k)); diff --git a/unsupported/test/cxx11_tensor_fft.cpp 
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
index 7e33c9580..8baa477cc 100644
--- a/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -25,7 +25,9 @@ struct InsertZeros {
   template <typename Output, typename Device>
   void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const
   {
-    array<DenseIndex, 2> strides{{2, 2}};
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
     output.stride(strides).device(device) = input;
 
     Eigen::DSizes<DenseIndex, 2> offsets(1,1);
@@ -70,7 +72,8 @@ struct BatchMatMul {
             Output& output, const Device& device) const
   {
     typedef Tensor<float, 1>::DimensionPair DimPair;
-    array<DimPair, 1> dims({{DimPair(1, 0)}});
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
     for (int i = 0; i < output.dimension(2); ++i) {
       output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims);
     }
@@ -88,9 +91,10 @@ static void test_custom_binary_op()
   Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul());
   for (int i = 0; i < 5; ++i) {
     typedef Tensor<float, 1>::DimensionPair DimPair;
-    array<DimPair, 1> dims({{DimPair(1, 0)}});
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
     Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims);
-    TensorRef<Tensor<float, 2>> val = result.chip<2>(i);
+    TensorRef<Tensor<float, 2> > val = result.chip<2>(i);
     for (int j = 0; j < 2; ++j) {
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(val(j, k), reference(j, k));
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
new file mode 100644
index 000000000..0f6e09106
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -0,0 +1,273 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_fft_2D_golden() {
+  Tensor<float, 2, DataLayout, long> input(2, 3);
+  input(0, 0) = 1;
+  input(0, 1) = 2;
+  input(0, 2) = 3;
+  input(1, 0) = 4;
+  input(1, 1) = 5;
+  input(1, 2) = 6;
+
+  array<int, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<float>, 2, DataLayout, long> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+
+  std::complex<float> output_golden[6]; // in ColMajor order
+  output_golden[0] = std::complex<float>(21, 0);
+  output_golden[1] = std::complex<float>(-9, 0);
+  output_golden[2] = std::complex<float>(-3, 1.73205);
+  output_golden[3] = std::complex<float>( 0, 0);
+  output_golden[4] = std::complex<float>(-3, -1.73205);
+  output_golden[5] = std::complex<float>(0 ,0);
+
+  std::complex<float> c_offset = std::complex<float>(1.0, 1.0);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_APPROX(output(0) + c_offset, output_golden[0] + c_offset);
+    VERIFY_IS_APPROX(output(1) + c_offset, output_golden[1] + c_offset);
+    VERIFY_IS_APPROX(output(2) + c_offset, output_golden[2] + c_offset);
+    VERIFY_IS_APPROX(output(3) + c_offset, output_golden[3] + c_offset);
+    VERIFY_IS_APPROX(output(4) + c_offset, output_golden[4] + c_offset);
+    VERIFY_IS_APPROX(output(5) + c_offset, output_golden[5] + c_offset);
+  }
+  else {
+    VERIFY_IS_APPROX(output(0)+ c_offset, output_golden[0]+ c_offset);
+    VERIFY_IS_APPROX(output(1)+ c_offset, output_golden[2]+ c_offset);
+    VERIFY_IS_APPROX(output(2)+ c_offset, output_golden[4]+ c_offset);
+    VERIFY_IS_APPROX(output(3)+ c_offset, output_golden[1]+ c_offset);
+    VERIFY_IS_APPROX(output(4)+ c_offset, output_golden[3]+ c_offset);
+    VERIFY_IS_APPROX(output(5)+ c_offset, output_golden[5]+ c_offset);
+  }
+}
+
+static void test_fft_complex_input_golden() {
+  Tensor<std::complex<float>, 1, ColMajor, long> input(5);
+  input(0) = std::complex<float>(1, 1);
+  input(1) = std::complex<float>(2, 2);
+  input(2) = std::complex<float>(3, 3);
+  input(3) = std::complex<float>(4, 4);
+  input(4) = std::complex<float>(5, 5);
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<Eigen::RealPart, Eigen::FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<Eigen::ImagPart, Eigen::FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor, long> reverse_output_imag_part = input.fft<Eigen::ImagPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+  forward_golden_result[0] = std::complex<float>(15.000000000000000,+15.000000000000000);
+  forward_golden_result[1] = std::complex<float>(-5.940954801177935, +0.940954801177934);
+  forward_golden_result[2] = std::complex<float>(-3.312299240582266, -1.687700759417735);
+  forward_golden_result[3] = std::complex<float>(-1.687700759417735, -3.312299240582266);
+  forward_golden_result[4] = std::complex<float>( 0.940954801177934, -5.940954801177935);
+
+  reverse_golden_result[0] = std::complex<float>( 3.000000000000000, + 3.000000000000000);
+  reverse_golden_result[1] = std::complex<float>( 0.188190960235587, - 1.188190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.337540151883547, - 0.662459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.662459848116453, - 0.337540151883547);
+  reverse_golden_result[4] = std::complex<float>(-1.188190960235587, + 0.188190960235587);
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i), forward_golden_result[i]);
+    VERIFY_IS_APPROX(forward_output_real_part(i), forward_golden_result[i].real());
+    VERIFY_IS_APPROX(forward_output_imag_part(i), forward_golden_result[i].imag());
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i), reverse_golden_result[i]);
+    VERIFY_IS_APPROX(reverse_output_real_part(i), reverse_golden_result[i].real());
+    VERIFY_IS_APPROX(reverse_output_imag_part(i), reverse_golden_result[i].imag());
+  }
+}
+
+static void test_fft_real_input_golden() {
+  Tensor<float, 1, ColMajor, long> input(5);
+  input(0) = 1.0;
+  input(1) = 2.0;
+  input(2) = 3.0;
+  input(3) = 4.0;
+  input(4) = 5.0;
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<float>, 1, ColMajor, long> forward_output_both_parts = input.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor, long> reverse_output_both_parts = input.fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor, long> forward_output_real_part = input.fft<Eigen::RealPart, Eigen::FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor, long> reverse_output_real_part = input.fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor, long> forward_output_imag_part = input.fft<Eigen::ImagPart, Eigen::FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor, long> reverse_output_imag_part = input.fft<Eigen::ImagPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+
+  forward_golden_result[0] = std::complex<float>(  15, 0);
+  forward_golden_result[1] = std::complex<float>(-2.5, +3.44095480117793);
+  forward_golden_result[2] = std::complex<float>(-2.5, +0.81229924058227);
+  forward_golden_result[3] = std::complex<float>(-2.5, -0.81229924058227);
+  forward_golden_result[4] = std::complex<float>(-2.5, -3.44095480117793);
+
+  reverse_golden_result[0] = std::complex<float>( 3.0, 0);
+  reverse_golden_result[1] = std::complex<float>(-0.5, -0.688190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.5, -0.162459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.5, +0.162459848116453);
+  reverse_golden_result[4] = std::complex<float>(-0.5, +0.688190960235587);
+
+  std::complex<float> c_offset(1.0, 1.0);
+  float r_offset = 1.0;
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i) + c_offset, forward_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(forward_output_real_part(i) + r_offset, forward_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(forward_output_imag_part(i) + r_offset, forward_golden_result[i].imag() + r_offset);
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i) + c_offset, reverse_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(reverse_output_real_part(i) + r_offset, reverse_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(reverse_output_imag_part(i) + r_offset, reverse_golden_result[i].imag() + r_offset);
+  }
+}
+
+
+template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank>
+static void test_fft_real_input_energy() {
+
+  Eigen::DSizes<long, TensorRank> dimensions;
+  int total_size = 1;
+  for (int i = 0; i < TensorRank; ++i) {
+    dimensions[i] = rand() % 20 + 1;
+    total_size *= dimensions[i];
+  }
+  const DSizes<long, TensorRank> arr = dimensions;
+
+  typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar;
+
+  Tensor<InputScalar, TensorRank, DataLayout, long> input;
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, TensorRank> fft;
+  for (int i = 0; i < TensorRank; ++i) {
+    fft[i] = i;
+  }
+
+  typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar;
+  Tensor<OutputScalar, TensorRank, DataLayout, long> output;
+  output = input.template fft<FFTResultType, FFTDirection>(fft);
+
+  for (int i = 0; i < TensorRank; ++i) {
+    VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
+  }
+
+  float energy_original = 0.0;
+  float energy_after_fft = 0.0;
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_original += pow(std::abs(input(i)), 2);
+  }
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_after_fft += pow(std::abs(output(i)), 2);
+  }
+
+  if(FFTDirection == FFT_FORWARD) {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft / total_size);
+  }
+  else {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft * total_size);
+  }
+}
+
+void test_cxx11_tensor_fft() {
+    test_fft_complex_input_golden();
+    test_fft_real_input_golden();
+
+    test_fft_2D_golden<ColMajor>();
+    test_fft_2D_golden<RowMajor>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, false, Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, false, Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, false, Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, float,  false, Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>();
+}
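The energy check in test_fft_real_input_energy is Parseval's theorem for the unnormalized DFT:

    \sum_{n=0}^{N-1} |x_n|^2 \;=\; \frac{1}{N} \sum_{k=0}^{N-1} |X_k|^2

The forward transform leaves the spectrum unscaled, so its energy grows by a factor of N = total_size, while the reverse transform divides by N and shrinks the energy by the same factor; that is exactly why the two branches compare against energy_after_fft / total_size and energy_after_fft * total_size respectively.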
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 5252e4d72..1c33fefb3 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -15,6 +15,33 @@
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
+static void test_0d()
+{
+  TensorFixedSize<float, Sizes<> > scalar1;
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+
+  scalar1() = 7.0;
+  scalar2() = 13.0;
+
+  // Test against shallow copy.
+  TensorFixedSize<float, Sizes<> > copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+  copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+
+  TensorFixedSize<float, Sizes<> > scalar3 = scalar1.sqrt();
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar4 = scalar2.sqrt();
+  VERIFY_IS_EQUAL(scalar3.rank(), 0);
+  VERIFY_IS_APPROX(scalar3(), sqrtf(7.0));
+  VERIFY_IS_APPROX(scalar4(), sqrtf(13.0));
+
+  scalar3 = scalar1 + scalar2;
+  VERIFY_IS_APPROX(scalar3(), 7.0f + 13.0f);
+}
+
 static void test_1d()
 {
   TensorFixedSize<float, Sizes<6> > vec1;
@@ -223,6 +250,7 @@ static void test_array()
 
 void test_cxx11_tensor_fixed_size()
 {
+  CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_tensor_map());
   CALL_SUBTEST(test_2d());
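The test_0d additions make the rank-0 corner case explicit: a rank-0 tensor is a scalar with size() == 1 that is read and written through an empty call operator, and the shallow-copy checks guard against a copy constructor or assignment that would alias data(). A minimal sketch of the semantics being validated:

    Eigen::TensorFixedSize<float, Eigen::Sizes<> > s;  // rank-0: holds a single value
    s() = 42.0f;                                       // written with an empty index list
    float v = s();                                     // v == 42.0f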
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
new file mode 100644
index 000000000..5fd88fa6c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <complex>
+#include <cmath>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_1D_fft_ifft_invariant(int sequence_length) {
+  Tensor<double, 1, DataLayout> tensor(sequence_length);
+  tensor.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), sequence_length);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), sequence_length);
+
+  for (int i = 0; i < sequence_length; ++i) {
+    VERIFY_IS_APPROX(static_cast<float>(tensor(i)), static_cast<float>(std::real(tensor_after_fft_ifft(i))));
+  }
+}
+
+template <int DataLayout>
+static void test_2D_fft_ifft_invariant(int dim0, int dim1) {
+  Tensor<double, 2, DataLayout> tensor(dim0, dim1);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      //std::cout << "[" << i << "][" << j << "]" << "  Original data: " << tensor(i,j) << " Transformed data:" << tensor_after_fft_ifft(i,j) << std::endl;
+      VERIFY_IS_APPROX(static_cast<float>(tensor(i,j)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j))));
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_3D_fft_ifft_invariant(int dim0, int dim1, int dim2) {
+  Tensor<double, 3, DataLayout> tensor(dim0, dim1, dim2);
+  tensor.setRandom();
+
+  array<int, 3> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+  fft[2] = 2;
+
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j,k))));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) {
+  Tensor<double, 4, DataLayout> tensor(dim0, dim1, dim2, dim3);
+  tensor.setRandom();
+
+  array<int, 2> fft;
+  fft[0] = 2;
+  fft[1] = 0;
+
+  Tensor<std::complex<double>, 4, DataLayout> tensor_after_fft;
+  Tensor<double, 4, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(3), dim3);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(3), dim3);
+
+  for (int i = 0; i < dim0; ++i) {
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        for (int l = 0; l < dim3; ++l) {
+          VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k,l)), static_cast<float>(tensor_after_fft_ifft(i,j,k,l)));
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_ifft() {
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(1024*1024));
+
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(4,4));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(8,16));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(16,32));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(1024,1024));
+
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(4,4,4));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(8,16,32));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(16,4,8));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(256,256,256));
+
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(4,4,4,4));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(8,16,32,64));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(16,4,8,12));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(64,64,64,64));
+}
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index ca9d18254..4ce8dea20 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -58,11 +58,11 @@ static void test_type2index_list()
   typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4;
 
 #if 0
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
 #endif
 
   EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -142,7 +142,7 @@ static void test_type2index_list()
   }
 
   const Dims4 reduction_axis4;
-  Tensor<float, 1> result4 = tensor.sum(reduction_axis4);
+  Tensor<float, 0> result4 = tensor.sum(reduction_axis4);
   float expected = 0.0f;
   for (int m = 0; m < 11; ++m) {
     for (int l = 0; l < 7; ++l) {
@@ -155,7 +155,7 @@ static void test_type2index_list()
       }
     }
   }
-  VERIFY_IS_APPROX(result4(0), expected);
+  VERIFY_IS_APPROX(result4(), expected);
 }
 
 
@@ -216,29 +216,29 @@ static void test_mixed_index_list()
   reduction_indices.set(3, 3);
   EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
 #if 0
-  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
 #endif
 
   typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList;
   ReductionList reduction_list;
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
 #if 0
-  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
 #endif
 
-  Tensor<float, 1> result1 = tensor.sum(reduction_axis);
-  Tensor<float, 1> result2 = tensor.sum(reduction_indices);
-  Tensor<float, 1> result3 = tensor.sum(reduction_list);
+  Tensor<float, 0> result1 = tensor.sum(reduction_axis);
+  Tensor<float, 0> result2 = tensor.sum(reduction_indices);
+  Tensor<float, 0> result3 = tensor.sum(reduction_list);
 
   float expected = 0.0f;
   for (int i = 0; i < 2; ++i) {
@@ -250,9 +250,9 @@ static void test_mixed_index_list()
       }
     }
   }
-  VERIFY_IS_APPROX(result1(0), expected);
-  VERIFY_IS_APPROX(result2(0), expected);
-  VERIFY_IS_APPROX(result3(0), expected);
+  VERIFY_IS_APPROX(result1(), expected);
+  VERIFY_IS_APPROX(result2(), expected);
+  VERIFY_IS_APPROX(result3(), expected);
 }
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 343b37dbd..48aa6d368 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -14,8 +14,29 @@
 void test_signed_32bit()
 {
+  // Divide by one
+  const Eigen::internal::TensorIntDivisor<int32_t, false> div_by_one(1);
+
+  for (int32_t j = 0; j < 25000; ++j) {
+    const int32_t fast_div = j / div_by_one;
+    const int32_t slow_div = j / 1;
+    VERIFY_IS_EQUAL(fast_div, slow_div);
+  }
+
+  // Standard divide by 2 or more
   for (int32_t i = 2; i < 25000; ++i) {
-    const Eigen::internal::TensorIntDivisor<int32_t> div(i);
+    const Eigen::internal::TensorIntDivisor<int32_t, false> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+
+  // Optimized divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, true> div(i);
 
     for (int32_t j = 0; j < 25000; ++j) {
       const int32_t fast_div = j / div;
@@ -42,7 +63,7 @@ void test_unsigned_32bit()
 
 void test_signed_64bit()
 {
-  for (int64_t i = 2; i < 25000; ++i) {
+  for (int64_t i = 1; i < 25000; ++i) {
    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
 
     for (int64_t j = 0; j < 25000; ++j) {
@@ -56,7 +77,7 @@ void test_signed_64bit()
 
 void test_unsigned_64bit()
 {
-  for (uint64_t i = 2; i < 25000; ++i) {
+  for (uint64_t i = 1; i < 25000; ++i) {
     const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
 
     for (uint64_t j = 0; j < 25000; ++j) {
@@ -95,8 +116,7 @@ void test_powers_64bit() {
   if (start_num < 0)
     start_num = 0;
   for (int64_t num = start_num; num < end_num; num++) {
-    Eigen::internal::TensorIntDivisor<int64_t, true> divider =
-      Eigen::internal::TensorIntDivisor<int64_t, true>(div);
+    Eigen::internal::TensorIntDivisor<int64_t, true> divider(div);
     int64_t result = num/div;
     int64_t result_op = divider.divide(num);
     VERIFY_IS_EQUAL(result_op, result);
@@ -109,8 +129,7 @@ void test_specific() {
   // A particular combination that was previously failing
   int64_t div = 209715200;
   int64_t num = 3238002688;
-  Eigen::internal::TensorIntDivisor<int64_t, true> divider =
-    Eigen::internal::TensorIntDivisor<int64_t, true>(div);
+  Eigen::internal::TensorIntDivisor<int64_t, true> divider(div);
   int64_t result = num/div;
   int64_t result_op = divider.divide(num);
   VERIFY_IS_EQUAL(result, result_op);
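For context on the TensorIntDivisor tests above: the class precomputes a reciprocal so that repeated divisions by the same run-time divisor are compiled to a widening multiply plus a shift instead of a hardware divide. The sketch below shows the general idea only; the struct name is hypothetical and this is not Eigen's implementation (it uses the GCC/Clang __uint128_t extension, which the uint128 test further down also relies on):

    #include <cstdint>

    // Illustrative magic-number division: precompute M and s so that
    // x / d == (x * M) >> (32 + s) for all 32-bit x and any divisor d >= 1.
    struct FastDivisor {
      uint64_t magic;
      int shift;
      explicit FastDivisor(uint32_t d) : shift(0) {
        while ((uint64_t(1) << shift) < d) ++shift;               // shift = ceil(log2(d))
        magic = ((__uint128_t(1) << (32 + shift)) + d - 1) / d;   // ceil(2^(32+s) / d)
      }
      uint32_t divide(uint32_t x) const {
        return uint32_t((__uint128_t(x) * magic) >> (32 + shift));
      }
    };
    // divide(x) == x / d for every 32-bit x, at the cost of one multiply-high
    // and one shift per division instead of a slow hardware divide.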
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 9cf2eb150..a8a095e38 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -14,6 +14,24 @@
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<Tensor<int, 0>> scalar3(scalar1.data());
+  TensorMap<Tensor<int, 0, RowMajor>> scalar4(scalar2.data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
 static void test_1d()
 {
   Tensor<int, 1> vec1(6);
@@ -139,9 +157,117 @@ static void test_3d()
 }
 
 
+static void test_from_tensor()
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3>> mat3(mat1);
+  TensorMap<Tensor<int, 3, RowMajor>> mat4(mat2);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  TensorFixedSize<int, Sizes<2,3,7>> mat5;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat5(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<TensorFixedSize<int, Sizes<2,3,7>>> mat6(mat5);
+
+  VERIFY_IS_EQUAL(mat6.rank(), 3);
+  VERIFY_IS_EQUAL(mat6.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat6.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat6.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat6.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat6(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static int f(const TensorMap<Tensor<int, 3> >& tensor) {
+  //  Size<0> empty;
+  EIGEN_STATIC_ASSERT((internal::array_size<Sizes<> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  Tensor<int, 0> result = tensor.sum();
+  return result();
+}
+
+static void test_casting()
+{
+  Tensor<int, 3> tensor(2,3,7);
+
+  int val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        tensor(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3>> map(tensor);
+  int sum1 = f(map);
+  int sum2 = f(tensor);
+
+  VERIFY_IS_EQUAL(sum1, sum2);
+  VERIFY_IS_EQUAL(sum1, 861);
+}
+
 void test_cxx11_tensor_map()
 {
+  CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
+
+  CALL_SUBTEST(test_from_tensor());
+  CALL_SUBTEST(test_casting());
 }
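The test_casting test works because a TensorMap is a non-owning view: constructing one from a Tensor shares the tensor's storage instead of copying it, which is why f(map) and f(tensor) are interchangeable (the expected sum 861 is simply 0 + 1 + ... + 41). A minimal sketch of the view semantics:

    #include <Eigen/CXX11/Tensor>

    // Sketch: TensorMap wraps existing storage without copying it.
    static void tensor_map_sketch() {
      float buf[6] = {0, 1, 2, 3, 4, 5};
      Eigen::TensorMap<Eigen::Tensor<float, 2> > view(buf, 2, 3);  // no copy
      view(1, 2) = 42.0f;  // writes through to buf[5]
    }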
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index b2c85a879..0ec316991 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -13,6 +13,45 @@
 
 using Eigen::Tensor;
 
+template <int DataLayout>
+static void test_trivial_reductions() {
+  {
+    Tensor<float, 0, DataLayout> tensor;
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result(), tensor());
+  }
+
+  {
+    Tensor<float, 1, DataLayout> tensor(7);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 7);
+    for (int i = 0; i < 7; ++i) {
+      VERIFY_IS_EQUAL(result(i), tensor(i));
+    }
+  }
+
+  {
+    Tensor<float, 2, DataLayout> tensor(2, 3);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 2);
+    VERIFY_IS_EQUAL(result.dimension(1), 3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        VERIFY_IS_EQUAL(result(i, j), tensor(i, j));
+      }
+    }
+  }
+}
+
 template <int DataLayout>
 static void test_simple_reductions() {
   Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
@@ -37,18 +76,18 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 1, DataLayout> sum1 = tensor.sum();
-    VERIFY_IS_EQUAL(sum1.dimension(0), 1);
+    Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+    VERIFY_IS_EQUAL(sum1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
     reduction_axis4[0] = 0;
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
 
-    Tensor<float, 1, DataLayout> sum2 = tensor.sum(reduction_axis4);
-    VERIFY_IS_EQUAL(sum2.dimension(0), 1);
+    Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    VERIFY_IS_EQUAL(sum2.rank(), 0);
 
-    VERIFY_IS_APPROX(sum1(0), sum2(0));
+    VERIFY_IS_APPROX(sum1(), sum2());
   }
 
   reduction_axis2[0] = 0;
@@ -69,18 +108,18 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 1, DataLayout> prod1 = tensor.prod();
-    VERIFY_IS_EQUAL(prod1.dimension(0), 1);
+    Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+    VERIFY_IS_EQUAL(prod1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
     reduction_axis4[0] = 0;
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
 
-    Tensor<float, 1, DataLayout> prod2 = tensor.prod(reduction_axis4);
-    VERIFY_IS_EQUAL(prod2.dimension(0), 1);
+    Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    VERIFY_IS_EQUAL(prod2.rank(), 0);
 
-    VERIFY_IS_APPROX(prod1(0), prod2(0));
+    VERIFY_IS_APPROX(prod1(), prod2());
   }
 
   reduction_axis2[0] = 0;
@@ -101,18 +140,18 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 1, DataLayout> max1 = tensor.maximum();
-    VERIFY_IS_EQUAL(max1.dimension(0), 1);
+    Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+    VERIFY_IS_EQUAL(max1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
     reduction_axis4[0] = 0;
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
 
-    Tensor<float, 1, DataLayout> max2 = tensor.maximum(reduction_axis4);
-    VERIFY_IS_EQUAL(max2.dimension(0), 1);
+    Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    VERIFY_IS_EQUAL(max2.rank(), 0);
 
-    VERIFY_IS_APPROX(max1(0), max2(0));
+    VERIFY_IS_APPROX(max1(), max2());
   }
 
   reduction_axis2[0] = 0;
@@ -133,18 +172,18 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 1, DataLayout> min1 = tensor.minimum();
-    VERIFY_IS_EQUAL(min1.dimension(0), 1);
+    Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+    VERIFY_IS_EQUAL(min1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
     reduction_axis4[0] = 0;
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
 
-    Tensor<float, 1, DataLayout> min2 = tensor.minimum(reduction_axis4);
-    VERIFY_IS_EQUAL(min2.dimension(0), 1);
+    Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    VERIFY_IS_EQUAL(min2.rank(), 0);
 
-    VERIFY_IS_APPROX(min1(0), min2(0));
+    VERIFY_IS_APPROX(min1(), min2());
  }
 
  reduction_axis2[0] = 0;
@@ -167,18 +206,35 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 1, DataLayout> mean1 = tensor.mean();
-    VERIFY_IS_EQUAL(mean1.dimension(0), 1);
+    Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+    VERIFY_IS_EQUAL(mean1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
     reduction_axis4[0] = 0;
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
 
-    Tensor<float, 1, DataLayout> mean2 = tensor.mean(reduction_axis4);
-    VERIFY_IS_EQUAL(mean2.dimension(0), 1);
+    Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    VERIFY_IS_EQUAL(mean2.rank(), 0);
 
-    VERIFY_IS_APPROX(mean1(0), mean2(0));
+    VERIFY_IS_APPROX(mean1(), mean2());
+  }
+
+  {
+    Tensor<int, 1, DataLayout> ints(10);
+    std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
+
+    TensorFixedSize<bool, Sizes<> > all;
+    all = ints.all();
+    VERIFY(!all());
+    all = (ints >= ints.constant(0)).all();
+    VERIFY(all());
+
+    TensorFixedSize<bool, Sizes<> > any;
+    any = (ints > ints.constant(10)).any();
+    VERIFY(!any());
+    any = (ints < ints.constant(1)).any();
+    VERIFY(any());
   }
 }
 
@@ -190,8 +246,8 @@ static void test_full_reductions() {
   reduction_axis[0] = 0;
   reduction_axis[1] = 1;
 
-  Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
-  VERIFY_IS_EQUAL(result.dimension(0), 1);
+  Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.rank(), 0);
 
   float sum = 0.0f;
   for (int i = 0; i < 2; ++i) {
@@ -202,7 +258,7 @@ static void test_full_reductions() {
   VERIFY_IS_APPROX(result(0), sum);
 
   result = tensor.square().sum(reduction_axis).sqrt();
-  VERIFY_IS_EQUAL(result.dimension(0), 1);
+  VERIFY_IS_EQUAL(result.rank(), 0);
 
   sum = 0.0f;
   for (int i = 0; i < 2; ++i) {
@@ -210,7 +266,7 @@ static void test_full_reductions() {
       sum += tensor(i, j) * tensor(i, j);
     }
   }
-  VERIFY_IS_APPROX(result(0), sqrtf(sum));
+  VERIFY_IS_APPROX(result(), sqrtf(sum));
 }
 
 struct UserReducer {
@@ -401,6 +457,8 @@ static void test_reduce_middle_dims() {
 }
 
 void test_cxx11_tensor_reduction() {
+  CALL_SUBTEST(test_trivial_reductions<ColMajor>());
+  CALL_SUBTEST(test_trivial_reductions<RowMajor>());
   CALL_SUBTEST(test_simple_reductions<ColMajor>());
   CALL_SUBTEST(test_simple_reductions<RowMajor>());
   CALL_SUBTEST(test_full_reductions<ColMajor>());
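The recurring pattern in these hunks is a single convention change: a full reduction now yields a rank-0 tensor instead of a 1-element rank-1 tensor, so the results are checked with rank() and read with an empty call operator. In sketch form:

    #include <Eigen/CXX11/Tensor>

    // Sketch of the convention validated above.
    static void full_reduction_sketch() {
      Eigen::Tensor<float, 2> t(2, 3);
      t.setRandom();
      Eigen::Tensor<float, 0> s = t.sum();  // rank() == 0, size() == 1
      float total = s();                    // read the scalar with an empty index list
      (void)total;
    }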
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cpp b/unsupported/test/cxx11_tensor_reduction_cuda.cpp
index f426ebbc1..9e06eb126 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cpp
@@ -28,7 +28,7 @@ static void test_full_reductions() {
   Tensor<float, 2> in(num_rows, num_cols);
   in.setRandom();
 
-  Tensor<float, 1> full_redux(1);
+  Tensor<float, 0> full_redux;
   full_redux = in.sum();
 
   std::size_t in_bytes = in.size() * sizeof(float);
@@ -38,16 +38,16 @@ static void test_full_reductions() {
   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
 
   TensorMap<Tensor<float, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<float, 1> > out_gpu(gpu_out_ptr, 1);
+  TensorMap<Tensor<float, 0> > out_gpu(gpu_out_ptr);
 
   out_gpu.device(gpu_device) = in_gpu.sum();
 
-  Tensor<float, 1> full_redux_gpu(1);
+  Tensor<float, 0> full_redux_gpu;
   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
   gpu_device.synchronize();
 
   // Check that the CPU and GPU reductions return the same result.
-  VERIFY_IS_APPROX(full_redux(0), full_redux_gpu(0));
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
 }
 
 void test_cxx11_tensor_reduction_cuda() {
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index f96c21fa3..b35b8d29e 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -114,10 +114,18 @@ static void test_expr_reverse(bool LValue)
 
   Tensor<float, 4, DataLayout> result(2,3,5,7);
 
-  array<ptrdiff_t, 4> src_slice_dim{{2,3,1,7}};
-  array<ptrdiff_t, 4> src_slice_start{{0,0,0,0}};
-  array<ptrdiff_t, 4> dst_slice_dim{{2,3,1,7}};
-  array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}};
+  array<ptrdiff_t, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<ptrdiff_t, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim;
+  array<ptrdiff_t, 4> dst_slice_start = src_slice_start;
 
   for (int i = 0; i < 5; ++i) {
     if (LValue) {
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 8cd2ab7fd..47d4d8636 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -14,6 +14,35 @@
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
+static void test_0d()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+  Tensor<int, 0> scalar3;
+  Tensor<int, 0, RowMajor> scalar4;
+
+  scalar3.resize();
+  scalar4.resize();
+
+  scalar1() = 7;
+  scalar2() = 13;
+  scalar3.setValues(17);
+  scalar4.setZero();
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+  VERIFY_IS_EQUAL(scalar3(), 17);
+  VERIFY_IS_EQUAL(scalar4(), 0);
+
+  Tensor<int, 0> scalar5(scalar1);
+
+  VERIFY_IS_EQUAL(scalar5(), 7);
+  VERIFY_IS_EQUAL(scalar5.data()[0], 7);
+}
+
 static void test_1d()
 {
   Tensor<int, 1> vec1(6);
@@ -287,13 +316,10 @@ static void test_resize()
 
 void test_cxx11_tensor_simple()
 {
+  CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
   CALL_SUBTEST(test_simple_assign());
   CALL_SUBTEST(test_resize());
 }
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
new file mode 100644
index 000000000..adac472cf
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -0,0 +1,38 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_comparison_sugar() {
+  // we already trust comparisons between tensors, we're simply checking that
+  // the sugared versions are doing the same thing
+  Tensor<float, 3> t(6, 7, 5);
+
+  t.setRandom();
+  // make sure we have at least one value == 0
+  t(0,0,0) = 0;
+
+  Tensor<bool, 0> b;
+
+#define TEST_TENSOR_EQUAL(e1, e2) \
+  b = ((e1) == (e2)).all();       \
+  VERIFY(b())
+
+#define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0))
+
+  TEST_OP(==);
+  TEST_OP(!=);
+  TEST_OP(<=);
+  TEST_OP(>=);
+  TEST_OP(<);
+  TEST_OP(>);
+#undef TEST_OP
+#undef TEST_TENSOR_EQUAL
+}
+
+void test_cxx11_tensor_sugar()
+{
+  CALL_SUBTEST(test_comparison_sugar());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
new file mode 100644
index 000000000..ee3767e58
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -0,0 +1,144 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::internal::TensorUInt128;
+using Eigen::internal::static_val;
+
+void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, __uint128_t expected) {
+  bool matchl = actual.lower() == static_cast<uint64_t>(expected);
+  bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64);
+  if (!matchl || !matchh) {
+    const char* testname = g_test_stack.back().c_str();
+    std::cerr << "Test " << testname << " failed in " << __FILE__
+              << " (" << __LINE__ << ")" << std::endl;
+    abort();
+  }
+}
+
+
+void test_add() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i + j;
+          __uint128_t expected = a + b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_sub() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i - j;
+          __uint128_t expected = a - b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_mul() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i * j;
+          __uint128_t expected = a * b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_div() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i1 = 0; i1 < 100; ++i1) {
+    for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+      TensorUInt128<uint64_t, uint64_t> i(i1, i2);
+      __uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
+      for (uint64_t j1 = 0; j1 < 100; ++j1) {
+        for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+          TensorUInt128<uint64_t, uint64_t> j(j1, j2);
+          __uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
+          TensorUInt128<uint64_t, uint64_t> actual = i / j;
+          __uint128_t expected = a / b;
+          VERIFY_EQUAL(actual, expected);
+        }
+      }
+    }
+  }
+}
+
+void test_misc1() {
+  uint64_t incr = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
+    TensorUInt128<static_val<0>, uint64_t> i(0, i2);
+    __uint128_t a = static_cast<__uint128_t>(i2);
+    for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
+      TensorUInt128<static_val<0>, uint64_t> j(0, j2);
+      __uint128_t b = static_cast<__uint128_t>(j2);
+      uint64_t actual = (i * j).upper();
+      uint64_t expected = (a * b) >> 64;
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+
+void test_misc2() {
+  int64_t incr = internal::random<int64_t>(1, 100);
+  for (int64_t log_div = 0; log_div < 63; ++log_div) {
+    for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) {
+      uint64_t expected = (static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1;
+      uint64_t shift = 1ULL << log_div;
+
+      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      uint64_t actual = static_cast<uint64_t>(result);
+      VERIFY_EQUAL(actual, expected);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_uint128()
+{
+  CALL_SUBTEST_1(test_add());
+  CALL_SUBTEST_2(test_sub());
+  CALL_SUBTEST_3(test_mul());
+  CALL_SUBTEST_4(test_div());
+  CALL_SUBTEST_5(test_misc1());
+  CALL_SUBTEST_6(test_misc2());
+}
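TensorUInt128 emulates a 128-bit unsigned integer as two 64-bit words for platforms without __uint128_t; the tests above simply compare every operation against the compiler builtin. The core trick is carry propagation by comparison, sketched here under the stated assumption (hypothetical names, not Eigen's code):

    #include <cstdint>

    // Illustrative two-word addition: unsigned wrap-around in the low word
    // signals a carry into the high word.
    struct U128 {
      uint64_t hi, lo;
    };

    inline U128 add(U128 a, U128 b) {
      U128 r;
      r.lo = a.lo + b.lo;
      r.hi = a.hi + b.hi + (r.lo < a.lo ? 1 : 0);  // carry detected via wrap
      return r;
    }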
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index d4baafe62..866db8e86 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -13,8 +13,6 @@
 #define NUMBER_DIRECTIONS 16
 #include <adolc/adtl.h>
 
-int adtl::ADOLC_numDir;
-
 template<typename Vector>
 EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
 {
@@ -123,7 +121,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
 
 void test_forward_adolc()
 {
-  adtl::ADOLC_numDir = NUMBER_DIRECTIONS;
+  adtl::setNumDir(NUMBER_DIRECTIONS);
 
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,2>()) ));
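Background for the forward_adolc change: ADOL-C's traceless forward mode used to require the user to define the global adtl::ADOLC_numDir themselves; newer releases made the direction count internal and expose adtl::setNumDir() instead, which must be called before the first adouble is created. The updated test follows that API, so no illustrative code is needed beyond the diff itself.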
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
index 104cb686f..9b0cf7268 100644
--- a/unsupported/test/mpreal/mpreal.h
+++ b/unsupported/test/mpreal/mpreal.h
@@ -1,33 +1,34 @@
 /*
-    MPFR C++: Multi-precision floating point number class for C++. 
+    MPFR C++: Multi-precision floating point number class for C++.
     Based on MPFR library:    http://mpfr.org
 
     Project homepage:    http://www.holoborodko.com/pavel/mpfr
     Contact e-mail:      pavel@holoborodko.com
 
-    Copyright (c) 2008-2014 Pavel Holoborodko
+    Copyright (c) 2008-2015 Pavel Holoborodko
 
     Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman, 
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen, 
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng, 
+    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
+    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
+    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
     Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney.
+    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
+    Rodney James, Jorge Leitao.
 
     Licensing:
     (A) MPFR C++ is under GNU General Public License ("GPL").
-    
-    (B) Non-free licenses may also be purchased from the author, for users who 
+
+    (B) Non-free licenses may also be purchased from the author, for users who
         do not want their programs protected by the GPL.
 
-        The non-free licenses are for users that wish to use MPFR C++ in 
-        their products but are unwilling to release their software 
-        under the GPL (which would require them to release source code 
+        The non-free licenses are for users that wish to use MPFR C++ in
+        their products but are unwilling to release their software
+        under the GPL (which would require them to release source code
         and allow free redistribution).
 
         Such users can purchase an unlimited-use license from the author.
         Contact us for more details.
-    
+
     GNU General Public License ("GPL") copyright permissions statement:
     **************************************************************************
     This program is free software: you can redistribute it and/or modify
@@ -55,10 +56,10 @@
 #include <cfloat>
 #include <cmath>
 #include <limits>
+#include <complex>
+#include <algorithm>
 
 // Options
-// FIXME HAVE_INT64_SUPPORT leads to clashes with long int and int64_t on some systems.
-//#define MPREAL_HAVE_INT64_SUPPORT               // Enable int64_t support if possible. Available only for MSVC 2010 & GCC.
 #define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
 #define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits specialization.
                                                 // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
@@ -66,17 +67,15 @@
 
 // Library version
 #define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 5
-#define MPREAL_VERSION_PATCHLEVEL 9
-#define MPREAL_VERSION_STRING "3.5.9"
+#define MPREAL_VERSION_MINOR 6
+#define MPREAL_VERSION_PATCHLEVEL 2
+#define MPREAL_VERSION_STRING "3.6.2"
 
 // Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
-    #define IsInf(x) (isinf)(x)                 // Intel ICC compiler on Linux
-
-#elif defined(_MSC_VER)                         // Microsoft Visual C++
-    #define IsInf(x) (!_finite(x))
-
+#if defined(__GNUC__)
+    #define IsInf(x) (isinf)(x)                 // GNU C++/Intel ICC compiler on Linux
+#elif defined(_MSC_VER)                         // Microsoft Visual C++
+    #define IsInf(x) (!_finite(x))
 #else
     #define IsInf(x) (std::isinf)(x)            // GNU C/C++ (and/or other compilers), just hope for C99 conformance
 #endif
@@ -93,54 +92,27 @@
     #define MPREAL_HAVE_MOVE_SUPPORT
 
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization 
+    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
     #define mpfr_is_initialized(x)      (0 != (x)->_mpfr_d)
     #define mpfr_set_uninitialized(x)   ((x)->_mpfr_d = 0 )
 #endif
 
-// Detect support for explicit converters. 
+// Detect support for explicit converters.
 #if (__has_feature(cxx_explicit_conversions) || \
-       defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-       (defined(_MSC_VER) && _MSC_VER >= 1800))
+       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
+       (defined(_MSC_VER) && _MSC_VER >= 1800))
     #define MPREAL_HAVE_EXPLICIT_CONVERTERS
 #endif
 
-// Detect available 64-bit capabilities
-#if defined(MPREAL_HAVE_INT64_SUPPORT)
-
-    #define MPFR_USE_INTMAX_T                   // Should be defined before mpfr.h
-
-    #if defined(_MSC_VER)                       // MSVC + Windows
-        #if (_MSC_VER >= 1600)
-            #include <stdint.h>                 // <stdint.h> is available only in msvc2010!
-
-        #else                                   // MPFR relies on intmax_t which is available only in msvc2010
-            #undef MPREAL_HAVE_INT64_SUPPORT    // Besides, MPFR & MPIR have to be compiled with msvc2010
-            #undef MPFR_USE_INTMAX_T            // Since we cannot detect this, disable x64 by default
-                                                // Someone should change this manually if needed.
-        #endif
-
-    #elif defined (__GNUC__) && defined(__linux__)
-        #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) || defined (__PPC64__)
-            #undef MPREAL_HAVE_INT64_SUPPORT    // Remove all shaman dances for x64 builds since
-            #undef MPFR_USE_INTMAX_T            // GCC already supports x64 as of "long int" is 64-bit integer, nothing left to do
-        #else
-            #include <stdint.h>                 // use int64_t, uint64_t otherwise
-        #endif
-
-    #else
-        #include <stdint.h>                     // rely on int64_t, uint64_t in all other cases, Mac OSX, etc.
-    #endif
-
-#endif
+#define MPFR_USE_INTMAX_T                       // Enable 64-bit integer types - should be defined before mpfr.h
 
 #if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
     #define MPREAL_MSVC_DEBUGVIEW_CODE     DebugView = toString();
     #define MPREAL_MSVC_DEBUGVIEW_DATA     std::string DebugView;
 #else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE 
-    #define MPREAL_MSVC_DEBUGVIEW_DATA 
+    #define MPREAL_MSVC_DEBUGVIEW_CODE
+    #define MPREAL_MSVC_DEBUGVIEW_DATA
 #endif
 
 #include <mpfr.h>
@@ -150,9 +122,15 @@
 #endif
 
 // Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1  // Triggers overflow exception during conversion to double if mpreal 
+#define MPREAL_DOUBLE_BITS_OVERFLOW -1  // Triggers overflow exception during conversion to double if mpreal
                                         // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
                                         // = -1 disables overflow checks (default)
+
+// Fast replacement for mpfr_set_zero(x, +1):
+// (a) uses low-level data members, might not be compatible with new versions of MPFR
+// (b) sign is not set, add (x)->_mpfr_sign = 1;
+#define mpfr_set_zero_fast(x)  ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
+
 #if defined(__GNUC__)
   #define MPREAL_PERMISSIVE_EXPR __extension__
 #else
@@ -164,9 +142,9 @@ namespace mpfr {
 class mpreal {
 private:
     mpfr_t mp;
-    
+
 public:
-    
+
     // Get default rounding mode & precision
     inline static mp_rnd_t   get_default_rnd()    {    return (mp_rnd_t)(mpfr_get_default_rounding_mode());    }
     inline static mp_prec_t  get_default_prec()   {    return mpfr_get_default_prec();    }
@@ -174,29 +152,26 @@ public:
    // Constructors && type conversions
     mpreal();
     mpreal(const mpreal& u);
-    mpreal(const mpf_t u);    
-    mpreal(const mpz_t u,             mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());    
-    mpreal(const mpq_t u,             mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());    
-    mpreal(const double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u,       mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.    
-    mpreal(const mpfr_t  u, bool shared = false);   
+    mpreal(const mpf_t u);
+    mpreal(const mpz_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const mpq_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const double u,                 mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned int u,           mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const int u,                    mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    mpreal(const uint64_t u,          mp_prec_t prec = mpreal::get_default_prec(),  mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int64_t u,           mp_prec_t prec = mpreal::get_default_prec(),  mp_rnd_t mode = mpreal::get_default_rnd());
-#endif
+    // Construct mpreal from mpfr_t structure.
+    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
+    mpreal(const mpfr_t  u, bool shared = false);
 
     mpreal(const char* s,             mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
     mpreal(const std::string& s,      mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
 
-    ~mpreal();                           
+    ~mpreal();
 
 #ifdef MPREAL_HAVE_MOVE_SUPPORT
     mpreal& operator=(mpreal&& v);
@@ -205,7 +180,7 @@ public:
     // Operations
     // =
-    // +, -, *, /, ++, --, <<, >> 
+    // +, -, *, /, ++, --, <<, >>
     // *=, +=, -=, /=,
     // <, >, ==, <=, >=
 
@@ -215,13 +190,16 @@ public:
     mpreal& operator=(const mpz_t v);
     mpreal& operator=(const mpq_t v);
     mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);        
+    mpreal& operator=(const double v);
     mpreal& operator=(const unsigned long int v);
+    mpreal& operator=(const unsigned long long int v);
+    mpreal& operator=(const long long int v);
     mpreal& operator=(const unsigned int v);
     mpreal& operator=(const long int v);
     mpreal& operator=(const int v);
     mpreal& operator=(const char* s);
     mpreal& operator=(const std::string& s);
+    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
 
     // +
     mpreal& operator+=(const mpreal& v);
@@ -235,20 +213,18 @@ public:
     mpreal& operator+=(const long int u);
     mpreal& operator+=(const int u);
 
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    mpreal& operator+=(const int64_t  u);
-    mpreal& operator+=(const uint64_t u);
-    mpreal& operator-=(const int64_t  u);
-    mpreal& operator-=(const uint64_t u);
-    mpreal& operator*=(const int64_t  u);
-    mpreal& operator*=(const uint64_t u);
-    mpreal& operator/=(const int64_t  u);
-    mpreal& operator/=(const uint64_t u);
-#endif
+    mpreal& operator+=(const long long int  u);
+    mpreal& operator+=(const unsigned long long int u);
+    mpreal& operator-=(const long long int  u);
+    mpreal& operator-=(const unsigned long long int u);
+    mpreal& operator*=(const long long int  u);
+    mpreal& operator*=(const unsigned long long int u);
+    mpreal& operator/=(const long long int  u);
+    mpreal& operator/=(const unsigned long long int u);
 
     const mpreal operator+() const;
     mpreal& operator++ ();
-    const mpreal  operator++ (int); 
+    const mpreal  operator++ (int);
 
     // -
     mpreal& operator-=(const mpreal& v);
@@ -266,7 +242,7 @@ public:
     friend const mpreal operator-(const long int b, const mpreal& a);
     friend const mpreal operator-(const int b, const mpreal& a);
     friend const mpreal operator-(const double b, const mpreal& a);
-    mpreal& operator-- ();    
+    mpreal& operator-- ();
     const mpreal  operator-- (int);
 
     // *
@@ -279,7 +255,7 @@ public:
     mpreal& operator*=(const unsigned int v);
     mpreal& operator*=(const long int v);
     mpreal& operator*=(const int v);
-    
+
     // /
     mpreal& operator/=(const mpreal& v);
     mpreal& operator/=(const mpz_t v);
@@ -308,51 +284,27 @@ public:
     mpreal& operator>>=(const long int u);
     mpreal& operator>>=(const int u);
 
-    // Boolean Operators
-    friend bool operator >  (const mpreal& a, const mpreal& b);
-    friend bool operator >= (const mpreal& a, const mpreal& b);
-    friend bool operator <  (const mpreal& a, const mpreal& b);
-    friend bool operator <= (const mpreal& a, const mpreal& b);
-    friend bool operator == (const mpreal& a, const mpreal& b);
-    friend bool operator != (const mpreal& a, const mpreal& b);
-
-    // Optimized specializations for boolean operators
-    friend bool operator == (const mpreal& a, const unsigned long int b);
-    friend bool operator == (const mpreal& a, const unsigned int b);
-    friend bool operator == (const mpreal& a, const long int b);
-    friend bool operator == (const mpreal& a, const int b);
-    friend bool operator == (const mpreal& a, const long double b);
-    friend bool operator == (const mpreal& a, const double b);
-
     // Type Conversion operators
-    bool            toBool      (mp_rnd_t mode = GMP_RNDZ)    const;
-    long            toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long   toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    float           toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
-    double          toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
-    long double     toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
+    bool               toBool      (                        )    const;
+    long               toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long      toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    long long          toLLong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long long toULLong    (mp_rnd_t mode = GMP_RNDZ)    const;
+    float              toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
+    double             toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
+    long double        toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
 
 #if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool();            }
-    explicit operator int                () const { return int(toLong());       }
-    explicit operator long               () const { return toLong();            }
-    explicit operator long long          () const { return toLong();            }
-    explicit operator unsigned           () const { return unsigned(toULong()); }
-    explicit operator unsigned long      () const { return toULong();           }
-    explicit operator unsigned long long () const { return toULong();           }
-    explicit operator float              () const { return toFloat();           }
-    explicit operator double             () const { return toDouble();          }
-    explicit operator long double        () const { return toLDouble();         }
-#endif
-
-#if defined (MPREAL_HAVE_INT64_SUPPORT)
-    int64_t         toInt64     (mp_rnd_t mode = GMP_RNDZ)    const;
-    uint64_t        toUInt64    (mp_rnd_t mode = GMP_RNDZ)    const;
-
-    #if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator int64_t  () const { return toInt64();  }
-    explicit operator uint64_t () const { return toUInt64(); }
-    #endif
+    explicit operator bool               () const { return toBool();            }
+    explicit operator int                () const { return int(toLong());       }
+    explicit operator long               () const { return toLong();            }
+    explicit operator long long          () const { return toLLong();           }
+    explicit operator unsigned           () const { return unsigned(toULong()); }
+    explicit operator unsigned long      () const { return toULong();           }
+    explicit operator unsigned long long () const { return toULLong();          }
+    explicit operator float              () const { return toFloat();           }
+    explicit operator double             () const { return toDouble();          }
+    explicit operator long double        () const { return toLDouble();         }
 #endif
 
     // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
@@ -391,11 +343,12 @@ public:
     friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
     friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
     friend int cmpabs(const mpreal& a,const mpreal& b);
-    
+
     friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
@@ -436,21 +389,22 @@ public:
     friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
 
     friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
     friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode); 
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode); 
+    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
     friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
     friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum (const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode);
+    friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
     friend int sgn(const mpreal& v); // returns -1 or +1
 
 // MPFR 2.4.0 Specifics
@@ -465,28 +419,26 @@ public:
     friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);      // Modulus after division
 #endif
 
-// MPFR 3.0.0 Specifics
 #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
     friend const mpreal digamma (const mpreal& v,        mp_rnd_t rnd_mode);
     friend const mpreal ai      (const mpreal& v,        mp_rnd_t rnd_mode);
     friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
+#endif
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
     friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
     friend const mpreal grandom (unsigned int seed);
 #endif
-    
+
     // Uniformly distributed random number generation in [0,1] using
     // Mersenne-Twister algorithm by default.
     // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
     // Check urandom() for more precise control.
     friend const mpreal random(unsigned int seed);
 
-    // Exponent and mantissa manipulation
-    friend const mpreal frexp(const mpreal& v, mp_exp_t* exp);    
-    friend const mpreal ldexp(const mpreal& v, mp_exp_t exp);
-
     // Splits mpreal value into fractional and integer parts.
     // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);    
+    friend const mpreal modf(const mpreal& v, mpreal& n);
 
     // Constants
     // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
@@ -515,14 +467,14 @@ public:
     friend const mpreal frac (const mpreal& v, mp_rnd_t rnd_mode);
     friend const mpreal remainder (         const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
     friend const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    
+
     // Miscellaneous Functions
     friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
     friend const mpreal nextabove  (const mpreal& x);
     friend const mpreal nextbelow  (const mpreal& x);
 
     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state); 
+    friend const mpreal urandomb (gmp_randstate_t& state);
 
 // MPFR < 2.4.2 Specifics
 #if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
@@ -549,9 +501,9 @@ public:
     // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
     inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
     inline int getPrecision() const;
-    
+
     // Set mpreal to +/- inf, NaN, +/-0
-    mpreal&        setInf  (int Sign = +1);    
+    mpreal&        setInf  (int Sign = +1);
     mpreal&        setNan  ();
     mpreal&        setZero (int Sign = +1);
     mpreal&        setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
@@ -560,7 +512,7 @@ public:
     mp_exp_t get_exp();
     int set_exp(mp_exp_t e);
     int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t,mp_rnd_t rnd_mode = get_default_rnd());
+    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
 
     // Inexact conversion from float
     inline bool fits_in_bits(double x, int n);
@@ -580,7 +532,7 @@ public:
 
     // Efficient swapping of two mpreal values - needed for std algorithms
     friend void swap(mpreal& x, mpreal& y);
-    
+
     friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
     friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
 
@@ -590,7 +542,7 @@ private:
     //
     // mpfr::mpreal=<DebugView>                                ; Show value only
     // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits      ; Show value & precision
-    // 
+    //
     // at the beginning of
     // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
     MPREAL_MSVC_DEBUGVIEW_DATA
@@ -609,15 +561,15 @@ public:
 //////////////////////////////////////////////////////////////////////////
 // Constructors & converters
 // Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal() -{ - mpfr_init2 (mpfr_ptr(), mpreal::get_default_prec()); - mpfr_set_ui(mpfr_ptr(), 0, mpreal::get_default_rnd()); +inline mpreal::mpreal() +{ + mpfr_init2(mpfr_ptr(), mpreal::get_default_prec()); + mpfr_set_zero_fast(mpfr_ptr()); MPREAL_MSVC_DEBUGVIEW_CODE; } -inline mpreal::mpreal(const mpreal& u) +inline mpreal::mpreal(const mpreal& u) { mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr())); mpfr_set (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd()); @@ -628,7 +580,7 @@ inline mpreal::mpreal(const mpreal& u) #ifdef MPREAL_HAVE_MOVE_SUPPORT inline mpreal::mpreal(mpreal&& other) { - mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds no pinter to actual data + mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds no pointer to actual data mpfr_swap(mpfr_ptr(), other.mpfr_ptr()); MPREAL_MSVC_DEBUGVIEW_CODE; @@ -700,15 +652,31 @@ inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode) } inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode) -{ +{ mpfr_init2 (mpfr_ptr(), prec); mpfr_set_ld(mpfr_ptr(), u, mode); MPREAL_MSVC_DEBUGVIEW_CODE; } +inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode) +{ + mpfr_init2 (mpfr_ptr(), prec); + mpfr_set_uj(mpfr_ptr(), u, mode); + + MPREAL_MSVC_DEBUGVIEW_CODE; +} + +inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode) +{ + mpfr_init2 (mpfr_ptr(), prec); + mpfr_set_sj(mpfr_ptr(), u, mode); + + MPREAL_MSVC_DEBUGVIEW_CODE; +} + inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode) -{ +{ mpfr_init2 (mpfr_ptr(), prec); mpfr_set_ui(mpfr_ptr(), u, mode); @@ -716,7 +684,7 @@ inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode) } inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode) -{ +{ mpfr_init2 (mpfr_ptr(), prec); mpfr_set_ui(mpfr_ptr(), u, mode); @@ -724,7 +692,7 @@ inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode) } inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode) -{ +{ mpfr_init2 (mpfr_ptr(), prec); mpfr_set_si(mpfr_ptr(), u, mode); @@ -732,35 +700,17 @@ inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode) } inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode) -{ +{ mpfr_init2 (mpfr_ptr(), prec); mpfr_set_si(mpfr_ptr(), u, mode); MPREAL_MSVC_DEBUGVIEW_CODE; } -#if defined (MPREAL_HAVE_INT64_SUPPORT) -inline mpreal::mpreal(const uint64_t u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_uj(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const int64_t u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_sj(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} -#endif - inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode) { mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_str(mpfr_ptr(), s, base, mode); + mpfr_set_str(mpfr_ptr(), s, base, mode); MPREAL_MSVC_DEBUGVIEW_CODE; } @@ -768,7 +718,7 @@ inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode) inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode) { mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode); + mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode); MPREAL_MSVC_DEBUGVIEW_CODE; } @@ -776,15 +726,15 @@ inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t m inline void mpreal::clear(::mpfr_ptr x) { #ifdef 
MPREAL_HAVE_MOVE_SUPPORT - if(mpfr_is_initialized(x)) + if(mpfr_is_initialized(x)) #endif mpfr_clear(x); } -inline mpreal::~mpreal() -{ +inline mpreal::~mpreal() +{ clear(mpfr_ptr()); -} +} // internal namespace needed for template magic namespace internal{ @@ -792,58 +742,55 @@ namespace internal{ // Use SFINAE to restrict arithmetic operations instantiation only for numeric types // This is needed for smooth integration with libraries based on expression templates, like Eigen. // TODO: Do the same for boolean operators. - template <typename ArgumentType> struct result_type {}; - - template <> struct result_type<mpreal> {typedef mpreal type;}; - template <> struct result_type<mpz_t> {typedef mpreal type;}; - template <> struct result_type<mpq_t> {typedef mpreal type;}; - template <> struct result_type<long double> {typedef mpreal type;}; - template <> struct result_type<double> {typedef mpreal type;}; - template <> struct result_type<unsigned long int> {typedef mpreal type;}; - template <> struct result_type<unsigned int> {typedef mpreal type;}; - template <> struct result_type<long int> {typedef mpreal type;}; - template <> struct result_type<int> {typedef mpreal type;}; + template <typename ArgumentType> struct result_type {}; -#if defined (MPREAL_HAVE_INT64_SUPPORT) - template <> struct result_type<uint64_t> {typedef mpreal type;}; - template <> struct result_type<int64_t> {typedef mpreal type;}; -#endif + template <> struct result_type<mpreal> {typedef mpreal type;}; + template <> struct result_type<mpz_t> {typedef mpreal type;}; + template <> struct result_type<mpq_t> {typedef mpreal type;}; + template <> struct result_type<long double> {typedef mpreal type;}; + template <> struct result_type<double> {typedef mpreal type;}; + template <> struct result_type<unsigned long int> {typedef mpreal type;}; + template <> struct result_type<unsigned int> {typedef mpreal type;}; + template <> struct result_type<long int> {typedef mpreal type;}; + template <> struct result_type<int> {typedef mpreal type;}; + template <> struct result_type<long long> {typedef mpreal type;}; + template <> struct result_type<unsigned long long> {typedef mpreal type;}; } // + Addition -template <typename Rhs> -inline const typename internal::result_type<Rhs>::type +template <typename Rhs> +inline const typename internal::result_type<Rhs>::type operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; } -template <typename Lhs> -inline const typename internal::result_type<Lhs>::type - operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; } +template <typename Lhs> +inline const typename internal::result_type<Lhs>::type + operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; } // - Subtraction -template <typename Rhs> -inline const typename internal::result_type<Rhs>::type +template <typename Rhs> +inline const typename internal::result_type<Rhs>::type operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; } -template <typename Lhs> -inline const typename internal::result_type<Lhs>::type +template <typename Lhs> +inline const typename internal::result_type<Lhs>::type operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; } // * Multiplication -template <typename Rhs> -inline const typename internal::result_type<Rhs>::type +template <typename Rhs> +inline const typename internal::result_type<Rhs>::type operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; } -template <typename Lhs> -inline const typename internal::result_type<Lhs>::type - operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; } +template <typename Lhs> +inline const typename internal::result_type<Lhs>::type + operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; } // / Division -template <typename Rhs> -inline const typename internal::result_type<Rhs>::type +template <typename Rhs> +inline const typename internal::result_type<Rhs>::type operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; } -template <typename Lhs> -inline const typename
internal::result_type<Lhs>::type +template <typename Lhs> +inline const typename internal::result_type<Lhs>::type operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; } ////////////////////////////////////////////////////////////////////////// @@ -893,17 +840,17 @@ const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::g const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); +const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); +const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); +const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); +const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); @@ -920,9 +867,9 @@ inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpr inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec()); // Returns smallest eps such that x + eps != x (relative machine epsilon) -inline mpreal machine_epsilon(const mpreal& x); +inline mpreal machine_epsilon(const mpreal& x); -// Gives max & min values for the required precision, +// Gives max & min values for the required precision, // minval is 'safe' meaning 1 / minval does not overflow // maxval is 'safe' meaning 1 / maxval does not underflow inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec()); @@ -935,13 +882,13 @@ inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps); inline bool isEqualFuzzy(const mpreal& a, const mpreal& b); // 'Bitwise' equality check -// maxUlps - a and b can be apart by maxUlps binary numbers. +// maxUlps - a and b can be apart by maxUlps binary numbers. inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps); ////////////////////////////////////////////////////////////////////////// -// Convert precision in 'bits' to decimal digits and vice versa.
-// bits = ceil(digits*log[2](10)) -// digits = floor(bits*log[10](2)) +// Convert precision in 'bits' to decimal digits and vice versa. +// bits = ceil(digits*log[2](10)) +// digits = floor(bits*log[10](2)) inline mp_prec_t digits2bits(int d); inline int bits2digits(mp_prec_t b); @@ -979,7 +926,7 @@ inline mpreal& mpreal::operator=(const mpreal& v) inline mpreal& mpreal::operator=(const mpf_t v) { mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd()); - + MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } @@ -987,7 +934,7 @@ inline mpreal& mpreal::operator=(const mpf_t v) inline mpreal& mpreal::operator=(const mpz_t v) { mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd()); - + MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } @@ -1000,16 +947,16 @@ inline mpreal& mpreal::operator=(const mpq_t v) return *this; } -inline mpreal& mpreal::operator=(const long double v) -{ +inline mpreal& mpreal::operator=(const long double v) +{ mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator=(const double v) -{ +inline mpreal& mpreal::operator=(const double v) +{ #if (MPREAL_DOUBLE_BITS_OVERFLOW > -1) if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW)) { @@ -1024,33 +971,49 @@ inline mpreal& mpreal::operator=(const double v) return *this; } -inline mpreal& mpreal::operator=(const unsigned long int v) -{ - mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); +inline mpreal& mpreal::operator=(const unsigned long int v) +{ + mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator=(const unsigned int v) -{ - mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); +inline mpreal& mpreal::operator=(const unsigned int v) +{ + mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator=(const long int v) -{ - mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); +inline mpreal& mpreal::operator=(const unsigned long long int v) +{ + mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd()); + + MPREAL_MSVC_DEBUGVIEW_CODE; + return *this; +} + +inline mpreal& mpreal::operator=(const long long int v) +{ + mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd()); + + MPREAL_MSVC_DEBUGVIEW_CODE; + return *this; +} + +inline mpreal& mpreal::operator=(const long int v) +{ + mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } inline mpreal& mpreal::operator=(const int v) -{ - mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); +{ + mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; @@ -1071,7 +1034,7 @@ inline mpreal& mpreal::operator=(const char* s) if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd())) { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); + mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; } @@ -1094,7 +1057,7 @@ inline mpreal& mpreal::operator=(const std::string& s) if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd())) { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); + mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); MPREAL_MSVC_DEBUGVIEW_CODE; } @@ -1102,6 +1065,11 @@ inline mpreal& mpreal::operator=(const std::string& s) return *this; } +template <typename real_t> +inline mpreal& mpreal::operator= (const std::complex<real_t>& z) +{ + return *this = z.real(); +} ////////////////////////////////////////////////////////////////////////// // + Addition @@ -1135,9 +1103,9
@@ inline mpreal& mpreal::operator+=(const mpq_t u) inline mpreal& mpreal::operator+= (const long double u) { - *this += mpreal(u); + *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; + return *this; } inline mpreal& mpreal::operator+= (const double u) @@ -1180,16 +1148,14 @@ inline mpreal& mpreal::operator+=(const int u) return *this; } -#if defined (MPREAL_HAVE_INT64_SUPPORT) -inline mpreal& mpreal::operator+=(const int64_t u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator+=(const uint64_t u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const int64_t u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const uint64_t u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const int64_t u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const uint64_t u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const int64_t u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const uint64_t u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -#endif +inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } +inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } inline const mpreal mpreal::operator+()const { return mpreal(*this); } @@ -1200,7 +1166,7 @@ inline const mpreal operator+(const mpreal& a, const mpreal& b) return c; } -inline mpreal& mpreal::operator++() +inline mpreal& mpreal::operator++() { return *this += 1; } @@ -1212,7 +1178,7 @@ inline const mpreal mpreal::operator++ (int) return x; } -inline mpreal& mpreal::operator--() +inline mpreal& mpreal::operator--() { return *this -= 1; } @@ -1249,9 +1215,9 @@ inline mpreal& mpreal::operator-=(const mpq_t v) inline mpreal& mpreal::operator-=(const long double v) { - *this -= mpreal(v); + *this -= mpreal(v); MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; + return *this; } inline mpreal& mpreal::operator-=(const double v) @@ -1259,7 +1225,7 @@ inline mpreal& mpreal::operator-=(const double v) #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); #else - *this -= mpreal(v); + *this -= mpreal(v); #endif MPREAL_MSVC_DEBUGVIEW_CODE; @@ -1374,9 +1340,9 @@ inline mpreal& mpreal::operator*=(const mpq_t v) inline mpreal& mpreal::operator*=(const long double v) { - *this *= mpreal(v); + *this *= mpreal(v); 
MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; + return *this; } inline mpreal& mpreal::operator*=(const double v) @@ -1384,7 +1350,7 @@ inline mpreal& mpreal::operator*=(const double v) #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); #else - *this *= mpreal(v); + *this *= mpreal(v); #endif MPREAL_MSVC_DEBUGVIEW_CODE; return *this; @@ -1452,7 +1418,7 @@ inline mpreal& mpreal::operator/=(const long double v) { *this /= mpreal(v); MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; + return *this; } inline mpreal& mpreal::operator/=(const double v) @@ -1460,7 +1426,7 @@ inline mpreal& mpreal::operator/=(const double v) #if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); #else - *this /= mpreal(v); + *this /= mpreal(v); #endif MPREAL_MSVC_DEBUGVIEW_CODE; return *this; @@ -1671,21 +1637,65 @@ inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode) } ////////////////////////////////////////////////////////////////////////// -//Boolean operators -inline bool operator > (const mpreal& a, const mpreal& b){ return (mpfr_greater_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } -inline bool operator >= (const mpreal& a, const mpreal& b){ return (mpfr_greaterequal_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } -inline bool operator < (const mpreal& a, const mpreal& b){ return (mpfr_less_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } -inline bool operator <= (const mpreal& a, const mpreal& b){ return (mpfr_lessequal_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } -inline bool operator == (const mpreal& a, const mpreal& b){ return (mpfr_equal_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } -inline bool operator != (const mpreal& a, const mpreal& b){ return (mpfr_lessgreater_p (a.mpfr_srcptr(),b.mpfr_srcptr()) !=0 ); } +//Relational operators -inline bool operator == (const mpreal& a, const unsigned long int b ){ return (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const unsigned int b ){ return (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const long int b ){ return (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const int b ){ return (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const long double b ){ return (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const double b ){ return (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); } + +// WARNING: +// +// Please note that the following checks for double-NaN are guaranteed to work only in IEEE math mode: +// +// isnan(b) = (b != b) +// isnan(b) = !(b == b) (we use in code below) +// +// Be cautious if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC). +// Use std::isnan instead (C++11).
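To make the guard above concrete, here is a minimal standalone sketch (illustrative only, not part of the patch; it assumes mpreal.h and MPFR are installed and on the include path):

    #include <mpreal.h>
    #include <cassert>

    int main()
    {
        mpfr::mpreal x;
        x.setNan();

        assert(!(x >  0.0));   // ordered comparisons against NaN are false
        assert(!(x >= 0.0));
        assert(!(x == 0.0));
        assert(  x != 0.0 );   // operator!= is defined as !(a == b), so true for NaN

        // Note: the (b == b) guard on the double operand is reliable only
        // under strict IEEE semantics (e.g. without -ffast-math).
        return 0;
    }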
+inline bool operator > (const mpreal& a, const mpreal& b ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } +inline bool operator > (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); } +inline bool operator > (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); } +inline bool operator > (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); } +inline bool operator > (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); } +inline bool operator > (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); } +inline bool operator > (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); } + +inline bool operator >= (const mpreal& a, const mpreal& b ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } +inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); } +// inline bool operator >= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); } +inline bool operator >= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); } +inline bool operator >= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); } +inline bool operator >= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); } +inline bool operator >= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); } + +inline bool operator < (const mpreal& a, const mpreal& b ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } +inline bool operator < (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); } +inline bool operator < (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); } +inline bool operator < (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); } +inline bool operator < (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); } +inline bool operator < (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); } +inline bool operator < (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); } + +inline bool operator <= (const mpreal& a, const mpreal& b ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } +inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); } +inline bool operator <= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); } +inline bool operator <= (const mpreal& a, const long
int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); } +inline bool operator <= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); } +inline bool operator <= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); } +inline bool operator <= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); } + +inline bool operator == (const mpreal& a, const mpreal& b ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } +inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } +inline bool operator == (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } +inline bool operator == (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } +inline bool operator == (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } +inline bool operator == (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); } +inline bool operator == (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); } + +inline bool operator != (const mpreal& a, const mpreal& b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const unsigned int b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const long int b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const int b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const long double b ){ return !(a == b); } +inline bool operator != (const mpreal& a, const double b ){ return !(a == b); } inline bool (isnan) (const mpreal& op){ return (mpfr_nan_p (op.mpfr_srcptr()) != 0 ); } inline bool (isinf) (const mpreal& op){ return (mpfr_inf_p (op.mpfr_srcptr()) != 0 ); } @@ -1695,21 +1705,18 @@ inline bool isint (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcpt #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));} -#endif +#endif ////////////////////////////////////////////////////////////////////////// // Type Converters -inline bool mpreal::toBool (mp_rnd_t /*mode*/) const { return mpfr_zero_p (mpfr_srcptr()) == 0; } -inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); } -inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); } -inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); } -inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); } -inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); } - -#if defined (MPREAL_HAVE_INT64_SUPPORT) -inline int64_t mpreal::toInt64 (mp_rnd_t mode) const{ return mpfr_get_sj(mpfr_srcptr(), mode); } -inline uint64_t mpreal::toUInt64(mp_rnd_t mode) const{ return mpfr_get_uj(mpfr_srcptr(), mode); } -#endif +inline bool mpreal::toBool ( 
) const { return mpfr_zero_p (mpfr_srcptr()) == 0; } +inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); } +inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); } +inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); } +inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); } +inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); } +inline long long mpreal::toLLong (mp_rnd_t mode) const { return mpfr_get_sj (mpfr_srcptr(), mode); } +inline unsigned long long mpreal::toULLong (mp_rnd_t mode) const { return mpfr_get_uj (mpfr_srcptr(), mode); } inline ::mpfr_ptr mpreal::mpfr_ptr() { return mp; } inline ::mpfr_srcptr mpreal::mpfr_ptr() const { return mp; } @@ -1755,21 +1762,21 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const std::ostringstream format; - int digits = (n >= 0) ? n : bits2digits(mpfr_get_prec(mpfr_srcptr())); - + int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr())); + format << "%." << digits << "RNg"; return toString(format.str()); #else - char *s, *ns = NULL; + char *s, *ns = NULL; size_t slen, nslen; mp_exp_t exp; std::string out; if(mpfr_inf_p(mp)) - { + { if(mpfr_sgn(mp)>0) return "+Inf"; else return "-Inf"; } @@ -1784,7 +1791,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const { slen = strlen(s); nslen = strlen(ns); - if(nslen<=slen) + if(nslen<=slen) { mpfr_free_str(s); s = ns; @@ -1801,7 +1808,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const { // Remove zeros starting from right end char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s+exp) ptr--; + while (*ptr=='0' && ptr>s+exp) ptr--; if(ptr==s+exp) out = std::string(s,exp+1); else out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1); @@ -1812,7 +1819,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const { // Remove zeros starting from right end char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s+exp-1) ptr--; + while (*ptr=='0' && ptr>s+exp-1) ptr--; if(ptr==s+exp-1) out = std::string(s,exp); else out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1); @@ -1825,7 +1832,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const { // Remove zeros starting from right end char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s+1) ptr--; + while (*ptr=='0' && ptr>s+1) ptr--; if(ptr==s+1) out = std::string(s,2); else out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1); @@ -1836,7 +1843,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const { // Remove zeros starting from right end char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s) ptr--; + while (*ptr=='0' && ptr>s) ptr--; if(ptr==s) out = std::string(s,1); else out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1); @@ -1863,7 +1870,7 @@ inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const ////////////////////////////////////////////////////////////////////////// // I/O -inline std::ostream& mpreal::output(std::ostream& os) const +inline std::ostream& mpreal::output(std::ostream& os) const { std::ostringstream format; const std::ios::fmtflags flags = os.flags(); @@ -1926,8 +1933,7 @@ inline int bits2digits(mp_prec_t b) // Set/Get number properties inline int sgn(const mpreal& op) { - int r = mpfr_signbit(op.mpfr_srcptr()); - return (r > 0? 
-1 : 1); + return mpfr_sgn(op.mpfr_srcptr()); } inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode) @@ -1949,29 +1955,28 @@ inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode) return *this; } -inline mpreal& mpreal::setInf(int sign) -{ +inline mpreal& mpreal::setInf(int sign) +{ mpfr_set_inf(mpfr_ptr(), sign); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; -} +} -inline mpreal& mpreal::setNan() +inline mpreal& mpreal::setNan() { mpfr_set_nan(mpfr_ptr()); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::setZero(int sign) +inline mpreal& mpreal::setZero(int sign) { - #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) mpfr_set_zero(mpfr_ptr(), sign); #else mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)()); setSign(sign); -#endif +#endif MPREAL_MSVC_DEBUGVIEW_CODE; return *this; @@ -2000,23 +2005,32 @@ inline int mpreal::set_exp (mp_exp_t e) return x; } -inline const mpreal frexp(const mpreal& v, mp_exp_t* exp) +inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd()) { - mpreal x(v); - *exp = x.get_exp(); - x.set_exp(0); - return x; + mpreal y(x); +#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0)) + mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode); +#else + *exp = mpfr_get_exp(y.mpfr_srcptr()); + mpfr_set_exp(y.mpfr_ptr(),0); +#endif + return y; } inline const mpreal ldexp(const mpreal& v, mp_exp_t exp) { mpreal x(v); - // rounding is not important since we just increasing the exponent - mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd()); + // rounding is not important since we are just increasing the exponent (= exact operation) + mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd()); return x; } +inline const mpreal scalbn(const mpreal& v, mp_exp_t exp) +{ + return ldexp(v, exp); +} + inline mpreal machine_epsilon(mp_prec_t prec) { /* the smallest eps such that 1 + eps != 1 */ @@ -2024,7 +2038,7 @@ inline mpreal machine_epsilon(mp_prec_t prec) } inline mpreal machine_epsilon(const mpreal& x) -{ +{ /* the smallest eps such that x + eps != x */ if( x < 0) { @@ -2045,7 +2059,7 @@ inline mpreal minval(mp_prec_t prec) inline mpreal maxval(mp_prec_t prec) { /* max = (1 - eps) * 2^emax, eps is machine epsilon */ - return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax(); + return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax(); } inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps) @@ -2063,12 +2077,26 @@ inline bool isEqualFuzzy(const mpreal& a, const mpreal& b) return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b))))); } +////////////////////////////////////////////////////////////////////////// +// C++11 sign functions. 
+inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) +{ + mpreal rop(0, mpfr_get_prec(x.mpfr_ptr())); + mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode); + return rop; +} + +inline bool signbit(const mpreal& x) +{ + return mpfr_signbit(x.mpfr_srcptr()); +} + inline const mpreal modf(const mpreal& v, mpreal& n) { mpreal f(v); // rounding is not important since we are using the same number - mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd()); + mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd()); mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr()); return f; } @@ -2131,7 +2159,7 @@ inline mp_exp_t mpreal::get_emax_max (void) #define MPREAL_UNARY_MATH_FUNCTION_BODY(f) \ mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); \ mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r); \ - return y; + return y; inline const mpreal sqr (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sqr ); } @@ -2154,7 +2182,7 @@ inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode) inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode) { if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode); - else return mpreal().setNan(); // NaN + else return mpreal().setNan(); // NaN } inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode) @@ -2165,9 +2193,9 @@ inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode) inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd()) { - mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); - mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r); - return y; + mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); + mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r); + return y; } inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd()) @@ -2209,6 +2237,8 @@ inline const mpreal acos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd inline const mpreal asin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asin ); } inline const mpreal atan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atan ); } +inline const mpreal logb (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { return log2 (abs(x),r); } + inline const mpreal acot (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atan (1/v, r); } inline const mpreal asec (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acos (1/v, r); } inline const mpreal acsc (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asin (1/v, r); } @@ -2230,6 +2260,7 @@ inline const mpreal log1p (const mpreal& x, mp_rnd_t r = mpreal::get_default_r inline const mpreal expm1 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(expm1 ); } inline const mpreal eint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(eint ); } inline const mpreal gamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); } +inline const mpreal tgamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); } inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma); } inline const mpreal zeta (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(zeta ); } inline const
mpreal erf (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erf ); } @@ -2254,7 +2285,7 @@ inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = } inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ +{ mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision())); mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode); return a; @@ -2307,9 +2338,9 @@ inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, m mpreal a; mp_prec_t p1, p2, p3; - p1 = v1.get_prec(); - p2 = v2.get_prec(); - p3 = v3.get_prec(); + p1 = v1.get_prec(); + p2 = v2.get_prec(); + p3 = v3.get_prec(); a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1)); @@ -2322,9 +2353,9 @@ inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, m mpreal a; mp_prec_t p1, p2, p3; - p1 = v1.get_prec(); - p2 = v2.get_prec(); - p3 = v3.get_prec(); + p1 = v1.get_prec(); + p2 = v2.get_prec(); + p3 = v3.get_prec(); a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1)); @@ -2337,8 +2368,8 @@ inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal a; mp_prec_t p1, p2; - p1 = v1.get_prec(); - p2 = v2.get_prec(); + p1 = v1.get_prec(); + p2 = v2.get_prec(); a.set_prec(p1>p2?p1:p2); @@ -2347,16 +2378,17 @@ inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = return a; } -inline const mpreal sum (const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) +inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd()) { - mpreal x; - mpfr_ptr* t; - unsigned long int i; + mpfr_srcptr *p = new mpfr_srcptr[n]; - t = new mpfr_ptr[n]; - for (i=0;i<n;i++) t[i] = (mpfr_ptr)tab[i].mp; - mpfr_sum(x.mp,t,n,rnd_mode); - delete[] t; - return x; + for (unsigned long int i = 0; i < n; i++) + p[i] = tab[i].mpfr_srcptr(); + + mpreal x; + status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode); + + delete [] p; + return x; } a.set_prec(yp>xp?yp:xp); @@ -2553,33 +2585,24 @@ inline const mpreal nextbelow (const mpreal& x) inline const mpreal urandomb (gmp_randstate_t& state) { mpreal x; - mpfr_urandomb(x.mp,state); + mpfr_urandomb(x.mpfr_ptr(),state); return x; } -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0)) -// use gmp_randinit_default() to init state, gmp_randclear() to clear +#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) { mpreal x; - mpfr_urandom(x.mp,state,rnd_mode); + mpfr_urandom(x.mpfr_ptr(), state, rnd_mode); return x; } - -inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x; - mpfr_grandom(x.mp, NULL, state, rnd_mode); - return x; -} - -#endif +#endif #if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2)) inline const mpreal random2 (mp_size_t size, mp_exp_t exp) { mpreal x; - mpfr_random2(x.mp,size,exp); + mpfr_random2(x.mpfr_ptr(),size,exp); return x; } #endif @@ -2590,16 +2613,15 @@ inline const mpreal random2 (mp_size_t size, mp_exp_t exp) // seed != 0 inline const mpreal random(unsigned int seed = 0) { - #if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) static gmp_randstate_t state; - static bool isFirstTime = true; + static bool initialize = true; - if(isFirstTime) + if(initialize) { gmp_randinit_default(state); gmp_randseed_ui(state,0); - isFirstTime = false; + initialize = false; } if(seed != 0) gmp_randseed_ui(state,seed); @@ -2612,17 +2634,25 @@ } -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) +#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0)) + +inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t
rnd_mode = mpreal::get_default_rnd()) +{ + mpreal x; + mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode); + return x; +} + inline const mpreal grandom(unsigned int seed = 0) { static gmp_randstate_t state; - static bool isFirstTime = true; + static bool initialize = true; - if(isFirstTime) + if(initialize) { gmp_randinit_default(state); gmp_randseed_ui(state,0); - isFirstTime = false; + initialize = false; } if(seed != 0) gmp_randseed_ui(state,seed); @@ -2634,17 +2664,17 @@ inline const mpreal grandom(unsigned int seed = 0) ////////////////////////////////////////////////////////////////////////// // Set/Get global properties inline void mpreal::set_default_prec(mp_prec_t prec) -{ - mpfr_set_default_prec(prec); +{ + mpfr_set_default_prec(prec); } inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode) -{ - mpfr_set_default_rounding_mode(rnd_mode); +{ + mpfr_set_default_rounding_mode(rnd_mode); } inline bool mpreal::fits_in_bits(double x, int n) -{ +{ int i; double t; return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0); @@ -2894,7 +2924,7 @@ inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode) else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow } -// pow long double +// pow long double inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode) { return pow(mpreal(a),mpreal(b),rnd_mode); @@ -2953,9 +2983,9 @@ namespace std { // we are allowed to extend namespace std with specializations only template <> - inline void swap(mpfr::mpreal& x, mpfr::mpreal& y) - { - return mpfr::swap(x, y); + inline void swap(mpfr::mpreal& x, mpfr::mpreal& y) + { + return mpfr::swap(x, y); } template<> @@ -2966,7 +2996,7 @@ namespace std static const bool is_signed = true; static const bool is_integer = false; static const bool is_exact = false; - static const int radix = 2; + static const int radix = 2; static const bool has_infinity = true; static const bool has_quiet_NaN = true; @@ -2986,7 +3016,7 @@ namespace std // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon) inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(precision); } - + // Returns smallest eps such that x + eps != x (relative machine epsilon) inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) { return mpfr::machine_epsilon(x); } @@ -2994,8 +3024,8 @@ namespace std { mp_rnd_t r = mpfr::mpreal::get_default_rnd(); - if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision); - else return mpfr::mpreal(1.0, precision); + if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision); + else return mpfr::mpreal(1.0, precision); } inline static const mpfr::mpreal infinity() { return mpfr::const_infinity(); } @@ -3006,17 +3036,17 @@ namespace std // Please note, exponent range is not fixed in MPFR static const int min_exponent = MPFR_EMIN_DEFAULT; static const int max_exponent = MPFR_EMAX_DEFAULT; - MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811); - MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811); + MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811); + MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811); #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Following members should be constant according to standard, but they can be variable in MPFR - // So we define them as 
functions here. + // So we define them as functions here. // // This is preferable way for std::numeric_limits specialization. - // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost. - // See below for compatible implementation. + // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost. + // See below for compatible implementation. inline static float_round_style round_style() { mp_rnd_t r = mpfr::mpreal::get_default_rnd(); @@ -3024,9 +3054,9 @@ namespace std switch (r) { case GMP_RNDN: return round_to_nearest; - case GMP_RNDZ: return round_toward_zero; - case GMP_RNDU: return round_toward_infinity; - case GMP_RNDD: return round_toward_neg_infinity; + case GMP_RNDZ: return round_toward_zero; + case GMP_RNDU: return round_toward_infinity; + case GMP_RNDD: return round_toward_neg_infinity; default: return round_indeterminate; } } @@ -3053,13 +3083,13 @@ namespace std // If possible, please use functions digits() and round_style() defined above. // // These (default) values are preserved for compatibility with existing libraries, e.g. boost. - // Change them accordingly to your application. + // Change them accordingly to your application. // // For example, if you use 256 bits of precision uniformly in your program, then: // digits = 256 - // digits10 = 77 + // digits10 = 77 // max_digits10 = 78 - // + // // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details. static const std::float_round_style round_style = round_to_nearest;
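As a rough usage sketch of the precision helpers and the std::numeric_limits<mpfr::mpreal> specialization above (illustrative only, not part of the patch; assumes mpreal.h is available):

    #include <mpreal.h>
    #include <iostream>
    #include <limits>

    int main()
    {
        using mpfr::mpreal;

        // Work uniformly at 256 bits, as in the digits/digits10 example above.
        mpreal::set_default_prec(256);

        // digits10 = floor(256 * log10(2)) = 77 decimal digits.
        std::cout << mpfr::bits2digits(256) << '\n';

        // Smallest eps such that 1 + eps != 1 at the current default precision.
        const mpreal eps = std::numeric_limits<mpreal>::epsilon();
        std::cout << eps << '\n';

        return 0;
    }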