From e6b10202218631be755f19c41fe01287b9a37f90 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Tue, 24 Jan 2017 13:55:18 -0800
Subject: [PATCH 1/5] Adds a fast memcpy function to Eigen.

This takes advantage of the following:

1. For small fixed sizes, the compiler generates inline code for memcpy,
   which is much faster.
2. My colleague eriche at googl dot com discovered that for large sizes,
   memmove is significantly faster than memcpy (at least on Linux with GCC
   or Clang). See benchmark numbers measured on a Haswell (HP Z440)
   workstation here:
   https://docs.google.com/a/google.com/spreadsheets/d/1jLs5bKzXwhpTySw65MhG1pZpsIwkszZqQTjwrd_n0ic/pubhtml
   This is of course surprising, since memcpy is a less constrained version
   of memmove. This stackoverflow thread contains some speculation as to the
   causes:
   http://stackoverflow.com/questions/22793669/poor-memcpy-performance-on-linux

Below are numbers for copying and slicing tensors using the multithreaded
TensorDevice. The numbers show significant improvements for memcpy of very
small blocks and for memcpy of large blocks single threaded (we were already
able to saturate memory bandwidth for >1 threads before on large blocks). The
"slicingSmallPieces" benchmark also shows small, consistent improvements,
since memcpy cost is a fair portion of that particular computation.

The benchmarks operate on NxN matrices, and the names are of the form
BM_$OP_${NUMTHREADS}T/${N}.

Measured improvements in wall clock time:

Run on rmlarsen3.mtv (12 X 3501 MHz CPUs); 2017-01-20T11:26:31.493023454-08:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB

Benchmark                        Base (ns)   New (ns)   Improvement
------------------------------------------------------------------
BM_memcpy_1T/2                        3.48       2.39        +31.3%
BM_memcpy_1T/8                        12.3       6.51        +47.0%
BM_memcpy_1T/64                        371        383         -3.2%
BM_memcpy_1T/512                     66922      66720         +0.3%
BM_memcpy_1T/4k                    9892867    6849682        +30.8%
BM_memcpy_1T/5k                   14951099   10332856        +30.9%
BM_memcpy_2T/2                        3.50       2.46        +29.7%
BM_memcpy_2T/8                        12.3       7.66        +37.7%
BM_memcpy_2T/64                        371        376         -1.3%
BM_memcpy_2T/512                     66652      66788         -0.2%
BM_memcpy_2T/4k                    6145012    6117776         +0.4%
BM_memcpy_2T/5k                    9181478    9010942         +1.9%
BM_memcpy_4T/2                        3.47       2.47        +31.0%
BM_memcpy_4T/8                        12.3       6.67        +45.8%
BM_memcpy_4T/64                        374        376         -0.5%
BM_memcpy_4T/512                     67833      68019         -0.3%
BM_memcpy_4T/4k                    5057425    5188253         -2.6%
BM_memcpy_4T/5k                    7555638    7779468         -3.0%
BM_memcpy_6T/2                        3.51       2.50        +28.8%
BM_memcpy_6T/8                        12.3       7.61        +38.1%
BM_memcpy_6T/64                        373        378         -1.3%
BM_memcpy_6T/512                     66871      66774         +0.1%
BM_memcpy_6T/4k                    5112975    5233502         -2.4%
BM_memcpy_6T/5k                    7614180    7772246         -2.1%
BM_memcpy_8T/2                        3.47       2.41        +30.5%
BM_memcpy_8T/8                        12.4       10.5        +15.3%
BM_memcpy_8T/64                        372        388         -4.3%
BM_memcpy_8T/512                     67373      66588         +1.2%
BM_memcpy_8T/4k                    5148462    5254897         -2.1%
BM_memcpy_8T/5k                    7660989    7799058         -1.8%
BM_memcpy_12T/2                       3.50       2.40        +31.4%
BM_memcpy_12T/8                       12.4       7.55        +39.1%
BM_memcpy_12T/64                       374        378         -1.1%
BM_memcpy_12T/512                    67132      66683         +0.7%
BM_memcpy_12T/4k                   5185125    5292920         -2.1%
BM_memcpy_12T/5k                   7717284    7942684         -2.9%
BM_slicingSmallPieces_1T/2            47.3       47.5         +0.4%
BM_slicingSmallPieces_1T/8            53.6       52.3         +2.4%
BM_slicingSmallPieces_1T/64            491        476         +3.1%
BM_slicingSmallPieces_1T/512         21734      18814        +13.4%
BM_slicingSmallPieces_1T/4k         394660     396760         -0.5%
BM_slicingSmallPieces_1T/5k         218722     209244         +4.3%
BM_slicingSmallPieces_2T/2            80.7       79.9         +1.0%
BM_slicingSmallPieces_2T/8            54.2       53.1         +2.0%
BM_slicingSmallPieces_2T/64            497        477         +4.0%
BM_slicingSmallPieces_2T/512         21732      18822        +13.4%
BM_slicingSmallPieces_2T/4k         392885     390490         +0.6%
BM_slicingSmallPieces_2T/5k         221988     208678         +6.0%
BM_slicingSmallPieces_4T/2            80.8       80.1         +0.9%
BM_slicingSmallPieces_4T/8            54.1       53.2         +1.7%
BM_slicingSmallPieces_4T/64            493        476         +3.4%
BM_slicingSmallPieces_4T/512         21702      18758        +13.6%
BM_slicingSmallPieces_4T/4k         393962     404023         -2.6%
BM_slicingSmallPieces_4T/5k         249667     211732        +15.2%
BM_slicingSmallPieces_6T/2            80.5       80.1         +0.5%
BM_slicingSmallPieces_6T/8            54.4       53.4         +1.8%
BM_slicingSmallPieces_6T/64            488        478         +2.0%
BM_slicingSmallPieces_6T/512         21719      18841        +13.3%
BM_slicingSmallPieces_6T/4k         394950     397583         -0.7%
BM_slicingSmallPieces_6T/5k         223080     210148         +5.8%
BM_slicingSmallPieces_8T/2            81.2       80.4         +1.0%
BM_slicingSmallPieces_8T/8            58.1       53.5         +7.9%
BM_slicingSmallPieces_8T/64            489        480         +1.8%
BM_slicingSmallPieces_8T/512         21586      18798        +12.9%
BM_slicingSmallPieces_8T/4k         394592     400165         -1.4%
BM_slicingSmallPieces_8T/5k         219688     208301         +5.2%
BM_slicingSmallPieces_12T/2           80.2       79.8         +0.7%
BM_slicingSmallPieces_12T/8           54.4       53.4         +1.8%
BM_slicingSmallPieces_12T/64           488        476         +2.5%
BM_slicingSmallPieces_12T/512        21931      18831        +14.1%
BM_slicingSmallPieces_12T/4k        393962     396541         -0.7%
BM_slicingSmallPieces_12T/5k        218803     207965         +5.0%
---
 Eigen/src/Core/util/Memory.h                  | 61 +++++++++++++++----
 .../CXX11/src/Tensor/TensorContraction.h      |  2 +-
 .../CXX11/src/Tensor/TensorDeviceDefault.h    |  2 +-
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorFFT.h        |  4 +-
 5 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index c634d7ea0..6b8e307c8 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -63,7 +63,7 @@ namespace Eigen {
 namespace internal {
 
-EIGEN_DEVICE_FUNC
+EIGEN_DEVICE_FUNC
 inline void throw_std_bad_alloc()
 {
 #ifdef EIGEN_EXCEPTIONS
@@ -74,6 +74,41 @@ inline void throw_std_bad_alloc()
 #endif
 }
 
+EIGEN_DEVICE_FUNC
+inline void fast_memcpy(void* dst, const void* src, size_t size) {
+#if defined(__CUDA__) || defined(__ANDROID__)
+  ::memcpy(dst, src, size);
+#else
+  switch(size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#ifdef EIGEN_OS_LINUX
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+    default: memmove(dst, src, size); break;
+#else
+    default: memcpy(dst, src, size); break;
+#endif
+  }
+#endif
+}
+
 /*****************************************************************************
 *** Implementation of handmade aligned functions                          ***
 *****************************************************************************/
@@ -114,7 +149,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   void *previous_aligned = static_cast(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
-
+
   *(reinterpret_cast(aligned) - 1) = original;
   return aligned;
 }
@@ -142,7 +177,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
-#else
+#else
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
@@ -471,8 +506,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index
 }
 
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
- */
-template
+ */
+template
 inline Index first_multiple(Index size, Index base)
 {
   return ((size+base-1)/base)*base;
@@ -502,7 +537,7 @@ template struct smart_copy_helper {
   { std::copy(start, end, target); }
 };
 
-// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
 template struct smart_memmove_helper;
 
 template void smart_memmove(const T* start, const T* end, T* target)
@@ -522,15 +557,15 @@ template struct smart_memmove_helper {
 template struct smart_memmove_helper {
   static inline void run(const T* start, const T* end, T* target)
-  {
+  {
     if (UIntPtr(target) < UIntPtr(start))
     {
       std::copy(start, end, target);
     }
-    else
+    else
     {
       std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
-      std::copy_backward(start, end, target + count);
+      std::copy_backward(start, end, target + count);
     }
   }
 };
@@ -603,7 +638,7 @@ template void swap(scoped_array &a,scoped_array &b)
 {
   std::swap(a.ptr(),b.ptr());
 }
-
+
 } // end namespace internal
 
 /** \internal
@@ -622,7 +657,7 @@ template void swap(scoped_array &a,scoped_array &b)
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
   */
 #ifdef EIGEN_ALLOCA
-
+
   #if EIGEN_DEFAULT_ALIGN_BYTES>0
     // We always manually re-align the result of EIGEN_ALLOCA.
     // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
@@ -645,7 +680,7 @@ template void swap(scoped_array &a,scoped_array &b)
     Eigen::internal::check_size_for_overflow(SIZE); \
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
     Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
-
+
 #endif
@@ -701,7 +736,7 @@
   * Example:
   * \code
   * // Matrix4f requires 16 bytes alignment:
-  * std::map< int, Matrix4f, std::less,
+  * std::map< int, Matrix4f, std::less,
   *           aligned_allocator > > my_map_mat4;
   * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
   * std::map< int, Vector3f > my_map_vec3;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 442c14fac..39012b937 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index
   } else {
     // Naive memcpy calls
     for (Index col = 0; col < cols; ++col) {
-      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+      internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
     }
   }
 }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6cb2..b133781ae 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -22,7 +22,7 @@ struct DefaultDevice {
     internal::aligned_free(buffer);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
+    internal::fast_memcpy(dst, src, n);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 16180ca69..facdea735 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ struct ThreadPoolDevice {
   }
 
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
+    internal::fast_memcpy(dst, src, n);
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 08eb5595a..f060191ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator, D
       // get data into line_buf
       const Index stride = m_strides[dim];
       if (stride == 1) {
-        memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+        m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
       } else {
         Index offset = base_offset;
         for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator, D
       // write back
       if (FFTDir == FFT_FORWARD && stride == 1) {
-        memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+        m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
       } else {
         Index offset = base_offset;
         const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
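
A minimal sketch of the idea behind fast_memcpy above, for readers skimming
the patch. This is not the Eigen code itself; the function name and the
handful of sizes are illustrative only. The point is that dispatching on a
small runtime size turns each memcpy call into one with a compile-time
constant length, which the compiler expands inline instead of calling libc,
while large copies fall through to memmove (faster than memcpy on Linux per
the numbers above).

    #include <cstring>
    #include <cstddef>

    // Sketch only: a few representative sizes; the patch enumerates 1..16.
    inline void copy_small_or_large(void* dst, const void* src, std::size_t size) {
      switch (size) {
        case 4:  std::memcpy(dst, src, 4);  break;   // inlined: one 32-bit move
        case 8:  std::memcpy(dst, src, 8);  break;   // inlined: one 64-bit move
        case 16: std::memcpy(dst, src, 16); break;   // inlined: one 128-bit move
        default: std::memmove(dst, src, size); break; // large copies
      }
    }
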
From 3be5ee2352423427c95b133ed749f4d9316fe135 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Tue, 24 Jan 2017 14:22:49 -0800
Subject: [PATCH 2/5] Update copy helper to use fast_memcpy.

---
 Eigen/src/Core/util/Memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 6b8e307c8..572b1fe69 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -528,7 +528,7 @@ template struct smart_copy_helper {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    memcpy(target, start, size);
+    fast_memcpy(target, start, size);
   }
 };

From d06a48959abac6369336d3873d46aee78f8fbec2 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 25 Jan 2017 15:27:13 +0100
Subject: [PATCH 3/5] bug #1383: Fix regression from 3.2 with LinSpaced(n,0,n-1) with n==0.

---
 Eigen/src/Core/functors/NullaryFunctors.h |  2 +-
 test/nullary.cpp                          | 24 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h
index 0311d9035..60c5f7f2a 100644
--- a/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/Eigen/src/Core/functors/NullaryFunctors.h
@@ -93,7 +93,7 @@ struct linspaced_op_impl
   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
     m_low(low),
     m_multiplier((high-low)/convert_index(num_steps<=1 ? 1 : num_steps-1)),
-    m_divisor(convert_index(num_steps+high-low)/(high-low+1)),
+    m_divisor(convert_index(num_steps+high-low)/((high-low+1)==0?1:(high-low+1))),
     m_use_divisor((high+1)<(low+num_steps))
   {}
diff --git a/test/nullary.cpp b/test/nullary.cpp
index 351d26e74..bb0cea937 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -152,6 +152,30 @@ void testVectorType(const VectorType& base)
     m.tail(size-1).setLinSpaced(low, high);
     VERIFY_IS_APPROX(m(size-1), high);
   }
+
+  // regression test for bug 1383 (LinSpaced with empty size/range)
+  {
+    Index n0 = VectorType::SizeAtCompileTime==Dynamic ? 0 : VectorType::SizeAtCompileTime;
+    low = internal::random();
+    m = VectorType::LinSpaced(n0,low,low-1);
+    VERIFY(m.size()==n0);
+
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,0,Scalar(n0-1)).sum(),Scalar(0));
+      VERIFY_IS_EQUAL(VectorType::LinSpaced(n0,low,low-1).sum(),Scalar(0));
+    }
+
+    m.setLinSpaced(n0,0,Scalar(n0-1));
+    VERIFY(m.size()==n0);
+    m.setLinSpaced(n0,low,low-1);
+    VERIFY(m.size()==n0);
+
+    // empty range only:
+    VERIFY_IS_APPROX(VectorType::LinSpaced(size,low,low),VectorType::Constant(size,low));
+    m.setLinSpaced(size,low,low);
+    VERIFY_IS_APPROX(m,VectorType::Constant(size,low));
+  }
 }

 template

From 296d24be4dd3c700089d1d9182a843c60d68019c Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 25 Jan 2017 17:39:01 +0100
Subject: [PATCH 4/5] bug #1381: fix sparse.diagonal() used as an rvalue.

The problem was that if "sparse" is not const, then sparse.diagonal() must
have the LvalueBit flag, meaning that sparse.diagonal().coeff(i) must return
a const reference, const Scalar&. However, sparse::coeff() cannot return a
reference for a non-existing zero coefficient. The trick is to return a
reference to a local member of the evaluator.
---
 Eigen/src/Core/CoreEvaluators.h             |  4 +--
 Eigen/src/Core/util/XprHelper.h             |  2 +-
 Eigen/src/SparseCore/SparseCompressedBase.h | 38 +++++++++++++++------
 test/sparse_basic.cpp                       |  4 +++
 4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index 24fc7835b..412f5a661 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1613,9 +1613,7 @@ struct evaluator >
   {
   }
 
   typedef typename XprType::Scalar Scalar;
-  // FIXME having to check whether ArgType is sparse here i not very nice.
-  typedef typename internal::conditional::value,
-                                          typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index) const
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index b94dd61ff..d5e565cb8 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -638,7 +638,7 @@ struct plain_constant_type
 template struct is_lvalue
 {
-  enum { value = !bool(is_const::value) &&
+  enum { value = (!bool(is_const::value)) &&
                  bool(traits::Flags & LvalueBit) };
 };
diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h
index d32a8df40..e0b3c22b6 100644
--- a/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -295,11 +295,11 @@ struct evaluator >
     Flags = Derived::Flags
   };
 
-  evaluator() : m_matrix(0)
+  evaluator() : m_matrix(0), m_zero(0)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
-  explicit evaluator(const Derived &mat) : m_matrix(&mat)
+  explicit evaluator(const Derived &mat) : m_matrix(&mat), m_zero(0)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -312,26 +312,42 @@ struct evaluator >
   operator const Derived&() const { return *m_matrix; }
 
   typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType;
-  Scalar coeff(Index row, Index col) const
-  { return m_matrix->coeff(row,col); }
-
+  const Scalar& coeff(Index row, Index col) const
+  {
+    Index p = find(row,col);
+
+    if(p==Dynamic)
+      return m_zero;
+    else
+      return m_matrix->const_cast_derived().valuePtr()[p];
+  }
 
   Scalar& coeffRef(Index row, Index col)
   {
+    Index p = find(row,col);
+    eigen_assert(p!=Dynamic && "written coefficient does not exist");
+    return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+protected:
+
+  Index find(Index row, Index col) const
+  {
     eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols());
-
+
     const Index outer = Derived::IsRowMajor ? row : col;
     const Index inner = Derived::IsRowMajor ? col : row;
 
     Index start = m_matrix->outerIndexPtr()[outer];
     Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
-    eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
-    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
-    eigen_assert((pinnerIndexPtr()[p]==inner) && "written coefficient does not exist");
-    return m_matrix->const_cast_derived().valuePtr()[p];
+    eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
+
+    return ((pinnerIndexPtr()[p]==inner)) ? p : Dynamic;
   }
 
   const Derived *m_matrix;
+  const Scalar m_zero;
 };
 
 }
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp
index 208a66f12..91b7cb335 100644
--- a/test/sparse_basic.cpp
+++ b/test/sparse_basic.cpp
@@ -485,6 +485,10 @@ template void sparse_basic(const SparseMatrixType& re
     SparseMatrixType m2(rows, cols);
     initSparse(density, refMat2, m2);
     VERIFY_IS_APPROX(m2.diagonal(), refMat2.diagonal().eval());
+    DenseVector d = m2.diagonal();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
+    d = m2.diagonal().array();
+    VERIFY_IS_APPROX(d, refMat2.diagonal().eval());
     VERIFY_IS_APPROX(const_cast(m2).diagonal(), refMat2.diagonal().eval());
 
     initSparse(density, refMat2, m2, ForceNonZeroDiag);
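
A minimal usage sketch of the situation patch 4 addresses, mirroring the new
test above. The function name is illustrative; the Eigen calls are standard
3.3-era API. With a non-const sparse matrix, diagonal() is an lvalue
expression, so reading it coefficient-wise must work even though most
diagonal entries are not stored; the fixed evaluator returns a reference to
its internal zero (m_zero) for those.

    #include <Eigen/Dense>
    #include <Eigen/SparseCore>

    void sparse_diagonal_as_rvalue() {
      Eigen::SparseMatrix<double> m(3, 3);
      m.insert(1, 1) = 2.0;              // only one stored diagonal entry
      m.makeCompressed();
      Eigen::VectorXd d = m.diagonal();  // (0,0) and (2,2) are read as zeros
    }
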
From 850ca961d28df99a0ba44bd8bf034ac08e39686e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 25 Jan 2017 18:13:53 +0100
Subject: [PATCH 5/5] bug #1383: fix regression in LinSpaced for integers and high

   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
     m_low(low),
     m_multiplier((high-low)/convert_index(num_steps<=1 ? 1 : num_steps-1)),
-    m_divisor(convert_index(num_steps+high-low)/((high-low+1)==0?1:(high-low+1))),
-    m_use_divisor((high+1)<(low+num_steps))
+    m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))),
+    m_use_divisor(num_steps>1 && (numext::abs(high-low)+1)
diff --git a/test/nullary.cpp b/test/nullary.cpp
index bb0cea937..acd55506e 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -175,6 +175,21 @@ void testVectorType(const VectorType& base)
     VERIFY_IS_APPROX(VectorType::LinSpaced(size,low,low),VectorType::Constant(size,low));
     m.setLinSpaced(size,low,low);
     VERIFY_IS_APPROX(m,VectorType::Constant(size,low));
+
+    if(NumTraits::IsInteger)
+    {
+      VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+size-1)), VectorType::LinSpaced(size,Scalar(low+size-1),low).reverse() );
+
+      if(VectorType::SizeAtCompileTime==Dynamic)
+      {
+        // Check negative multiplicator path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size,low,Scalar(low+(size-1)*k)), VectorType::LinSpaced(size,Scalar(low+(size-1)*k),low).reverse() );
+        // Check negative divisor path:
+        for(Index k=1; k<5; ++k)
+          VERIFY_IS_APPROX( VectorType::LinSpaced(size*k,low,Scalar(low+size-1)), VectorType::LinSpaced(size*k,Scalar(low+size-1),low).reverse() );
+      }
+    }
   }
 }
@@ -222,7 +237,8 @@ void test_nullary()
     CALL_SUBTEST_8( testVectorType(Matrix()) );
     CALL_SUBTEST_8( testVectorType(Matrix()) );
 
-    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,300))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(1,10))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random(9,300))) );
     CALL_SUBTEST_9( testVectorType(Matrix()) );
   }
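
A minimal usage sketch of the LinSpaced cases exercised by patches 3 and 5,
matching the tests above. The function name is illustrative; the calls are
standard Eigen API. An empty range must not trigger a division by zero, and
an integer range counting downwards must mirror the corresponding upward one.

    #include <Eigen/Dense>

    void linspaced_cases() {
      // LinSpaced(n,0,n-1) with n==0: the empty case fixed by patch 3.
      Eigen::VectorXi empty = Eigen::VectorXi::LinSpaced(0, 0, -1);
      // Ascending and descending integer ranges; after patch 5 the
      // descending one equals the ascending one reversed.
      Eigen::VectorXi up   = Eigen::VectorXi::LinSpaced(5, 0, 4);  // 0 1 2 3 4
      Eigen::VectorXi down = Eigen::VectorXi::LinSpaced(5, 4, 0);  // 4 3 2 1 0
    }
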