diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h
index 297ed2456..55652db48 100644
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@@ -65,22 +65,8 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
-  // Here we evaluate to a temporary matrix tmp, which we then copy. The main purpose
-  // of this is to limit the number of instantiations of the template method evalTo<Destination>():
-  // we only instantiate for PlainMatrixType.
-  // Notice that this behaviour is specific to this operator in MatrixBase. The corresponding operator in class Matrix
-  // does not evaluate into a temporary first.
-  // TODO find a way to avoid evaluating into a temporary in the cases that matter. At least Block<> matters
-  // for the implementation of blocked algorithms.
-  // Should we:
-  //  - try a trick like for the products, where the destination is abstracted as an array with stride?
-  //  - or just add an operator in class Block, so we get a separate instantiation there (bad) but at least not more
-  //    than that, and at least that's easy to make work?
-  //  - or, since here we're talking about a compromise between code size and performance, let the user choose?
-  //    Not obvious: many users will never find out about this feature, and it's hard to find a good API.
-  PlainMatrixType tmp;
-  other.evalTo(tmp);
-  return derived() = tmp;
+  other.evalTo(derived());
+  return derived();
 }
 
 #endif // EIGEN_RETURNBYVALUE_H
diff --git a/Eigen/src/LU/LU.h b/Eigen/src/LU/LU.h
index 9ba6162f0..455a7d67e 100644
--- a/Eigen/src/LU/LU.h
+++ b/Eigen/src/LU/LU.h
@@ -504,25 +504,24 @@ struct ei_lu_kernel_impl : public ReturnByValue<ei_lu_kernel_impl<MatrixType> >
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   const LUType& m_lu;
-  int m_rank, m_dimker;
+  int m_rank, m_cols;
   
   ei_lu_kernel_impl(const LUType& lu)
     : m_lu(lu),
       m_rank(lu.rank()),
-      m_dimker(m_lu.matrixLU().cols() - m_rank) {}
+      m_cols(m_rank==lu.matrixLU().cols() ? 1 : lu.matrixLU().cols() - m_rank){}
 
   inline int rows() const { return m_lu.matrixLU().cols(); }
-  inline int cols() const { return m_dimker; }
+  inline int cols() const { return m_cols; }
 
   template<typename Dest> void evalTo(Dest& dst) const
   {
-    const int cols = m_lu.matrixLU().cols();
-    if(m_dimker == 0)
+    const int cols = m_lu.matrixLU().cols(), dimker = cols - m_rank;
+    if(dimker == 0)
     {
       // The Kernel is just {0}, so it doesn't have a basis properly speaking, but let's
       // avoid crashing/asserting as that depends on floating point calculations. Let's
       // just return a single column vector filled with zeros.
-      dst.resize(cols,1);
       dst.setZero();
       return;
     }
@@ -543,8 +542,6 @@ struct ei_lu_kernel_impl : public ReturnByValue<ei_lu_kernel_impl<MatrixType> >
       * independent vectors in Ker U.
       */
 
-    dst.resize(cols, m_dimker);
-
     Matrix<int, Dynamic, 1, 0, LUType::MaxSmallDimAtCompileTime, 1> pivots(m_rank);
     RealScalar premultiplied_threshold = m_lu.maxPivot() * m_lu.threshold();
     int p = 0;
@@ -569,15 +566,15 @@ struct ei_lu_kernel_impl : public ReturnByValue<ei_lu_kernel_impl<MatrixType> >
 
     m.corner(TopLeft, m_rank, m_rank)
      .template triangularView<UpperTriangular>().solveInPlace(
-        m.corner(TopRight, m_rank, m_dimker)
+        m.corner(TopRight, m_rank, dimker)
       );
 
     for(int i = m_rank-1; i >= 0; --i)
       m.col(i).swap(m.col(pivots.coeff(i)));
 
-    for(int i = 0; i < m_rank; ++i) dst.row(m_lu.permutationQ().coeff(i)) = -m.row(i).end(m_dimker);
+    for(int i = 0; i < m_rank; ++i) dst.row(m_lu.permutationQ().coeff(i)) = -m.row(i).end(dimker);
     for(int i = m_rank; i < cols; ++i) dst.row(m_lu.permutationQ().coeff(i)).setZero();
-    for(int k = 0; k < m_dimker; ++k) dst.coeffRef(m_lu.permutationQ().coeff(m_rank+k), k) = Scalar(1);
+    for(int k = 0; k < dimker; ++k) dst.coeffRef(m_lu.permutationQ().coeff(m_rank+k), k) = Scalar(1);
   }
 };
 
@@ -603,14 +600,16 @@ struct ei_lu_image_impl : public ReturnByValue<ei_lu_image_impl<MatrixType> >
   typedef LU<MatrixType> LUType;
   typedef typename MatrixType::RealScalar RealScalar;
   const LUType& m_lu;
-  int m_rank;
+  int m_rank, m_cols;
   const MatrixType& m_originalMatrix;
   
   ei_lu_image_impl(const LUType& lu, const MatrixType& originalMatrix)
-    : m_lu(lu), m_rank(lu.rank()), m_originalMatrix(originalMatrix) {}
+    : m_lu(lu), m_rank(lu.rank()),
+      m_cols(m_rank == 0 ? 1 : m_rank),
+      m_originalMatrix(originalMatrix) {}
 
-  inline int rows() const { return m_lu.matrixLU().cols(); }
-  inline int cols() const { return m_rank; }
+  inline int rows() const { return m_lu.matrixLU().rows(); }
+  inline int cols() const { return m_cols; }
 
   template<typename Dest> void evalTo(Dest& dst) const
   {
@@ -619,7 +618,6 @@ struct ei_lu_image_impl : public ReturnByValue<ei_lu_image_impl<MatrixType> >
       // The Image is just {0}, so it doesn't have a basis properly speaking, but let's
       // avoid crashing/asserting as that depends on floating point calculations. Let's
       // just return a single column vector filled with zeros.
-      dst.resize(m_originalMatrix.rows(), 1);
       dst.setZero();
       return;
     }
@@ -632,7 +630,6 @@ struct ei_lu_image_impl : public ReturnByValue<ei_lu_image_impl<MatrixType> >
         pivots.coeffRef(p++) = i;
     ei_assert(p == m_rank && "You hit a bug in Eigen! Please report (backtrace and matrix)!");
     
-    dst.resize(m_originalMatrix.rows(), m_rank);
     for(int i = 0; i < m_rank; ++i)
       dst.col(i) = m_originalMatrix.col(m_lu.permutationQ().coeff(pivots.coeff(i)));
   }
@@ -682,8 +679,6 @@ struct ei_lu_solve_impl : public ReturnByValue<ei_lu_solve_impl<MatrixType, Rhs>
     ei_assert(m_rhs.rows() == rows);
     const int smalldim = std::min(rows, cols);
 
-    dst.resize(m_lu.matrixLU().cols(), m_rhs.cols());
-
     if(nonzero_pivots == 0)
     {
       dst.setZero();
diff --git a/test/lu.cpp b/test/lu.cpp
index a08614e28..90755f59c 100644
--- a/test/lu.cpp
+++ b/test/lu.cpp
@@ -30,15 +30,43 @@ template<typename MatrixType> void lu_non_invertible()
   /* this test covers the following files:
      LU.h
   */
-  int rows = ei_random<int>(20,200), cols = ei_random<int>(20,200), cols2 = ei_random<int>(20,200);
+  int rows, cols, cols2;
+  if(MatrixType::RowsAtCompileTime==Dynamic)
+  {
+    rows = ei_random<int>(20,200);
+  }
+  else
+  {
+    rows = MatrixType::RowsAtCompileTime;
+  }
+  if(MatrixType::ColsAtCompileTime==Dynamic)
+  {
+    cols = ei_random<int>(20,200);
+    cols2 = ei_random<int>(20,200);
+  }
+  else
+  {
+    cols2 = cols = MatrixType::ColsAtCompileTime;
+  }
+
+  typedef typename ei_lu_kernel_impl<MatrixType>::ReturnMatrixType KernelMatrixType;
+  typedef typename ei_lu_image_impl <MatrixType>::ReturnMatrixType ImageMatrixType;
+  typedef Matrix<typename MatrixType::Scalar, Dynamic, Dynamic> DynamicMatrixType;
+  typedef Matrix<typename MatrixType::Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime>
+          CMatrixType;
+
   int rank = ei_random<int>(1, std::min(rows, cols)-1);
 
-  MatrixType m1(rows, cols), m2(cols, cols2), m3(rows, cols2), k(1,1);
+  // The image of the zero matrix should consist of a single (zero) column vector
+  VERIFY((MatrixType::Zero(rows,cols).lu().image(MatrixType::Zero(rows,cols)).cols() == 1));
+  
+  MatrixType m1(rows, cols), m3(rows, cols2);
+  CMatrixType m2(cols, cols2);
   createRandomMatrixOfRank(rank, rows, cols, m1);
 
   LU<MatrixType> lu(m1);
-  typename ei_lu_kernel_impl<MatrixType>::ReturnMatrixType m1kernel = lu.kernel();
-  typename ei_lu_image_impl <MatrixType>::ReturnMatrixType m1image  = lu.image(m1);
+  KernelMatrixType m1kernel = lu.kernel();
+  ImageMatrixType m1image  = lu.image(m1);
 
   // std::cerr << rank << " " << lu.rank() << std::endl;
   VERIFY(rank == lu.rank());
@@ -48,19 +76,18 @@ template<typename MatrixType> void lu_non_invertible()
   VERIFY(!lu.isSurjective());
   VERIFY((m1 * m1kernel).isMuchSmallerThan(m1));
   VERIFY(m1image.lu().rank() == rank);
-  MatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols());
+  DynamicMatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols());
   sidebyside << m1, m1image;
   VERIFY(sidebyside.lu().rank() == rank);
-  m2 = MatrixType::Random(cols,cols2);
+  m2 = CMatrixType::Random(cols,cols2);
   m3 = m1*m2;
-  m2 = MatrixType::Random(cols,cols2);
+  m2 = CMatrixType::Random(cols,cols2);
   // test that the code, which does resize(), may be applied to an xpr
   m2.block(0,0,m2.rows(),m2.cols()) = lu.solve(m3);
   VERIFY_IS_APPROX(m3, m1*m2);
   
-  typedef Matrix<typename MatrixType::Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
-  SquareMatrixType m4(rows, rows), m5(rows, rows);
-  createRandomMatrixOfRank(rows/2, rows, rows, m4);
+  CMatrixType m4(cols, cols), m5(cols, cols);
+  createRandomMatrixOfRank(cols/2, cols, cols, m4);
   VERIFY(!m4.computeInverseWithCheck(&m5));
 }
 
@@ -84,6 +111,7 @@ template<typename MatrixType> void lu_invertible()
 
   LU<MatrixType> lu(m1);
   VERIFY(0 == lu.dimensionOfKernel());
+  VERIFY(lu.kernel().cols() == 1); // the kernel() should consist of a single (zero) column vector
   VERIFY(size == lu.rank());
   VERIFY(lu.isInjective());
   VERIFY(lu.isSurjective());
@@ -125,6 +153,8 @@ template<typename MatrixType> void lu_verify_assert()
 void test_lu()
 {
   for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST( lu_non_invertible<Matrix3f>() );
+    CALL_SUBTEST( (lu_non_invertible<Matrix<double, 4, 6> >()) );
     CALL_SUBTEST( lu_non_invertible<MatrixXf>() );
     CALL_SUBTEST( lu_non_invertible<MatrixXd>() );
     CALL_SUBTEST( lu_non_invertible<MatrixXcf>() );