diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29d3896d8..c9d5af679 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,10 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
     MESSAGE("Enabling SSE2 in tests/examples")
   ENDIF(TEST_SSE2)
+  IF(TEST_SSE3)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
+    MESSAGE("Enabling SSE3 in tests/examples")
+  ENDIF(TEST_SSE3)
   IF(TEST_ALTIVEC)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
     MESSAGE("Enabling AltiVec in tests/examples")
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d5604824f..1c292d104 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -52,6 +52,9 @@ private:
     InnerSize = int(Derived::Flags)&RowMajorBit
               ? Derived::ColsAtCompileTime
               : Derived::RowsAtCompileTime,
+    InnerMaxSize = int(Derived::Flags)&RowMajorBit
+                 ? Derived::MaxColsAtCompileTime
+                 : Derived::MaxRowsAtCompileTime,
     PacketSize = ei_packet_traits::size
   };
@@ -60,7 +63,9 @@ private:
       && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
     MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
     MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MaySliceVectorize = MightVectorize && InnerSize==Dynamic
+    MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
+      want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
+      of a dynamic block in a fixed-size matrix */
   };

 public:
@@ -349,7 +354,7 @@ struct ei_assign_impl
 template
 struct ei_assign_impl
 {
-  inline static void run(Derived1 &dst, const Derived2 &src)
+  static void run(Derived1 &dst, const Derived2 &src)
   {
     const int size = Derived1::SizeAtCompileTime;
     const int packetSize = ei_packet_traits::size;
@@ -383,8 +388,30 @@ struct ei_assign_impl
 {
   static void run(Derived1 &dst, const Derived2 &src)
   {
-    //FIXME unimplemented, so for now we fall back to non-vectorized path
-    ei_assign_impl::run(dst, src);
+    const int packetSize = ei_packet_traits::size;
+    const bool rowMajor = Derived1::Flags&RowMajorBit;
+    const int innerSize = rowMajor ? dst.cols() : dst.rows();
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+
+    for(int i = 0; i < outerSize; i++)
+    {
+      // do the vectorizable part of the assignment
+      for (int index = 0; index(row, col, src.template packet(row, col));
+      }
+
+      // do the non-vectorizable part of the assignment
+      for (int index = alignedInnerSize; index
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
 struct ei_product_coeff_impl;
-template
+template
 struct ei_product_packet_impl;

 template class ei_product_eval_to_column_major;
@@ -188,10 +188,6 @@ template class Product
        Unroll ? InnerSize-1 : Dynamic, _LhsNested, _RhsNested> ScalarCoeffImpl;
-    typedef ei_product_packet_impl PacketCoeffImpl;
-
   public:

     template
@@ -232,7 +228,10 @@ template class Product
     const PacketScalar _packet(int row, int col) const
     {
       PacketScalar res;
-      PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+      ei_product_packet_impl
+        ::run(row, col, m_lhs, m_rhs, res);
       return res;
     }
@@ -356,63 +355,63 @@ struct ei_product_coeff_impl
 *** Packet path  ***
 *******************/

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res);
+    ei_product_packet_impl::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    ei_product_packet_impl::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
     for(int i = 1; i < lhs.cols(); i++)
-      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res);
+      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
-    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
     for(int i = 1; i < lhs.cols(); i++)
-      res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res);
+      res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res);
   }
 };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ffd6aebeb..03fa6bce5 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
 inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
 inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
-inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }

 inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
 inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f89a07e3b..14ed29a3d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,7 +11,7 @@ ENDIF(CMAKE_COMPILER_IS_GNUCXX)

 OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)

-# similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
+# similar to SET_TARGET_PROPERTIES but append the property instead of overwriting it
 MACRO(EI_ADD_TARGET_PROPERTY target prop value)

   GET_TARGET_PROPERTY(previous ${target} ${prop})
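
The Assign.h hunk is the heart of the patch: instead of falling back to a fully scalar copy, the slice-vectorized assignment walks each inner slice (a column of a column-major expression, a row of a row-major one), copies the leading multiple of the packet size with packet loads and stores, and finishes the remainder with scalar copies. A minimal standalone sketch of that loop structure, using plain float arrays and raw SSE intrinsics instead of Eigen's packet abstraction (the function name and signature here are illustrative, not Eigen API):

#include <emmintrin.h>

// Copy a dynamically-sized column-major block one column ("inner slice") at a time.
// Within each column, the first (rows / 4) * 4 entries are copied with 4-wide SSE
// packets and the tail is copied scalar by scalar, mirroring the loop in the hunk.
void copy_block_sliced(const float* src, float* dst, int rows, int cols, int stride)
{
  const int packetSize = 4;                                // floats per __m128
  const int alignedRows = (rows / packetSize) * packetSize;
  for (int j = 0; j < cols; ++j)                           // outer loop over slices
  {
    const float* srcCol = src + j * stride;
    float* dstCol = dst + j * stride;
    for (int i = 0; i < alignedRows; i += packetSize)      // vectorizable part
      _mm_storeu_ps(dstCol + i, _mm_loadu_ps(srcCol + i));
    for (int i = alignedRows; i < rows; ++i)               // non-vectorizable remainder
      dstCol[i] = srcCol[i];
  }
}

Unaligned loads and stores are used in this sketch because a dynamic block of a larger matrix carries no alignment guarantee; the hunk's (stripped) load-mode arguments presumably make the same choice.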
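The Product.h specializations compute one packet of a matrix product by broadcasting a single coefficient of one operand (ei_pset1) and multiply-accumulating it against a packet of the other operand (ei_pmul, then ei_pmadd); the new template parameter threaded through them (its arguments were stripped by the formatting above) is what the rewritten _packet() call now passes down. The same accumulation pattern written directly with SSE for row-major float operands; an illustrative sketch, not Eigen's implementation:

#include <emmintrin.h>

// Compute four adjacent result entries res[0..3] = sum_k lhs(row, k) * rhs(k, col + 0..3)
// for row-major float matrices, broadcasting each lhs coefficient and accumulating it
// against one rhs packet per depth step (depth is assumed to be at least 1).
__m128 product_packet(const float* lhs, const float* rhs, int row, int col,
                      int depth, int lhsStride, int rhsStride)
{
  __m128 res = _mm_mul_ps(_mm_set1_ps(lhs[row * lhsStride + 0]),
                          _mm_loadu_ps(rhs + 0 * rhsStride + col));
  for (int k = 1; k < depth; ++k)
    res = _mm_add_ps(res, _mm_mul_ps(_mm_set1_ps(lhs[row * lhsStride + k]),
                                     _mm_loadu_ps(rhs + k * rhsStride + col)));
  return res;
}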
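The PacketMath.h one-liner fixes a genuine bug: ei_pstoreu is the unaligned store (its float and double overloads call _mm_storeu_ps and _mm_storeu_pd), but the int overload was calling _mm_store_si128, which requires a 16-byte-aligned destination, instead of _mm_storeu_si128. A small standalone illustration of the difference, not part of the patch:

#include <emmintrin.h>

int main()
{
  alignas(16) int buffer[8] = {0};
  const __m128i v = _mm_set1_epi32(42);

  _mm_store_si128(reinterpret_cast<__m128i*>(buffer), v);        // fine: destination is 16-byte aligned
  _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer + 1), v);   // fine: storeu tolerates any alignment
  // _mm_store_si128(reinterpret_cast<__m128i*>(buffer + 1), v); // undefined behaviour: misaligned aligned store
  return 0;
}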