Extend support for Packet16b:

* Add ptranspose<*,4> to support matmul, and add a unit test for Matrix<bool> * Matrix<bool>.
* Work around a bug in slicing of Tensor<bool>.
* Add tensor tests.

This speeds up matmul for boolean matrices by roughly 7x for sizes 32 and above (the smallest benchmarked size, 8, regresses):

name                            old time/op             new time/op             delta
BM_MatMul<bool>/8                267ns ± 0%              479ns ± 0%  +79.25%          (p=0.008 n=5+5)
BM_MatMul<bool>/32              6.42µs ± 0%             0.87µs ± 0%  -86.50%          (p=0.008 n=5+5)
BM_MatMul<bool>/64              43.3µs ± 0%              5.9µs ± 0%  -86.42%          (p=0.008 n=5+5)
BM_MatMul<bool>/128              315µs ± 0%               44µs ± 0%  -85.98%          (p=0.008 n=5+5)
BM_MatMul<bool>/256             2.41ms ± 0%             0.34ms ± 0%  -85.68%          (p=0.008 n=5+5)
BM_MatMul<bool>/512             18.8ms ± 0%              2.7ms ± 0%  -85.53%          (p=0.008 n=5+5)
BM_MatMul<bool>/1k               149ms ± 0%               22ms ± 0%  -85.40%          (p=0.008 n=5+5)
Author: Rasmus Munk Larsen, 2020-04-24 17:29:25 -07:00
Parent: b47c777993
Commit: ab773c7e91
10 changed files with 267 additions and 162 deletions
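
For boolean matrices the product is evaluated in the (||, &&) semiring: padd becomes OR, pmul becomes AND, and predux becomes an "any" reduction, as the diffs below show. A minimal scalar sketch of the semantics the vectorized kernels implement (illustrative code, not part of the commit):

    #include <vector>

    // C(i,j) |= A(i,k) && B(k,j): '+' is OR and '*' is AND, so a boolean
    // matmul reports, for each (i,j), whether any k connects row i of A
    // to column j of B.
    void bool_matmul_ref(const std::vector<std::vector<bool> >& A,
                         const std::vector<std::vector<bool> >& B,
                         std::vector<std::vector<bool> >& C) {
      const size_t rows = A.size(), depth = B.size(), cols = B[0].size();
      for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
          for (size_t k = 0; k < depth; ++k)
            C[i][j] = C[i][j] || (A[i][k] && B[k][j]);
    }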

Eigen/src/Core/GenericPacketMath.h

@@ -179,6 +179,9 @@ preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); }
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 padd(const Packet& a, const Packet& b) { return a+b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+padd(const bool& a, const bool& b) { return a || b; }
 
 /** \internal \returns a - b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -196,6 +199,9 @@ pconj(const Packet& a) { return numext::conj(a); }
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmul(const Packet& a, const Packet& b) { return a*b; }
+// Avoid compiler warning for boolean algebra.
+template<> EIGEN_DEVICE_FUNC inline bool
+pmul(const bool& a, const bool& b) { return a && b; }
 
 /** \internal \returns a / b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
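
The bool specializations above exist because, for bool operands, a+b and a*b promote to int, and converting the int result back to bool can draw compiler warnings; for bools, || and && compute the same values without the promotion. A small illustration (warning behavior is an assumption and varies by compiler):

    bool add_generic(bool a, bool b) { return a + b; }  // int arithmetic, may warn
    bool add_bool(bool a, bool b)    { return a || b; } // same result, no promotion
    bool mul_bool(bool a, bool b)    { return a && b; } // bool '*' is AND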

Eigen/src/Core/arch/SSE/PacketMath.h

@@ -170,10 +170,10 @@ template<> struct packet_traits<bool> : default_packet_traits
     HasHalfPacket = 0,
 
     size=16,
 
-    HasAdd       = 0,
+    HasAdd       = 1,
     HasSub       = 0,
     HasShift     = 0,
-    HasMul       = 0,
+    HasMul       = 1,
     HasNegate    = 0,
     HasAbs       = 0,
     HasAbs2      = 0,
@@ -249,6 +249,8 @@ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
@@ -290,6 +292,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
@@ -646,6 +650,7 @@ template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
@@ -762,6 +767,7 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   Packet4i tmp0 = _mm_hadd_epi32(a,a);
   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
 }
+
 #else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
@@ -769,8 +775,22 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
 #endif
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet16b tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return _mm_extract_epi64(tmp, 0) != 0;
+}
+#else
+template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
+}
+#endif
 
 // Other reduction functions:
 
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
@@ -987,6 +1007,19 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet16b,4>& kernel) {
+  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
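
The new ptranspose reads the four input packets as a 4x16 block made of four 4x4 tiles, and leaves the transpose of tile m in output packet m, which is the layout the gemm packing code consumes four rows at a time. A scalar model of the unpacklo/unpackhi sequence (illustrative only, based on my reading of the shuffles):

    #include <array>

    typedef std::array<std::array<unsigned char, 16>, 4> Block4x16;

    // out[m][4*g + r] = in[r][4*m + g]: output packet m holds the
    // transpose of 4x4 tile m (input columns 4m..4m+3).
    Block4x16 transpose16b_model(const Block4x16& in) {
      Block4x16 out = Block4x16();
      for (int m = 0; m < 4; ++m)      // tile index / output packet
        for (int g = 0; g < 4; ++g)    // column within the tile
          for (int r = 0; r < 4; ++r)  // input packet (row of the block)
            out[m][4 * g + r] = in[r][4 * m + g];
      return out;
    }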

Eigen/src/Core/functors/BinaryFunctors.h

@@ -39,12 +39,12 @@ struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::padd(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux(a); }
 };
 
 template<typename LhsScalar,typename RhsScalar>
@@ -56,15 +56,9 @@ struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
   };
 };
 
-/** \internal
-  * \brief Template specialization to deprecate the summation of boolean expressions.
-  * This is required to solve Bug 426.
-  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
-  */
-template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
-  EIGEN_DEPRECATED
-  scalar_sum_op() {}
-};
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a || b; }
 
 /** \internal
@@ -83,12 +77,12 @@ struct scalar_product_op : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmul(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux_mul(a); }
 };
 
 template<typename LhsScalar,typename RhsScalar>
@@ -100,6 +94,10 @@ struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   };
 };
 
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a && b; }
+
 /** \internal
   * \brief Template functor to compute the conjugate product of two scalars
   *
@@ -116,11 +114,11 @@ struct scalar_conj_product_op : binary_op_base<LhsScalar,RhsScalar>
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
 
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
 };
 
 template<typename LhsScalar,typename RhsScalar>
@@ -141,12 +139,12 @@ struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmin(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux_min(a); }
 };
 
 template<typename LhsScalar,typename RhsScalar>
@@ -167,12 +165,12 @@ struct scalar_max_op : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmax(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
   { return internal::predux_max(a); }
 };
 
 template<typename LhsScalar,typename RhsScalar>

test/product_small.cpp

@@ -56,6 +56,31 @@ test_lazy_single(int rows, int cols, int depth)
   VERIFY_IS_APPROX(C+=A.lazyProduct(B), ref_prod(D,A,B));
 }
 
+template<typename T>
+void test_dynamic_exact()
+{
+  int rows  = internal::random<int>(1,64);
+  int cols  = internal::random<int>(1,64);
+  int depth = internal::random<int>(1,65);
+  typedef Matrix<T,Dynamic,Dynamic> MatrixX;
+  MatrixX A(rows,depth); A.setRandom();
+  MatrixX B(depth,cols); B.setRandom();
+  MatrixX C(rows,cols);  C.setRandom();
+  MatrixX D(C);
+  for(Index i=0;i<C.rows();++i)
+    for(Index j=0;j<C.cols();++j)
+      for(Index k=0;k<A.cols();++k)
+        D.coeffRef(i,j) |= A.coeff(i,k) & B.coeff(k,j);
+  C += A * B;
+  VERIFY_IS_EQUAL(C, D);
+
+  MatrixX E = B.transpose();
+  for(Index i=0;i<B.rows();++i)
+    for(Index j=0;j<B.cols();++j)
+      VERIFY_IS_EQUAL(B(i,j), E(j,i));
+}
+
 template<typename T, int Rows, int Cols, int Depth, int OC, int OA, int OB>
 typename internal::enable_if<  ( (Rows ==1&&Depth!=1&&OA==ColMajor)
                               || (Depth==1&&Rows !=1&&OA==RowMajor)
@@ -291,6 +316,8 @@ EIGEN_DECLARE_TEST(product_small)
     CALL_SUBTEST_6( bug_1311<3>() );
     CALL_SUBTEST_6( bug_1311<5>() );
+
+    CALL_SUBTEST_9( test_dynamic_exact<bool>() );
   }
 
   CALL_SUBTEST_6( product_small_regressions<0>() );

unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h

@@ -456,7 +456,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     // slice offsets and sizes.
     IsAligned         = false,
     PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                        // FIXME: Temporary workaround for bug in slicing of bool tensors.
+                        !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
     PreferBlockAccess = true,
     Layout            = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess       = false,
@@ -525,7 +527,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
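
The guard simply masks BlockAccess off whenever the scalar type is bool, so bool slices fall back to the non-block evaluation path until the block-evaluation bug is fixed. A minimal model of the trait logic (names here are illustrative, not Eigen's):

    #include <type_traits>

    // Block access is advertised only if the wrapped evaluator supports it
    // and the (possibly const-qualified) scalar type is not bool.
    template <typename Scalar, bool ArgBlockAccess>
    struct SliceBlockAccess {
      static const bool value =
          ArgBlockAccess &&
          !std::is_same<typename std::remove_const<Scalar>::type, bool>::value;
    };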

test/cxx11_tensor_block_eval.cpp

@@ -233,7 +233,7 @@ static void test_eval_tensor_binary_expr_block() {
   rhs.setRandom();
 
   VerifyBlockEvaluator<T, NumDims, Layout>(
-      lhs + rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+      lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
 }
 
 template <typename T, int NumDims, int Layout>
@@ -274,7 +274,7 @@ static void test_eval_tensor_broadcast() {
   // Check that desc.destination() memory is not shared between two broadcast
   // materializations.
   VerifyBlockEvaluator<T, NumDims, Layout>(
-      input.broadcast(bcast) + input.square().broadcast(bcast),
+      input.broadcast(bcast) * input.square().broadcast(bcast),
       [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
 }
 
@@ -509,7 +509,7 @@ static void test_eval_tensor_reshape_with_bcast() {
   DSizes<Index, 2> dims(dim, dim);
 
   VerifyBlockEvaluator<T, 2, Layout>(
-      lhs.reshape(reshapeLhs).broadcast(bcastLhs) +
+      lhs.reshape(reshapeLhs).broadcast(bcastLhs) *
           rhs.reshape(reshapeRhs).broadcast(bcastRhs),
       [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
 }
 
@@ -529,11 +529,11 @@ static void test_eval_tensor_forced_eval() {
   DSizes<Index, 2> dims(dim, dim);
 
   VerifyBlockEvaluator<T, 2, Layout>(
-      (lhs.broadcast(bcastLhs) + rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
 
   VerifyBlockEvaluator<T, 2, Layout>(
-      (lhs.broadcast(bcastLhs) + rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
       [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
 }
 
@@ -755,6 +755,38 @@ static void test_assign_to_tensor_shuffle() {
 #define CALL_SUBTEST_PART(PART) \
   CALL_SUBTEST_##PART
 
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
 #define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \
   CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
   CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
@@ -767,36 +799,38 @@ static void test_assign_to_tensor_shuffle() {
   CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
   CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
 
-#define CALL_SUBTESTS_LAYOUTS(PART, NAME) \
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME) \
   CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
-  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>()))
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
 
 EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
   // clang-format off
-  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
   CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_binary_expr_block);
   CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
-  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_broadcast);
-  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_reshape);
-  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_cast);
-  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_select);
-  CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_padding);
-  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_chipping);
-  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_generator);
-  CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_reverse);
-  CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_slice);
-  CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_shuffle);
-  CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_reshape_with_bcast);
-  CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_forced_eval);
-  CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_chipping_of_bcast);
-  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor);
-  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_reshape);
-  CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_chipping);
-  CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_slice);
-  CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_shuffle);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
 
   // Force CMake to split this test.
   // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8

test/cxx11_tensor_block_io.cpp

@@ -415,7 +415,15 @@ static void test_block_io_squeeze_ones() {
   CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
   CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
   CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
-  CALL_SUBTEST((NAME<float, 5, ColMajor>()))
+  CALL_SUBTEST((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, ColMajor>()))
 
 EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
   // clang-format off

test/cxx11_tensor_contraction.cpp

@@ -562,36 +562,40 @@ static void test_large_contraction_with_output_kernel() {
 EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
 {
-  CALL_SUBTEST(test_evals<ColMajor>());
-  CALL_SUBTEST(test_evals<RowMajor>());
-  CALL_SUBTEST(test_scalar<ColMajor>());
-  CALL_SUBTEST(test_scalar<RowMajor>());
-  CALL_SUBTEST(test_multidims<ColMajor>());
-  CALL_SUBTEST(test_multidims<RowMajor>());
-  CALL_SUBTEST(test_holes<ColMajor>());
-  CALL_SUBTEST(test_holes<RowMajor>());
-  CALL_SUBTEST(test_full_redux<ColMajor>());
-  CALL_SUBTEST(test_full_redux<RowMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
-  CALL_SUBTEST(test_expr<ColMajor>());
-  CALL_SUBTEST(test_expr<RowMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
-  CALL_SUBTEST(test_consistency<ColMajor>());
-  CALL_SUBTEST(test_consistency<RowMajor>());
-  CALL_SUBTEST(test_large_contraction<ColMajor>());
-  CALL_SUBTEST(test_large_contraction<RowMajor>());
-  CALL_SUBTEST(test_matrix_vector<ColMajor>());
-  CALL_SUBTEST(test_matrix_vector<RowMajor>());
-  CALL_SUBTEST(test_tensor_vector<ColMajor>());
-  CALL_SUBTEST(test_tensor_vector<RowMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
-  CALL_SUBTEST(test_tensor_product<ColMajor>());
-  CALL_SUBTEST(test_tensor_product<RowMajor>());
-  CALL_SUBTEST(test_const_inputs<ColMajor>());
-  CALL_SUBTEST(test_const_inputs<RowMajor>());
-  CALL_SUBTEST(test_large_contraction_with_output_kernel<ColMajor>());
-  CALL_SUBTEST(test_large_contraction_with_output_kernel<RowMajor>());
+  CALL_SUBTEST_1(test_evals<ColMajor>());
+  CALL_SUBTEST_1(test_evals<RowMajor>());
+  CALL_SUBTEST_1(test_scalar<ColMajor>());
+  CALL_SUBTEST_1(test_scalar<RowMajor>());
+  CALL_SUBTEST_2(test_multidims<ColMajor>());
+  CALL_SUBTEST_2(test_multidims<RowMajor>());
+  CALL_SUBTEST_2(test_holes<ColMajor>());
+  CALL_SUBTEST_2(test_holes<RowMajor>());
+  CALL_SUBTEST_3(test_full_redux<ColMajor>());
+  CALL_SUBTEST_3(test_full_redux<RowMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_expr<ColMajor>());
+  CALL_SUBTEST_4(test_expr<RowMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_consistency<ColMajor>());
+  CALL_SUBTEST_5(test_consistency<RowMajor>());
+  CALL_SUBTEST_5(test_large_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_large_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST_7(test_tensor_product<ColMajor>());
+  CALL_SUBTEST_7(test_tensor_product<RowMajor>());
+  CALL_SUBTEST_8(test_const_inputs<ColMajor>());
+  CALL_SUBTEST_8(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>());
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
 }

test/cxx11_tensor_expr.cpp

@@ -195,26 +195,23 @@ static void test_constants()
 static void test_boolean()
 {
-  Tensor<int, 1> vec(31);
-  std::iota(vec.data(), vec.data() + 31, 0);
+  const int kSize = 31;
+  Tensor<int, 1> vec(kSize);
+  std::iota(vec.data(), vec.data() + kSize, 0);
 
   // Test ||.
   Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
-  VERIFY_IS_EQUAL(bool1[0], true);
-  VERIFY_IS_EQUAL(bool1[1], false);
-  VERIFY_IS_EQUAL(bool1[2], false);
-  VERIFY_IS_EQUAL(bool1[3], false);
-  VERIFY_IS_EQUAL(bool1[4], false);
-  VERIFY_IS_EQUAL(bool1[5], true);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = i < 1 || i > 4;
+    VERIFY_IS_EQUAL(bool1[i], expected);
+  }
 
   // Test &&, including cast of operand vec.
   Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
-  VERIFY_IS_EQUAL(bool2[0], false);
-  VERIFY_IS_EQUAL(bool2[1], true);
-  VERIFY_IS_EQUAL(bool2[2], true);
-  VERIFY_IS_EQUAL(bool2[3], true);
-  VERIFY_IS_EQUAL(bool2[4], false);
-  VERIFY_IS_EQUAL(bool2[5], false);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = bool(i) && i < 4;
+    VERIFY_IS_EQUAL(bool2[i], expected);
+  }
 
   // Compilation tests:
   // Test Tensor<bool> against results of cast or comparison; verifies that

test/cxx11_tensor_morphing.cpp

@@ -113,19 +113,19 @@ static void test_reshape_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_simple_slice()
 {
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+  Tensor<T, 5, DataLayout> slice1(1,1,1,1,1);
   Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
   slice1 = tensor.slice(indices, sizes);
   VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
 
-  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+  Tensor<T, 5, DataLayout> slice2(1,1,2,2,3);
   Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
   slice2 = tensor.slice(indices2, sizes2);
@@ -138,20 +138,20 @@ static void test_simple_slice()
   }
 }
 
-template<typename=void>
+template<typename T>
 static void test_const_slice()
 {
-  const float b[1] = {42};
-  TensorMap<Tensor<const float, 1> > m(b, 1);
+  const T b[1] = {42};
+  TensorMap<Tensor<const T, 1> > m(b, 1);
   DSizes<DenseIndex, 1> offsets;
   offsets[0] = 0;
-  TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+  TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions()));
   VERIFY_IS_EQUAL(slice_ref(0), 42);
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_in_expr() {
-  typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+  typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx;
   Mtx m1(7,7);
   Mtx m2(3,3);
   m1.setRandom();
@@ -159,10 +159,10 @@ static void test_slice_in_expr() {
   Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
 
-  TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
-  TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
-  Tensor<float, 2, DataLayout> tensor3(3,1);
-  typedef Tensor<float, 1>::DimensionPair DimPair;
+  TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<T, 2, DataLayout> tensor3(3,1);
+  typedef typename Tensor<T, 1>::DimensionPair DimPair;
   array<DimPair, 1> contract_along{{DimPair(1, 0)}};
 
   Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
@@ -179,28 +179,28 @@ static void test_slice_in_expr() {
   }
 
   // Take an arbitrary slice of an arbitrarily sized tensor.
-  TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
-  Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
   for (int i = 0; i < 35; ++i) {
     VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_as_lvalue()
 {
-  Tensor<float, 3, DataLayout> tensor1(2,2,7);
+  Tensor<T, 3, DataLayout> tensor1(2,2,7);
   tensor1.setRandom();
-  Tensor<float, 3, DataLayout> tensor2(2,2,7);
+  Tensor<T, 3, DataLayout> tensor2(2,2,7);
   tensor2.setRandom();
-  Tensor<float, 3, DataLayout> tensor3(4,3,5);
+  Tensor<T, 3, DataLayout> tensor3(4,3,5);
   tensor3.setRandom();
-  Tensor<float, 3, DataLayout> tensor4(4,3,2);
+  Tensor<T, 3, DataLayout> tensor4(4,3,2);
   tensor4.setRandom();
-  Tensor<float, 3, DataLayout> tensor5(10,13,12);
+  Tensor<T, 3, DataLayout> tensor5(10,13,12);
   tensor5.setRandom();
 
-  Tensor<float, 3, DataLayout> result(4,5,7);
+  Tensor<T, 3, DataLayout> result(4,5,7);
   Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
   Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
   result.slice(first_slice, sizes12) = tensor1;
@@ -246,10 +246,10 @@ static void test_slice_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_raw_data()
 {
-  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+  Tensor<T, 4, DataLayout> tensor(3,5,7,11);
   tensor.setRandom();
 
   Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
@@ -276,7 +276,7 @@ static void test_slice_raw_data()
   extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
   auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
   VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
-  VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0));
 
   if (DataLayout == ColMajor) {
     offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
@@ -341,15 +341,15 @@ static void test_slice_raw_data()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice()
 {
-  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Tensor<T, 5, DataLayout> Tensor5f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
 
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
-  Tensor<float, 2, DataLayout> tensor2(7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 2, DataLayout> tensor2(7,11);
   tensor.setRandom();
   tensor2.setRandom();
@@ -435,13 +435,13 @@ static void test_strided_slice()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice_write()
 {
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
 
-  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11);
   tensor.setRandom();
   tensor2=tensor;
   Tensor2f slice(2,3);
@@ -461,15 +461,14 @@ static void test_strided_slice_write()
   }
 }
 
-
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_composition()
 {
-  Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+  Eigen::Tensor<T, 2, DataLayout> matrix(7, 11);
   matrix.setRandom();
 
   const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
-  Eigen::Tensor<float, 3, DataLayout> tensor =
+  Eigen::Tensor<T, 3, DataLayout> tensor =
       matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
 
   VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
@@ -481,29 +480,27 @@ static void test_composition()
   }
 }
 
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>()))
+
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
 {
   CALL_SUBTEST_1(test_simple_reshape<void>());
   CALL_SUBTEST_1(test_static_reshape<void>());
-  CALL_SUBTEST_1(test_reshape_in_expr<void>());
   CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_const_slice<float>());
 
-  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
-  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
-  CALL_SUBTEST_1(test_const_slice());
-  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
-  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
-  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
-  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
-  CALL_SUBTEST_7(test_composition<ColMajor>());
-  CALL_SUBTEST_7(test_composition<RowMajor>());
+  CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue);
+  CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data);
+  CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write);
+  CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition);
 }