Improved support for vectorization of 16-bit floats

2025-06-30 02:05:18 +08:00 · 2016-06-09 08:22:27 -07:00 · 2016-06-09 08:22:27 -07:00 · aa33446dac
commit aa33446dac
parent 15890c304e
3 changed files with 95 additions and 4 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };


+template<typename Reducer, typename Device>
+struct reducer_traits {  // Primary template: the traits seen for any (Reducer, Device) pair without a specialization.
+  enum {
+    Cost = 1,  // Default cost constant.
+    PacketAccess = false  // Packet access is disabled by default.
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@ -119,6 +127,15 @@ template <typename T> struct SumReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {  // Traits of SumReducer<T> evaluated for a given Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost is taken from the scalar type's AddCost.
+    PacketAccess = PacketType<T, Device>::type::HasAdd  // Follows HasAdd of the device packet type. NOTE(review): assumes that packet type declares HasAdd -- confirm for float4/half2.
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
    DenseIndex packetCount_;
 };

+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {  // Traits of MeanReducer<T> evaluated for a given Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost is taken from the scalar type's AddCost.
+    PacketAccess = PacketType<T, Device>::type::HasAdd && !NumTraits<T>::IsInteger  // Integer types are excluded, matching MeanReducer<T>::PacketAccess. NOTE(review): assumes the packet type declares HasAdd -- confirm.
+  };
+};
+
+
 template <typename T> struct MaxReducer
 {
  static const bool PacketAccess = packet_traits<T>::HasMax;
@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {  // Traits of MaxReducer<T> evaluated for a given Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost reuses the scalar type's AddCost.
+    PacketAccess = PacketType<T, Device>::type::HasMax  // Follows HasMax of the device packet type. NOTE(review): assumes that packet type declares HasMax -- confirm.
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
  static const bool PacketAccess = packet_traits<T>::HasMin;
@ -228,6 +263,14 @@ template <typename T> struct MinReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {  // Traits of MinReducer<T> evaluated for a given Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost reuses the scalar type's AddCost.
+    PacketAccess = PacketType<T, Device>::type::HasMin  // Follows HasMin of the device packet type. NOTE(review): assumes that packet type declares HasMin -- confirm.
+  };
+};
+

 template <typename T> struct ProdReducer
 {
@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {  // Traits of ProdReducer<T> evaluated for a given Device.
+  enum {
+    Cost = NumTraits<T>::MulCost,  // Cost is taken from the scalar type's MulCost.
+    PacketAccess = PacketType<T, Device>::type::HasMul  // Follows HasMul of the device packet type. NOTE(review): assumes that packet type declares HasMul -- confirm.
+  };
+};
+

 struct AndReducer
 {
@ -280,6 +331,15 @@ struct AndReducer
  }
 };

+template <typename Device>
+struct reducer_traits<AndReducer, Device> {  // Traits of AndReducer for any Device.
+  enum {
+    Cost = 1,  // Fixed cost constant.
+    PacketAccess = false  // Packet access is never enabled for this reducer.
+  };
+};
+
+
 struct OrReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;
@ -295,6 +355,15 @@ struct OrReducer {
  }
 };

+template <typename Device>
+struct reducer_traits<OrReducer, Device> {  // Traits of OrReducer for any Device.
+  enum {
+    Cost = 1,  // Fixed cost constant.
+    PacketAccess = false  // Matches OrReducer::PacketAccess, which is also false.
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {  // Traits of ArgMaxTupleReducer<T> for any Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost reuses NumTraits<T>::AddCost for the reducer's type argument T.
+    PacketAccess = false  // Packet access is never enabled for this reducer.
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
  static const bool PacketAccess = false;
@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
  }
 };

+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {  // Traits of ArgMinTupleReducer<T> for any Device.
+  enum {
+    Cost = NumTraits<T>::AddCost,  // Cost reuses NumTraits<T>::AddCost for the reducer's type argument T.
+    PacketAccess = false  // Matches ArgMinTupleReducer<T>::PacketAccess, which is also false.
+  };
+};
+

 // Random number generation
 namespace {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@ -54,6 +54,11 @@ struct PacketType {

 // For CUDA packet types when using a GpuDevice
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <>
+  struct PacketType<half, GpuDevice> {  // Packet description for half on GpuDevice. NOTE(review): the enclosing #if checks only EIGEN_USE_GPU and __CUDACC__; confirm whether EIGEN_HAS_CUDA_FP16 is also required for half2.
+  typedef half2 type;  // The packet type is half2.
+  static const int size = 2;  // Presumably the number of half values held by one half2 packet -- confirm.
+ };
 template <>
 struct PacketType<float, GpuDevice> {
  typedef float4 type;
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@ -331,7 +331,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  #ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
@ -346,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
      return;
    }

-    FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
 };

@ -608,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
@ -627,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
      return true;
    }

-    return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
 };