Preserve the ability to vectorize the evaluation of an expression even when it involves a cast that isn't vectorized (e.g fp16 to float)

2025-09-28 17:23:14 +08:00 · 2016-05-26 14:37:09 -07:00 · 2016-05-26 14:37:09 -07:00 · 1a47844529
commit 1a47844529
parent 36369ab63c
1 changed files with 29 additions and 7 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>

  enum {
    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    PacketAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = false
  };
@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
-    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-    PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
-                    SrcCoeffRatio, TgtCoeffRatio> converter(m_impl);
-    return converter.template packet<LoadMode>(index);
+    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

  protected:
-    TensorEvaluator<ArgType, Device> m_impl;
+  template <int LoadMode, bool ActuallyVectorize>
+  struct PacketConv {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      internal::scalar_cast_op<SrcType, TargetType> converter;
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = converter(impl.coeff(index+i));
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template <int LoadMode>
+  struct PacketConv<LoadMode, true> {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+      return converter.template packet<LoadMode>(index);
+    }
+  };
+
+  TensorEvaluator<ArgType, Device> m_impl;
 };

 } // end namespace Eigen