Enable the use of the packet API to evaluate tensor broadcasts. This speeds things up quite a bit:

Before: BM_broadcasting/10 500000 3690 27.10 MFlops/s BM_broadcasting/80 500000 4014 1594.24 MFlops/s BM_broadcasting/640 100000 14770 27731.35 MFlops/s BM_broadcasting/4K 5000 632711 39512.48 MFlops/s After: BM_broadcasting/10 500000 4287 23.33 MFlops/s BM_broadcasting/80 500000 4455 1436.41 MFlops/s BM_broadcasting/640 200000 10195 40173.01 MFlops/s BM_broadcasting/4K 5000 423746 58997.57 MFlops/s
2025-07-05 20:55:12 +08:00 · 2016-05-17 09:24:35 -07:00 · 2016-05-17 09:24:35 -07:00 · e7e64c3277
commit e7e64c3277
parent 5fa27574dd
1 changed files with 1 additions and 1 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  enum {
-    IsAligned = false,
+    IsAligned = true,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = false