Enable the use of the packet API to evaluate tensor broadcasts. This speeds things up quite a bit:

Before:
BM_broadcasting/10        500000       3690    27.10 MFlops/s
BM_broadcasting/80        500000       4014  1594.24 MFlops/s
BM_broadcasting/640       100000      14770 27731.35 MFlops/s
BM_broadcasting/4K          5000     632711 39512.48 MFlops/s
After:
BM_broadcasting/10        500000       4287    23.33 MFlops/s
BM_broadcasting/80        500000       4455  1436.41 MFlops/s
BM_broadcasting/640       200000      10195 40173.01 MFlops/s
BM_broadcasting/4K          5000     423746 58997.57 MFlops/s
This commit is contained in:
Benoit Steiner 2016-05-17 09:24:35 -07:00
parent 5fa27574dd
commit e7e64c3277

View File

@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum { enum {
IsAligned = false, IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout, Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false RawAccess = false