diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 278689915..e647b3609 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -284,7 +284,13 @@ struct TensorEvaluator, Device> if (static_cast(Layout) == static_cast(ColMajor)) { if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); + #else return m_impl.template packet(index); + #endif } else if (oneByN && !nByOne) { return packetNByOne(index); } else if (!oneByN && nByOne) { @@ -296,7 +302,12 @@ struct TensorEvaluator, Device> } } else { if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See above. + return m_impl.template packet(index); + #else return m_impl.template packet(index); + #endif } else if (oneByN && !nByOne) { return packetOneByN(index); } else if (!oneByN && nByOne) {