From 45e67a6fda1f6d292d632f643c95aef83dd41073 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Fri, 8 Oct 2021 11:30:09 -0700
Subject: [PATCH] Use reinterpret_cast on GPU for bit_cast.

This seems to be the recommended approach for doing type punning in CUDA.
See for example
- https://stackoverflow.com/questions/47037104/cuda-type-punning-memcpy-vs-ub-union
- https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
(the latter puns a double to an `int2`). The issue is that for CUDA, the
`memcpy` is not elided, and ends up being an expensive operation. We already
have similar `reinterpret_cast`s across the Eigen codebase for GPU (as does
TensorFlow).
---
 Eigen/src/Core/NumTraits.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index dfe6fe34f..bb55919d1 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -93,10 +93,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
 #endif
 
   EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
-  Tgt tgt;
-  EIGEN_USING_STD(memcpy)
-  memcpy(&tgt, &src, sizeof(Tgt));
-  return tgt;
+
+  // On GPU, the standard memcpy approach is not elided, actually producing an
+  // expensive memcpy. The standard (as used by the CUDA library, and suggested
+  // in multiple forums) seems to be to violate strict aliasing rules.
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+  return *reinterpret_cast<const Tgt*>(&src);
+  #else
+  Tgt tgt;
+  EIGEN_USING_STD(memcpy)
+  memcpy(&tgt, &src, sizeof(Tgt));
+  return tgt;
+  #endif
 }
 
 } // namespace numext