From b86e01332162dfa8479bf5f35f474ea4efe5b079 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 21 Oct 2021 08:11:02 -0700 Subject: [PATCH] Revert bit_cast to use memcpy for CUDA. To elide the memcpy, we need to first load the `src` value into registers by making a local copy. This avoids the need to resort to potential UB by using `reinterpret_cast`. This change doesn't seem to affect CPU (at least not with gcc/clang). With optimizations on, the copy is also elided. --- Eigen/src/Core/NumTraits.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index bb55919d1..63ba4168c 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -91,20 +91,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value, THIS_TYPE_IS_NOT_SUPPORTED); #endif - EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); - // On GPU, the standard memcpy approach is not elided, actually producing an - // expensive memcpy. The standard (as used by the CUDA library, and suggested - // in multiple forums) seems to be to violate strict aliasing rules. - #if defined(EIGEN_GPU_COMPILE_PHASE) - return *reinterpret_cast<const Tgt*>(&src); - #else - Tgt tgt; - EIGEN_USING_STD(memcpy) - memcpy(&tgt, &src, sizeof(Tgt)); - return tgt; - #endif + Tgt tgt; + // Load src into registers first. This allows the memcpy to be elided by CUDA. + const Src staged = src; + EIGEN_USING_STD(memcpy) + memcpy(&tgt, &staged, sizeof(Tgt)); + return tgt; } } // namespace numext