From 45e67a6fda1f6d292d632f643c95aef83dd41073 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Fri, 8 Oct 2021 11:30:09 -0700
Subject: [PATCH] Use reinterpret_cast on GPU for bit_cast.

This seems to be the recommended approach for doing type punning in CUDA.
See for example
- https://stackoverflow.com/questions/47037104/cuda-type-punning-memcpy-vs-ub-union
- https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
(the latter puns a double to an `int2`). The issue is that for CUDA, the
`memcpy` is not elided, and ends up being an expensive operation. We already
have similar `reinterpret_cast`s across the Eigen codebase for GPU (as does
TensorFlow).
---
 Eigen/src/Core/NumTraits.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index dfe6fe34f..bb55919d1 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -93,10 +93,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
 #endif
 
   EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
-  Tgt tgt;
-  EIGEN_USING_STD(memcpy)
-  memcpy(&tgt, &src, sizeof(Tgt));
-  return tgt;
+
+  // On GPU, the standard memcpy approach is not elided, actually producing an
+  // expensive memcpy. The standard (as used by the CUDA library, and suggested
+  // in multiple forums) seems to be to violate strict aliasing rules.
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
+  return *reinterpret_cast<const Tgt*>(&src);
+  #else
+  Tgt tgt;
+  EIGEN_USING_STD(memcpy)
+  memcpy(&tgt, &src, sizeof(Tgt));
+  return tgt;
+  #endif
 }
 
 } // namespace numext