From de218b471dc2225d3e92670c8d5017ee372265e7 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Fri, 24 Sep 2021 20:48:01 -0700
Subject: [PATCH] Add -arch=<arch> argument for nvcc.

Without this flag, when compiling with nvcc, if the compute architecture of a card does
not exactly match any of those listed for `-gencode arch=compute_<arch>,code=sm_<arch>`,
then the kernel will fail to run with:
```
cudaErrorNoKernelImageForDevice: no kernel image is available for execution on the device.
```
This can happen, for example, when compiling with an older CUDA version
that does not support a newer architecture (e.g. T4 is `sm_75`, but CUDA
9.2 only supports up to `sm_70`).

With the `-arch=sm_<arch>` flag, where `<arch>` is the first entry of
`EIGEN_CUDA_COMPUTE_ARCH`, the code will compile and run at that
architecture.
---
 test/CMakeLists.txt             | 10 ++++++++++
 unsupported/test/CMakeLists.txt | 10 ++++++++++
 2 files changed, 20 insertions(+)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 906160d37..ad0959979 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -403,6 +403,16 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   else()
     set(CUDA_PROPAGATE_HOST_FLAGS OFF)
     set(NVCC_ARCH_FLAGS)
+    # Set -arch=sm_<arch>; otherwise, if the GPU does not exactly match one of
+    # those in the arch list for -gencode, the kernels will fail to run with
+    #    cudaErrorNoKernelImageForDevice
+    # This can happen with newer cards (e.g. sm_75) when compiling with older
+    # versions of nvcc (e.g. 9.2) that do not support their specific arch.
+    list(LENGTH EIGEN_CUDA_COMPUTE_ARCH EIGEN_CUDA_COMPUTE_ARCH_SIZE)
+    if(EIGEN_CUDA_COMPUTE_ARCH_SIZE)
+      list(GET EIGEN_CUDA_COMPUTE_ARCH 0 EIGEN_CUDA_COMPUTE_DEFAULT)
+      set(NVCC_ARCH_FLAGS " -arch=sm_${EIGEN_CUDA_COMPUTE_DEFAULT}")
+    endif()
     foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
       string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
     endforeach()
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index a962e3f6c..8aa524d54 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -303,6 +303,16 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   else()
     set(CUDA_PROPAGATE_HOST_FLAGS OFF)
     set(NVCC_ARCH_FLAGS)
+    # Set -arch=sm_<arch>; otherwise, if the GPU does not exactly match one of
+    # those in the arch list for -gencode, the kernels will fail to run with
+    #    cudaErrorNoKernelImageForDevice
+    # This can happen with newer cards (e.g. sm_75) when compiling with older
+    # versions of nvcc (e.g. 9.2) that do not support their specific arch.
+    list(LENGTH EIGEN_CUDA_COMPUTE_ARCH EIGEN_CUDA_COMPUTE_ARCH_SIZE)
+    if(EIGEN_CUDA_COMPUTE_ARCH_SIZE)
+      list(GET EIGEN_CUDA_COMPUTE_ARCH 0 EIGEN_CUDA_COMPUTE_DEFAULT)
+      set(NVCC_ARCH_FLAGS " -arch=sm_${EIGEN_CUDA_COMPUTE_DEFAULT}")
+    endif()
     foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
       string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
     endforeach()