From c285fda7f40ca161e6c8e66481d9a68e50613c48 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 8 Jul 2014 16:30:48 -0700
Subject: [PATCH] Extended the functionality of the TensorDeviceType classes

---
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 59 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index 142edda14..b9c8c19fe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -21,6 +21,12 @@ struct DefaultDevice {
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
 };
 
 
@@ -28,7 +34,7 @@
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS
 struct ThreadPoolDevice {
-  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
   size_t numThreads() const { return num_threads_; }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
@@ -37,6 +43,12 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
+  }
 
  private:
   // todo: NUMA, ...
@@ -47,20 +59,61 @@
 // GPU offloading
 #ifdef EIGEN_USE_GPU
+static int m_numMultiProcessors = 0;
+static int m_maxThreadsPerBlock = 0;
+static int m_maxThreadsPerMultiProcessor = 0;
+
+static inline int getNumCudaMultiProcessors() {
+  if (m_numMultiProcessors == 0) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
+    m_numMultiProcessors = deviceProp.multiProcessorCount;
+  }
+  return m_numMultiProcessors;
+}
+static inline int maxCudaThreadsPerBlock() {
+  if (m_maxThreadsPerBlock == 0) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    m_numMultiProcessors = deviceProp.multiProcessorCount;
+    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
+    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+  }
+  return m_maxThreadsPerBlock;
+}
+static inline int maxCudaThreadsPerMultiProcessor() {
+  if (m_maxThreadsPerMultiProcessor == 0) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    m_numMultiProcessors = deviceProp.multiProcessorCount;
+    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
+  }
+  return m_maxThreadsPerMultiProcessor;
+}
+
 struct GpuDevice {
   // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
   GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
 
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     void* result;
     cudaMalloc(&result, num_bytes);
     return result;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+  /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     cudaFree(buffer);
   }
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_);
+  }
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+    cudaMemsetAsync(buffer, c, n, *stream_);
+  }
 
  private:
   // TODO: multigpu.
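
The net effect of the patch is that DefaultDevice, ThreadPoolDevice, and GpuDevice
all expose the same allocate/deallocate/memcpy/memset surface, so evaluator code can
be written once against a generic device parameter. Below is a minimal sketch of
that pattern; the zero_fill and clone_buffer helpers (and the include path) are
illustrative assumptions, not part of the patch.

  // Sketch only: zero_fill and clone_buffer are hypothetical helpers.
  #include <unsupported/Eigen/CXX11/Tensor>  // include path may vary per checkout
  #include <cstddef>

  // Works for DefaultDevice, ThreadPoolDevice (with EIGEN_USE_THREADS), or
  // GpuDevice (with EIGEN_USE_GPU), since all three now expose memset/memcpy.
  template <typename Device>
  void zero_fill(const Device& device, void* dst, std::size_t n) {
    device.memset(dst, 0, n);    // ::memset on CPU, cudaMemsetAsync on GPU
  }

  template <typename Device>
  void clone_buffer(const Device& device, void* dst, const void* src, std::size_t n) {
    device.memcpy(dst, src, n);  // ::memcpy on CPU, cudaMemcpyAsync on GPU
  }

  int main() {
    Eigen::DefaultDevice device;
    const std::size_t bytes = 1024 * sizeof(float);
    void* a = device.allocate(bytes);
    void* b = device.allocate(bytes);
    zero_fill(device, a, bytes);
    clone_buffer(device, b, a, bytes);
    device.deallocate(b);
    device.deallocate(a);
  }

Note that on GpuDevice both calls are asynchronous on the device's stream, and its
memcpy issues a device-to-device copy; host/device transfers still need an explicit
cudaMemcpy with the appropriate direction flag.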
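The three static helpers cache the results of cudaGetDeviceProperties (for device 0
only), so kernel-launch sizing does not pay for a device query on every launch: the
first call populates all three statics, and later calls just read them. A hedged
sketch of the kind of launch configuration they enable follows; my_kernel, the
launch_scaled wrapper, and the occupancy heuristic are illustrative assumptions, and
the helpers are assumed visible in the Eigen namespace like the rest of the header.

  // Sketch only: my_kernel and launch_scaled are hypothetical.
  #include <algorithm>
  #include <cuda_runtime.h>

  __global__ void my_kernel(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;  // placeholder element-wise work
  }

  void launch_scaled(float* data, int n, cudaStream_t stream) {
    if (n == 0) return;
    const int block_size = Eigen::maxCudaThreadsPerBlock();
    // Enough blocks to fill every multiprocessor, capped by the problem size.
    const int blocks_for_occupancy = Eigen::getNumCudaMultiProcessors() *
        (Eigen::maxCudaThreadsPerMultiProcessor() / block_size);
    const int blocks_for_problem = (n + block_size - 1) / block_size;
    const int grid_size = std::max(1, std::min(blocks_for_occupancy, blocks_for_problem));
    my_kernel<<<grid_size, block_size, 0, stream>>>(data, n);
  }

Caching in file-static variables keeps the queries cheap, but it also hard-codes
device 0 and is not thread-safe on first use; both are trade-offs consistent with
the single-GPU scope that the TODO above already flags.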