Extended the functionality of the TensorDeviceType classes

2025-09-11 17:03:15 +08:00 · 2014-07-08 16:30:48 -07:00 · 2014-07-08 16:30:48 -07:00 · c285fda7f4
commit c285fda7f4
parent 7d53633e05
1 changed files with 56 additions and 3 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@ -21,6 +21,12 @@ struct DefaultDevice {
  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    internal::aligned_free(buffer);
  }
  // Copies n bytes from src to dst by forwarding to the global memcpy. The
  // leading :: is required: an unqualified call would name this member itself.
  // The pointer returned by ::memcpy is discarded.
  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    (void)::memcpy(dst, src, n);
  }
  // Sets the first n bytes of buffer to the value c by forwarding to the
  // global memset (qualified with :: so the call does not name this member).
  // The pointer returned by ::memset is discarded.
  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    (void)::memset(buffer, c, n);
  }
 };
@ -37,6 +43,12 @@ struct ThreadPoolDevice {
  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    internal::aligned_free(buffer);
  }
  // Copies n bytes from src to dst with the global memcpy, on the calling
  // thread. The :: qualifier keeps the call from naming this member function.
  // The pointer returned by ::memcpy is discarded.
  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    (void)::memcpy(dst, src, n);
  }
  // Sets the first n bytes of buffer to the value c with the global memset,
  // on the calling thread. The :: qualifier keeps the call from naming this
  // member function. The pointer returned by ::memset is discarded.
  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    (void)::memset(buffer, c, n);
  }
 private:
  // todo: NUMA, ...
@ -47,20 +59,61 @@ struct ThreadPoolDevice {
 // GPU offloading
 #ifdef EIGEN_USE_GPU
 static int m_numMultiProcessors = 0;
 static int m_maxThreadsPerBlock = 0;
 static int m_maxThreadsPerMultiProcessor = 0;
 static inline int getNumCudaMultiProcessors() {
  if (m_numMultiProcessors == 0) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
    m_numMultiProcessors = deviceProp.multiProcessorCount;
  }
  return m_numMultiProcessors;
 }
 static inline int maxCudaThreadsPerBlock() {
  if (m_maxThreadsPerBlock == 0) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    m_numMultiProcessors = deviceProp.multiProcessorCount;
    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
  }
  return m_maxThreadsPerBlock;
 }
 static inline int maxCudaThreadsPerMultiProcessor() {
  if (m_maxThreadsPerBlock == 0) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    m_numMultiProcessors = deviceProp.multiProcessorCount;
    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
  }
  return m_maxThreadsPerMultiProcessor;
 }
 struct GpuDevice {
  // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
  // Stores the caller-supplied stream pointer; the pointer is checked with
  // eigen_assert after it has been stored.
  GpuDevice(const cudaStream_t* stream)
      : stream_(stream) {
    eigen_assert(stream);
  }
  // Returns a const reference to the stream this device was constructed with
  // (dereferences the stored pointer).
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return *stream_;
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    // Requests num_bytes of device memory via cudaMalloc. The status is
    // checked with eigen_assert, and a null pointer is returned on failure so
    // callers never receive an indeterminate pointer.
    void* result = nullptr;
    const cudaError_t err = cudaMalloc(&result, num_bytes);
    eigen_assert(err == cudaSuccess);
    return err == cudaSuccess ? result : nullptr;
  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+  /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    // Releases buffer via cudaFree and checks the returned status with
    // eigen_assert instead of silently dropping it.
    const cudaError_t err = cudaFree(buffer);
    (void)err;  // Keeps err referenced in case eigen_assert expands to nothing.
    eigen_assert(err == cudaSuccess);
  }
  // Enqueues an n-byte copy from src to dst on the wrapped stream with
  // cudaMemcpyAsync. The copy kind is cudaMemcpyDeviceToDevice, so both
  // pointers are passed to the runtime as device addresses. The call is
  // asynchronous: the stream must be synchronized before the result is read.
  // The returned status is checked with eigen_assert.
  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
    const cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_);
    (void)err;  // Keeps err referenced in case eigen_assert expands to nothing.
    eigen_assert(err == cudaSuccess);
  }
  // Enqueues a fill of the first n bytes of buffer with the byte value c on
  // the wrapped stream via cudaMemsetAsync. The fill is byte-wise and
  // asynchronous: the stream must be synchronized before the result is read.
  // The returned status is checked with eigen_assert.
  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
    const cudaError_t err = cudaMemsetAsync(buffer, c, n, *stream_);
    (void)err;  // Keeps err referenced in case eigen_assert expands to nothing.
    eigen_assert(err == cudaSuccess);
  }
 private:
  // TODO: multigpu.