Improved support for CUDA devices.

Improved contractions on GPU
Benoit Steiner 2014-10-03 19:18:07 -07:00
parent 1269392822
commit af2e5995e2
3 changed files with 1237 additions and 8 deletions
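As context for what this commit enables (a sketch, not part of the diff): contracting two device-resident matrices through the tensor module on a GpuDevice. The IndexPair-based contraction API and the stream-pointer constructor are assumptions based on the Tensor module documentation and the GpuDevice accessors shown in the second hunk below; the helper name is hypothetical.

#include <cuda_runtime.h>
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical helper: contract an m x k by a k x n matrix on the GPU.
// d_lhs, d_rhs and d_out are assumed to be pre-allocated device buffers.
void contract_on_gpu(float* d_lhs, float* d_rhs, float* d_out,
                     int m, int k, int n) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  Eigen::GpuDevice gpu_device(&stream);  // GpuDevice wraps a CUDA stream

  // Map the raw device pointers as 2-D tensors (column-major by default).
  Eigen::TensorMap<Eigen::Tensor<float, 2> > lhs(d_lhs, m, k);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > rhs(d_rhs, k, n);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > out(d_out, m, n);

  // Contract lhs dimension 1 with rhs dimension 0 (a matrix product),
  // evaluated asynchronously on the device's stream.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{ Eigen::IndexPair<int>(1, 0) }};
  out.device(gpu_device) = lhs.contract(rhs, dims);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}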


@@ -44,6 +44,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"

File diff suppressed because it is too large.
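The suppressed diff is presumably the new TensorContractionCuda.h pulled in by the include above, which holds the GPU contraction kernels. As a rough illustration only (this is not the actual Eigen kernel, which is heavily tiled and far more elaborate), a naive stream-launched contraction over one index might look like:

__global__ void naive_contraction(const float* lhs, const float* rhs,
                                  float* out, int m, int k, int n) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < m && col < n) {
    float acc = 0.0f;
    for (int i = 0; i < k; ++i) {
      acc += lhs[row + i * m] * rhs[i + col * k];  // column-major layout
    }
    out[row + col * m] = acc;
  }
}

// Launching on the device's stream serializes the kernel with the async
// memcpy/memset calls issued by GpuDevice (see the hunk below):
//   dim3 block(16, 16);
//   dim3 grid((n + 15) / 16, (m + 15) / 16);
//   naive_contraction<<<grid, block, 0, gpu_device.stream()>>>(
//       d_lhs, d_rhs, d_out, m, k, n);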


@@ -104,19 +104,41 @@ struct GpuDevice {
   EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
-  /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+#ifndef __CUDA_ARCH__
     void* result;
-    cudaMalloc(&result, num_bytes);
+    assert(cudaMalloc(&result, num_bytes) == cudaSuccess);
+    assert(result != NULL);
     return result;
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
   }

-  /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    cudaFree(buffer);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+#ifndef __CUDA_ARCH__
+    assert(buffer != NULL);
+    assert(cudaFree(buffer) == cudaSuccess);
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
   }

-  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess);
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
   }

-  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-    cudaMemsetAsync(buffer, c, n, *stream_);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+    assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess);
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
   }
   EIGEN_STRONG_INLINE size_t numThreads() const {
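The guards above rely on nvcc compiling every EIGEN_DEVICE_FUNC (that is, __host__ __device__) function twice: __CUDA_ARCH__ is defined only during the device pass, so the branches calling the CUDA runtime API are compiled out of kernel code, and device-side callers hit the assert instead. A minimal standalone sketch of the same pattern (the function name is illustrative):

#include <cassert>
#include <cuda_runtime.h>

// Compiled twice by nvcc: once for the host, once for the device.
// __CUDA_ARCH__ is defined only in the device pass, so the cudaMalloc
// branch never appears in the generated kernel code.
__host__ __device__ void* guarded_alloc(size_t num_bytes) {
#ifndef __CUDA_ARCH__
  void* result = NULL;
  assert(cudaMalloc(&result, num_bytes) == cudaSuccess);
  return result;
#else
  assert(false && "allocation is only supported on the host");
  return NULL;
#endif
}

One caveat of this style: because the CUDA calls sit inside assert, they are compiled away entirely when NDEBUG is defined, so a release build would never invoke cudaMalloc here. Storing the cudaError_t in a variable and asserting on that keeps the call in all builds.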