Refined the #ifdef __CUDACC__ guard so that attempting to compile GPU code with a non-CUDA compiler results in a linking error instead of bogus code.
This commit is contained in:
parent  ac99b49249
commit  9ea39ce13c
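The pattern behind the change: declarations that GPU users must be able to see are guarded only by EIGEN_USE_GPU, while definitions that require the CUDA toolchain stay behind __CUDACC__, which only nvcc defines. A minimal sketch of the idea, using invented names rather than Eigen's actual code:

    #if defined(EIGEN_USE_GPU)

    // Visible to every compiler, so host-only translation units can
    // reference the interface.
    template <typename Expression>
    struct GpuRunner {  // hypothetical stand-in for TensorExecutor
      static void run(const Expression& expr);
    };

    #if defined(__CUDACC__)
    // Compiled only by nvcc. A host-only translation unit that calls
    // GpuRunner<E>::run() compiles against the declaration above, but if
    // no nvcc-built object file supplies this definition, the build stops
    // at link time with an undefined reference -- a loud, early failure
    // instead of silently producing bogus device code.
    template <typename Expression>
    void GpuRunner<Expression>::run(const Expression& expr) {
      // ... compute the launch configuration and launch the kernel ...
    }
    #endif  // __CUDACC__

    #endif  // EIGEN_USE_GPU

Previously both halves sat behind the combined guard, so building GPU code with a non-CUDA compiler simply dropped the GPU path instead of failing. The diff below applies this split throughout the tensor device and executor code.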
@@ -106,7 +106,7 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice>
 #endif
 
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
 template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
 {
   public:
@@ -287,6 +287,7 @@ class StreamInterface {
   virtual void deallocate(void* buffer) const = 0;
 };
 
+#if defined(__CUDACC__)
 static cudaDeviceProp* m_deviceProperties;
 static bool m_devicePropInitialized = false;
 
@@ -362,7 +363,7 @@ class CudaStreamDevice : public StreamInterface {
   const cudaStream_t* stream_;
   int device_;
 };
-
+#endif  // __CUDACC__
 
 struct GpuDevice {
   // The StreamInterface is not owned: the caller is
@@ -450,7 +451,7 @@ struct GpuDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef __CUDA_ARCH__
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
     cudaError_t err = cudaStreamSynchronize(stream_->stream());
     assert(err == cudaSuccess);
 #else
@@ -477,8 +478,12 @@ struct GpuDevice {
   // This function checks if the CUDA runtime recorded an error for the
   // underlying stream device.
   inline bool ok() const {
+#ifdef __CUDACC__
     cudaError_t error = cudaStreamQuery(stream_->stream());
     return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
   }
 
  private:
@@ -493,10 +498,12 @@ struct GpuDevice {
 
 
 // FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
 static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
   cudaError_t status = cudaDeviceSetSharedMemConfig(config);
   assert(status == cudaSuccess);
 }
+#endif
 
 #endif
 
@@ -149,7 +149,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, false> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+template <typename Expression>
+class TensorExecutor<Expression, GpuDevice, true> {
+ public:
+  typedef typename Expression::Index Index;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(__CUDACC__)
+
 template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
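The hunk above is the core of the executor change: both TensorExecutor specializations are now declared whenever EIGEN_USE_GPU is defined, while the __global__ kernels and the run() definitions that follow remain behind __CUDACC__. The intended failure mode for a host-only build, sketched with invented file names and compiler invocations:

    // user.cpp, compiled by a plain host compiler: __CUDACC__ is not
    // defined, so only the run() declarations above are visible.
    //
    //   g++ -DEIGEN_USE_GPU -c user.cpp   // compiles: declarations suffice
    //   g++ user.o -o app                 // link error: undefined reference
    //                                     // to internal::TensorExecutor<...>::run
    //   nvcc -x cu -DEIGEN_USE_GPU ...    // links: nvcc emits the definitions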
@@ -193,14 +210,10 @@ EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
   }
 }
 
 
-template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, false>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, false>::run(const Expression& expr, const GpuDevice& device)
+{
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
@@ -211,16 +224,13 @@ class TensorExecutor<Expression, GpuDevice, false>
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
-    evaluator.cleanup();
-  }
-};
+  evaluator.cleanup();
+}
 
 
-template<typename Expression>
-class TensorExecutor<Expression, GpuDevice, true>
-{
- public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const GpuDevice& device)
-  {
+/*static*/
+template <typename Expression>
+inline void TensorExecutor<Expression, GpuDevice, true>::run(const Expression& expr, const GpuDevice& device)
+{
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign)
@@ -231,10 +241,10 @@ class TensorExecutor<Expression, GpuDevice, true>
       LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
     }
-    evaluator.cleanup();
-  }
-};
+  evaluator.cleanup();
+}
 
-#endif
+#endif  // __CUDACC__
+#endif  // EIGEN_USE_GPU
 
 } // end namespace internal