Improved the speed of convolutions when running on cuda devices

commit 9ac3c821ea (parent 33c702c79f)
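For context, the commit targets Eigen's Tensor convolution expression. Below is a minimal usage sketch of the kind of expression that the new CUDA path evaluates; the tensor sizes, pointer names and the way the GpuDevice is obtained are illustrative assumptions, not part of the commit.

    // Minimal sketch (assumes EIGEN_USE_GPU, a CUDA toolchain, and that
    // in_ptr / k_ptr / out_ptr already point to device memory of the right sizes).
    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    void convolve_on_gpu(const Eigen::GpuDevice& gpu_device,
                         float* in_ptr, float* k_ptr, float* out_ptr) {
      // 128x128 planes, 16 of them; a 4x7 kernel convolved over the first two dimensions.
      Eigen::TensorMap<Eigen::Tensor<float, 3> > input(in_ptr, 128, 128, 16);
      Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel(k_ptr, 4, 7);
      // Each convolved dimension shrinks to input_dim - kernel_dim + 1.
      Eigen::TensorMap<Eigen::Tensor<float, 3> > output(out_ptr, 125, 122, 16);

      Eigen::array<ptrdiff_t, 2> dims;
      dims[0] = 0;
      dims[1] = 1;

      // Evaluating through a GpuDevice routes the expression to the CUDA kernels added here.
      output.device(gpu_device) = input.convolve(kernel, dims);
    }

With a GpuDevice, the evaluator added in this commit preloads the kernel coefficients, allocates a temporary result buffer when needed, and dispatches to a 1D, 2D or 3D CUDA kernel depending on how many dimensions are convolved.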
@@ -20,6 +20,126 @@ namespace Eigen {
   *
   */
 namespace internal {
+
+
+template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
+ public:
+  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+              const array<Index, NumKernelDims>& indices) {
+
+    array<Index, NumDims> dimensions = input_dims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = indices[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      dimensions[index] = result_dim;
+    }
+
+    array<Index, NumDims> inputStrides;
+    array<Index, NumDims> outputStrides;
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > 0) {
+        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+      } else {
+        inputStrides[0] = 1;
+        outputStrides[0] = 1;
+      }
+    }
+
+    array<Index, NumDims> cudaInputDimensions;
+    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> tmp = dimensions;
+    array<Index, NumDims> ordering;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      ordering[i] = indices[i];
+      tmp[indices[i]] = -1;
+      cudaInputDimensions[i] = input_dims[ordering[i]];
+      cudaOutputDimensions[i] = dimensions[ordering[i]];
+    }
+    int written = NumKernelDims;
+    for (int i = 0; i < NumDims; ++i) {
+      if (tmp[i] >= 0) {
+        ordering[written] = i;
+        cudaInputDimensions[written] = input_dims[i];
+        cudaOutputDimensions[written] = dimensions[i];
+        ++written;
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[ordering[i]];
+      m_outputStrides[i] = outputStrides[ordering[i]];
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > NumKernelDims) {
+        m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
+        m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
+      } else {
+        m_cudaInputStrides[i] = 1;
+        m_cudaOutputStrides[i] = 1;
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+    Index inputIndex = 0;
+    for (int d = NumDims - 1; d > NumKernelDims; --d) {
+      const Index idx = p / m_cudaInputStrides[d];
+      inputIndex += idx * m_inputStrides[d];
+      p -= idx * m_cudaInputStrides[d];
+    }
+    inputIndex += p * m_inputStrides[NumKernelDims];
+    return inputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+    Index outputIndex = 0;
+    for (int d = NumDims - 1; d > NumKernelDims; --d) {
+      const Index idx = p / m_cudaOutputStrides[d];
+      outputIndex += idx * m_outputStrides[d];
+      p -= idx * m_cudaOutputStrides[d];
+    }
+    outputIndex += p * m_outputStrides[NumKernelDims];
+    return outputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+    return i * m_inputStrides[0];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+    return i * m_outputStrides[0];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+    return i * m_inputStrides[0] + j*m_inputStrides[1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+    return i * m_outputStrides[0] + j * m_outputStrides[1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+    return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+    return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
+  }
+
+ private:
+  static const size_t NumDims = internal::array_size<InputDims>::value;
+  array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_cudaInputStrides;
+  array<Index, NumDims> m_cudaOutputStrides;
+};
+
+
+
 template<typename Dimensions, typename InputXprType, typename KernelXprType>
 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
 {
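The IndexMapper above reorders the tensor dimensions so that the convolved (kernel) dimensions come first, then precomputes one set of strides for the tensor's own layout and one for the reordered "cuda" layout; the map*Offset helpers decompose a flat index with one set of strides and recompose it with the other. A small self-contained sketch of that pattern follows; the dimensions, ordering and variable names are illustrative, not taken from the commit.

    // Decompose a flat index with one stride set, recompose with another (illustrative).
    #include <array>
    #include <cstddef>
    #include <iostream>

    int main() {
      // A 3D tensor with dimensions {5, 4, 3} and column-major strides {1, 5, 20}.
      const std::array<std::ptrdiff_t, 3> tensor_strides = {1, 5, 20};
      // The same coordinates enumerated in the permuted order {dim2, dim0, dim1},
      // i.e. dimensions {3, 5, 4} with strides {1, 3, 15}.
      const std::array<std::ptrdiff_t, 3> cuda_strides = {1, 3, 15};
      const std::array<int, 3> ordering = {2, 0, 1};  // permuted axis i is tensor axis ordering[i]

      // Map a flat index in the permuted layout back to a tensor offset,
      // mirroring the divide / subtract loop in mapCudaInputPlaneToTensorInputOffset.
      std::ptrdiff_t p = 37;
      std::ptrdiff_t tensor_offset = 0;
      for (int d = 2; d >= 1; --d) {
        const std::ptrdiff_t idx = p / cuda_strides[d];
        tensor_offset += idx * tensor_strides[ordering[d]];
        p -= idx * cuda_strides[d];
      }
      tensor_offset += p * tensor_strides[ordering[0]];
      std::cout << "tensor offset = " << tensor_offset << "\n";  // prints 32
      return 0;
    }

The CUDA kernels below rely on this so that a block can walk a contiguous range of "plane" indices while still reading from and writing to the correct, possibly non-contiguous, tensor offsets.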
@@ -75,15 +195,15 @@ class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, Input
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
         : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const Indices& indices() const { return m_indices; }
 
     /** \returns the nested expressions */
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<typename InputXprType::Nested>::type&
     inputExpression() const { return m_input_xpr; }
 
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const typename internal::remove_all<typename KernelXprType::Nested>::type&
     kernelExpression() const { return m_kernel_xpr; }
 
@@ -99,8 +219,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 {
   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
 
-  static const int NumDims = TensorEvaluator<InputArgType, Device>::Dimensions::count;
-  static const int KernelDims = internal::array_size<Indices>::value;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
   typedef typename XprType::Index Index;
   typedef DSizes<Index, NumDims> Dimensions;
 
@@ -111,7 +231,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions())
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
   {
     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
@@ -124,7 +244,8 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       }
     }
 
-    for (int i = 0; i < KernelDims; ++i) {
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
       const Index index = op.indices()[i];
       const Index input_dim = input_dims[index];
       const Index kernel_dim = kernel_dims[i];
@@ -148,6 +269,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     }
   }
 
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
@@ -195,7 +317,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
       const Index input = firstIndex + j * m_indexStride[DimIndex];
       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
-      if (DimIndex < KernelDims-1) {
+      if (DimIndex < NumKernelDims-1) {
        convolve(input, kernel, DimIndex+1, accum);
       } else {
 
@@ -207,17 +329,507 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   Scalar* data() const { return NULL; }
 
  private:
+  // No copy, no assignment
+  TensorEvaluator(const TensorEvaluator&);
+  TensorEvaluator& operator = (const TensorEvaluator&);
+
   array<Index, NumDims> m_inputStride;
   array<Index, NumDims> m_outputStride;
 
-  array<Index, KernelDims> m_indexStride;
-  array<Index, KernelDims> m_kernelStride;
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
   TensorEvaluator<InputArgType, Device> m_inputImpl;
   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
   Dimensions m_dimensions;
 };
 
 
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+    return StaticKernelSize;
+  }
+};
+template <>
+struct GetKernelSize<Eigen::Dynamic> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+    return kernelSize;
+  }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
+__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_plane = blockIdx.y * blockDim.y;
+  const int plane_stride = blockDim.y * gridDim.y;
+
+  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+    // Load inputs to shared memory
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.y * num_x_input;
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+    }
+
+    __syncthreads();
+
+    // Compute the convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+      const int kernel_offset = plane_kernel_offset + i;
+      float result = 0.0f;
+      #pragma unroll
+      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+        result += s[k + kernel_offset] * kernel[k];
+      }
+      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      buffer[tensor_index] = result;
+    }
+    __syncthreads();
+  }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
+  extern __shared__ float s[];
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+  const int num_y_output = last_y - first_y + 1;
+
+  const int first_plane = blockIdx.z * blockDim.z;
+  const int plane_stride = blockDim.z * gridDim.z;
+
+  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+    // Load inputs to shared memory
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+      const int input_offset = num_x_input * (j + plane_kernel_offset);
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        s[i + input_offset] = eval.coeff(tensor_index);
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    #pragma unroll
+    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+      #pragma unroll
+      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+        float result = 0.0f;
+        #pragma unroll
+        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+          const int kernel_offset = kernelSizeX * l;
+          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+          #pragma unroll
+          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+            result += s[k + input_offset] * kernel[k + kernel_offset];
+          }
+        }
+        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        buffer[tensor_index] = result;
+      }
+    }
+
+    __syncthreads();
+  }
+};
+
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
+  extern __shared__ float s[];
+
+  // Load inputs to shared memory
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + kernelSizeX;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + kernelSizeY;
+
+  const int first_z = blockIdx.z * maxZ;
+  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const int num_z_input = last_z - first_z + kernelSizeZ;
+
+  for (int p = 0; p < numPlanes; ++p) {
+
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = 0;
+
+    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+        }
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int num_z_output = last_z - first_z + 1;
+    const int num_y_output = last_y - first_y + 1;
+    const int num_x_output = last_x - first_x + 1;
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+
+    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+          float result = 0.0f;
+          for (int n = 0; n < kernelSizeZ; ++n) {
+            for (int m = 0; m < kernelSizeY; ++m) {
+              for (int l = 0; l < kernelSizeX; ++l) {
+                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+              }
+            }
+          }
+          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          buffer[tensor_index] = result;
+        }
+      }
+    }
+    __syncthreads();
+  }
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+    PacketAccess = false,
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  {
+    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  static unsigned int ceil(unsigned int num, unsigned int denom) {
+    const unsigned int rounded_toward_zero = num / denom;
+    if (num > rounded_toward_zero * denom) {
+      return rounded_toward_zero + 1;
+    }
+    return rounded_toward_zero;
+  }
+
+  void executeEval(Scalar* data) const {
+    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+    const int maxSharedMem = sharedMemPerBlock();
+    const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = getNumCudaMultiProcessors();
+    const int warpSize = 32;
+
+    switch (NumKernelDims) {
+      case 1: {
+        const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numP = dimensions().TotalSize() / numX;
+
+        int maxX;
+        dim3 block_size;
+        if (m_indices[0] == 0) {
+          // Maximum the reuse
+          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+          maxX = (std::min<int>)(inner_dim, numX);
+          const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          block_size.x = (std::min)(maxThreadsPerBlock, maxX);
+          block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+        }
+        else {
+          // Read as much as possible alongside the inner most dimension, that is the plane
+          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+          const int maxP = (std::min<int>)(inner_dim, numP);
+          maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+          block_size.x = (std::min)(warpSize, maxX);
+          block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+        }
+
+        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+        dim3 num_blocks(num_x_blocks, min<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 1> indices(m_indices[0]);
+        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+        internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch(kernel_size) {
+          case 4: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            break;
+          }
+          case 7: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            break;
+          }
+          default: {
+            EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+          }
+        }
+        cudaError_t error = cudaGetLastError();
+        assert(error == cudaSuccess);
+        break;
+      }
+
+      case 2: {
+        const int kernel_size_x = m_kernelImpl.dimensions()[0];
+        const int kernel_size_y = m_kernelImpl.dimensions()[1];
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numY = dimensions()[m_indices[1]];
+        const int numP = dimensions().TotalSize() / (numX*numY);
+
+        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+        // Snap maxX to warp size
+        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+        const int maxX = (std::min<int>)(inner_dim, numX);
+        const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+        dim3 block_size;
+        block_size.x = (std::min)(1024, maxX);
+        block_size.y = (std::min<int>)(1024/block_size.x, maxY);
+        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+
+        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int num_y_blocks = ceil(numY, maxY);
+        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+        dim3 num_blocks(num_x_blocks, num_y_blocks, min<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 2> indices(m_indices[0], m_indices[1]);
+        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
+        internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch (kernel_size_x) {
+          case 4: {
+            switch (kernel_size_y) {
+              case 7: {
+                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                break;
+              }
+              default: {
+                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          case 7: {
+            switch (kernel_size_y) {
+              case 4: {
+                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
+              }
+              default: {
+                EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          default: {
+            EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Eigen::Dynamic, Eigen::Dynamic> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            break;
+          }
+        }
+        cudaError_t error = cudaGetLastError();
+        assert(error == cudaSuccess);
+        break;
+      }
+
+      case 3: {
+        const int kernel_size_x = m_kernelImpl.dimensions()[0];
+        const int kernel_size_y = m_kernelImpl.dimensions()[1];
+        const int kernel_size_z = m_kernelImpl.dimensions()[2];
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numY = dimensions()[m_indices[1]];
+        const int numZ = dimensions()[m_indices[2]];
+        const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+        const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+        const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+        const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+        dim3 block_size;
+        block_size.x = (std::min)(32, maxX);
+        block_size.y = (std::min)(32, maxY);
+        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
+        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+        assert(shared_mem <= maxSharedMem);
+
+        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+        const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
+        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
+        internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims> <<<num_blocks, block_size, shared_mem, m_device.stream()>>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        cudaError_t error = cudaGetLastError();
+        assert(error == cudaSuccess);
+        break;
+      }
+
+      default: {
+        assert(false && "not supported yet");
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    assert(m_buf);
+    assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator& operator = (const TensorEvaluator&);
+
+  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+  KernelArgType m_kernelArg;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  Scalar* m_buf;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+
+  const GpuDevice& m_device;
+};
+#endif
+
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
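As a reading aid, the CUDA kernels above all follow the same tiling pattern: each block stages a tile of the input (tile width plus kernel size minus one elements per convolved dimension) in shared memory, synchronizes, and then the threads compute outputs from the staged values so that neighbouring outputs share their loads. A stripped-down, single-plane 1D version of that pattern is sketched below; it is simplified and illustrative, not part of the commit (one output per thread, names chosen here).

    #include <cuda_runtime.h>

    __global__ void conv1d_tile(const float* in, const float* kernel, float* out,
                                int numX, int kernelSize) {
      extern __shared__ float tile[];
      const int tileWidth = blockDim.x;             // one output per thread in this sketch
      const int first_x = blockIdx.x * tileWidth;

      // Stage tileWidth + kernelSize - 1 inputs so neighbouring outputs share their loads.
      for (int i = threadIdx.x; i < tileWidth + kernelSize - 1; i += blockDim.x) {
        const int x = first_x + i;
        tile[i] = (x < numX) ? in[x] : 0.0f;
      }
      __syncthreads();

      const int x = first_x + threadIdx.x;
      if (x <= numX - kernelSize) {                 // "valid" convolution, no padding
        float result = 0.0f;
        for (int k = 0; k < kernelSize; ++k) {
          result += tile[threadIdx.x + k] * kernel[k];
        }
        out[x] = result;
      }
    }

    // Launch sketch: the dynamic shared memory must hold the staged tile.
    // conv1d_tile<<<numBlocks, threads, (threads + kernelSize - 1) * sizeof(float)>>>(
    //     d_in, d_kernel, d_out, numX, kernelSize);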