mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 19:59:05 +08:00
Re-enabled the optimized reduction CUDA code.
This commit is contained in:
parent
b557662e58
commit
780623261e
@ -527,7 +527,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Attempt to use an optimized reduction.
|
// Attempt to use an optimized reduction.
|
||||||
#if 0
|
|
||||||
else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
|
else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) {
|
||||||
bool reducing_inner_dims = true;
|
bool reducing_inner_dims = true;
|
||||||
for (int i = 0; i < NumReducedDims; ++i) {
|
for (int i = 0; i < NumReducedDims; ++i) {
|
||||||
@ -563,7 +562,6 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,7 +126,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
|
|||||||
const int block_size = 256;
|
const int block_size = 256;
|
||||||
const int num_per_thread = 128;
|
const int num_per_thread = 128;
|
||||||
const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
|
const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
|
||||||
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
|
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
|
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -222,7 +222,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
|
|||||||
const int num_per_thread = 128;
|
const int num_per_thread = 128;
|
||||||
const int num_blocks = 32;
|
const int num_blocks = 32;
|
||||||
|
|
||||||
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread>),
|
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
|
||||||
num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
num_blocks, block_size, block_size*sizeof(float), device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -279,7 +279,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
|
|||||||
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
||||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||||
|
|
||||||
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread>),
|
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
|
||||||
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user