Mirror of https://gitlab.com/libeigen/eigen.git
Merged from trunk
commit 67fcf47ecb
@@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, c
   to[stride*1] = from.y;
 }
 
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(abs(a.x), abs(a.y));
+}
+
+
 template<> EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
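The hunk above adds pfirst (return the first lane) and pabs (lane-wise absolute value) for the CUDA packet types float4 and double2. Below is a minimal standalone sketch of what these primitives compute, in plain CUDA with illustrative names (abs4, demo) rather than Eigen's internals:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Lane-wise absolute value, mirroring pabs<float4>.
    __device__ float4 abs4(const float4& a) {
      return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
    }

    __global__ void demo() {
      float4 v = make_float4(-1.f, 2.f, -3.f, 4.f);
      float4 a = abs4(v);   // {1, 2, 3, 4}
      float first = a.x;    // what pfirst<float4> returns: the first lane
      printf("abs = %f %f %f %f, first = %f\n", a.x, a.y, a.z, a.w, first);
    }

    int main() {
      demo<<<1, 1>>>();
      cudaDeviceSynchronize();
      return 0;
    }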
@@ -297,7 +297,9 @@ namespace Eigen {
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
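EIGEN_ALIGN_TO_BOUNDARY now prefers nvcc's __align__ keyword when __CUDACC__ is defined, falling back to __attribute__((aligned)) or __declspec(align) otherwise. A hedged sketch of the same compiler dispatch applied to a user type; MY_ALIGN and Vec4 are illustrative stand-ins, not Eigen's macro:

    #include <cstddef>
    #include <cstdio>

    #if defined(__CUDACC__)
      #define MY_ALIGN(n) __align__(n)
    #elif defined(__GNUC__) || defined(__PGI) || defined(__IBMCPP__) || defined(__ARMCC_VERSION)
      #define MY_ALIGN(n) __attribute__((aligned(n)))
    #elif defined(_MSC_VER)
      #define MY_ALIGN(n) __declspec(align(n))
    #else
      #define MY_ALIGN(n)
    #endif

    // 16-byte alignment lets 128-bit packet loads/stores hit this struct directly.
    struct MY_ALIGN(16) Vec4 { float data[4]; };

    int main() {
      printf("alignof(Vec4) = %zu\n", (size_t)alignof(Vec4));
      return 0;
    }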
@@ -149,26 +149,26 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
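EigenMetaKernel is now templated on the index type instead of hard-coding unsigned int, so tensors with more than 2^31 coefficients can be addressed. A standalone sketch of the same grid-stride pattern with a templated index; grid_stride_fill and the launch geometry are illustrative, not Eigen's kernel:

    #include <cuda_runtime.h>

    // Grid-stride loop over `size` elements; with a 64-bit Index the loop
    // counter cannot overflow on very large tensors.
    template <typename Index>
    __global__ void grid_stride_fill(float* out, Index size, float value) {
      const Index first = static_cast<Index>(blockIdx.x) * blockDim.x + threadIdx.x;
      const Index step  = static_cast<Index>(blockDim.x) * gridDim.x;
      for (Index i = first; i < size; i += step) {
        out[i] = value;
      }
    }

    int main() {
      const long long n = 1 << 20;
      float* d = nullptr;
      cudaMalloc(&d, n * sizeof(float));
      grid_stride_fill<long long><<<128, 256>>>(d, n, 1.0f);
      cudaDeviceSynchronize();
      cudaFree(d);
      return 0;
    }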
@@ -193,7 +193,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
       const int block_size = maxCudaThreadsPerBlock();
 
       const Index size = array_prod(evaluator.dimensions());
-      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
       assert(cudaGetLastError() == cudaSuccess);
     }
     evaluator.cleanup();
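The launch site now spells out both template arguments so the kernel's Index parameter matches the size computed on the host, and the existing assert on cudaGetLastError() still catches bad launch configurations. A small sketch of that launch-and-check pattern on a dedicated stream; noop_kernel and the grid dimensions are illustrative:

    #include <cassert>
    #include <cuda_runtime.h>

    template <typename Index>
    __global__ void noop_kernel(Index /*size*/) {}

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Explicit <long long> keeps the kernel's index type in sync with the
      // 64-bit size passed at the call site.
      noop_kernel<long long><<<32, 128, 0, stream>>>(0LL);
      assert(cudaGetLastError() == cudaSuccess);  // reports invalid launch parameters

      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      return 0;
    }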
@@ -459,7 +459,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
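pstore issues an aligned packet store, so the scratch buffer it writes to must carry the packet's alignment; that is what EIGEN_ALIGN_DEFAULT adds here. A rough equivalent in plain SSE terms, using alignas as a stand-in for the Eigen macro (illustrative only):

    #include <immintrin.h>
    #include <cstdio>

    int main() {
      alignas(16) float values[4];              // aligned scratch, like EIGEN_ALIGN_DEFAULT
      __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);
      _mm_store_ps(values, x);                  // aligned store; faults on a misaligned buffer
      printf("%f ... %f\n", values[0], values[3]);
      return 0;
    }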
@@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
     }
-
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     m_inputStrides[0] = 1;
     m_outputStrides[0] = 1;
@@ -125,6 +124,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
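coeff() decomposes the flat index by dividing by the output strides, starting from the dimension with the largest stride, and the new eigen_assert guards against an index past the total number of coefficients. A small host-side sketch of that stride walk with the same kind of bounds check; the shape and stride values are illustrative:

    #include <cassert>
    #include <cstdio>

    int main() {
      // Column-major 4 x 5 x 6 tensor: strides[i] is the product of the dimensions before i.
      const long long dims[3]    = {4, 5, 6};
      const long long strides[3] = {1, 4, 20};
      const long long total      = dims[0] * dims[1] * dims[2];

      long long index = 97;
      assert(index < total);                 // same guard as the added eigen_assert

      long long coords[3];
      for (int i = 2; i > 0; --i) {          // peel off the dimension with the largest stride first
        coords[i] = index / strides[i];
        index -= coords[i] * strides[i];
      }
      coords[0] = index;
      printf("flat 97 -> (%lld, %lld, %lld)\n", coords[0], coords[1], coords[2]);
      return 0;
    }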
@@ -151,11 +151,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const int first = index;
-      const int last = index + packetSize - 1;
-      const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
-      const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
-      const int lastPaddedRight = m_outputStrides[i+1];
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
 
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
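The padded-region boundaries are products of a padding amount and an output stride, so computing them in 32-bit int can overflow once tensors get large; switching them to Index (typically a 64-bit type) avoids that, here and in the rank-0 case below. A tiny numeric illustration with made-up dimensions:

    #include <climits>
    #include <cstdio>

    int main() {
      // Outermost stride of a 2000 x 2000 x 2000 tensor, padded by 600 planes.
      const long long outputStride = 2000LL * 2000LL;                // 4,000,000
      const long long paddingFirst = 600;

      const long long lastPaddedLeft = paddingFirst * outputStride;  // 2,400,000,000
      printf("lastPaddedLeft = %lld, INT_MAX = %d, fits in int: %s\n",
             lastPaddedLeft, INT_MAX, lastPaddedLeft <= INT_MAX ? "yes" : "no");
      return 0;
    }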
@@ -179,9 +179,9 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index last = index + packetSize - 1;
     const Index first = index;
-    const int lastPaddedLeft = m_padding[0].first;
-    const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-    const int lastPaddedRight = m_outputStrides[1];
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.