Merged from trunk

Benoit Steiner 2014-10-30 21:59:22 -07:00
commit 67fcf47ecb
5 changed files with 38 additions and 21 deletions

View File

@@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, c
   to[stride*1] = from.y;
 }
+template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(abs(a.x), abs(a.y));
+}
 template<> EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
   double tmp = kernel.packet[0].y;
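Note: the hunk above adds CUDA packet-primitive specializations: pfirst extracts the first lane of a packet, and pabs takes the lane-wise absolute value via make_float4/make_double2. A minimal standalone sketch of the same idea (plain CUDA with hypothetical helper names, not Eigen's internal API):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical free functions mirroring the pfirst/pabs specializations above.
__device__ inline float first_lane(const float4& a) { return a.x; }
__device__ inline float4 abs_lanes(const float4& a) {
  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}

__global__ void demo(float* out) {
  float4 v = make_float4(-1.f, 2.f, -3.f, 4.f);
  float4 a = abs_lanes(v);   // lane-wise |v|
  out[0] = first_lane(a);    // 1.0f
  out[1] = a.w;              // 4.0f
}

int main() {
  float h_out[2];
  float* d_out;
  cudaMalloc(&d_out, 2 * sizeof(float));
  demo<<<1, 1>>>(d_out);
  cudaMemcpy(h_out, d_out, 2 * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%f %f\n", h_out[0], h_out[1]);
  cudaFree(d_out);
  return 0;
}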

View File

@@ -297,7 +297,9 @@ namespace Eigen {
   * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
   * vectorized and non-vectorized code.
   */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
 #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
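Note: this change routes the alignment attribute through CUDA's __align__ qualifier when the translation unit is compiled by nvcc, before falling back to the GCC- and MSVC-style spellings. A small self-contained sketch of the same dispatch pattern, using a hypothetical MY_ALIGN macro rather than EIGEN_ALIGN_TO_BOUNDARY:

#include <cstdio>

// Compiler-dependent alignment dispatch, assuming only standard predefined macros.
#if defined(__CUDACC__)
  #define MY_ALIGN(n) __align__(n)
#elif defined(__GNUC__)
  #define MY_ALIGN(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
  #define MY_ALIGN(n) __declspec(align(n))
#else
  #define MY_ALIGN(n) alignas(n)   // C++11 fallback
#endif

struct MY_ALIGN(16) Packet4f { float v[4]; };

int main() {
  std::printf("alignof(Packet4f) = %zu\n", alignof(Packet4f));
  return 0;
}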

View File

@@ -149,26 +149,26 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
@@ -193,7 +193,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable>
     const int block_size = maxCudaThreadsPerBlock();
     const Index size = array_prod(evaluator.dimensions());
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
     assert(cudaGetLastError() == cudaSuccess);
   }
   evaluator.cleanup();
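Note: the kernel is now templated on the executor's Index type instead of hard-coding unsigned int/int, so the grid-stride loop uses the same index type as the host-side evaluator (which, among other things, allows element counts that do not fit in 32 bits). A hedged sketch of the same pattern with an illustrative fill_kernel, not Eigen's actual kernel:

#include <cuda_runtime.h>

// Grid-stride kernel templated on the index type, mirroring the change above.
template <typename Index>
__global__ void fill_kernel(float* data, Index size, float value) {
  const Index first = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step  = blockDim.x * gridDim.x;
  for (Index i = first; i < size; i += step) {
    data[i] = value;
  }
}

int main() {
  const long long size = 1 << 20;
  float* d_data;
  cudaMalloc(&d_data, size * sizeof(float));
  // Instantiate explicitly with a 64-bit index, as the executor now passes
  // its Index type instead of unsigned int.
  fill_kernel<long long><<<128, 256>>>(d_data, size, 1.0f);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}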

View File

@@ -459,7 +459,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
     }
     else {
-      CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
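Note: internal::pstore performs an aligned packet store, so the scratch array it writes to must be aligned to the packet boundary; EIGEN_ALIGN_DEFAULT guarantees that. A standalone sketch of the same requirement using SSE intrinsics, with alignas(16) standing in for EIGEN_ALIGN_DEFAULT:

#include <cstdio>
#include <immintrin.h>

int main() {
  __m128 x = _mm_set_ps(4.f, 3.f, 2.f, 1.f);
  // The aligned store (like pstore) requires a 16-byte aligned destination;
  // without the explicit alignment the store may fault on some platforms.
  alignas(16) float values[4];
  _mm_store_ps(values, x);
  std::printf("%f %f\n", values[0], values[3]);   // 1.0 4.0
  return 0;
}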

View File

@@ -98,7 +98,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
     }
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     m_inputStrides[0] = 1;
     m_outputStrides[0] = 1;
@@ -125,6 +124,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
+    eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
@@ -151,11 +151,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const int first = index;
-      const int last = index + packetSize - 1;
-      const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
-      const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
-      const int lastPaddedRight = m_outputStrides[i+1];
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i+1];
       if (last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
@@ -179,9 +179,9 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index last = index + packetSize - 1;
     const Index first = index;
-    const int lastPaddedLeft = m_padding[0].first;
-    const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
-    const int lastPaddedRight = m_outputStrides[1];
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
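Note: switching the padded-region boundaries from int to Index keeps the comparisons in the evaluator's own index type and avoids truncation when a dimension-times-stride product exceeds INT_MAX. A small sketch of the narrowing this avoids (illustrative values, not taken from Eigen):

#include <cstdio>
#include <cstdint>

int main() {
  using Index = std::int64_t;
  const Index dim = 100000, stride = 50000;                // illustrative sizes
  // Storing the 64-bit product in an int silently truncates once it passes
  // INT_MAX (the exact result is implementation-defined before C++20).
  const int   truncated = static_cast<int>(dim * stride);
  const Index correct   = dim * stride;                    // 5,000,000,000
  std::printf("as int: %d, as Index: %lld\n", truncated, (long long)correct);
  return 0;
}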