Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-04-23 10:09:36 +08:00)

Merged in ezhulenev/eigen-01 (pull request PR-514)

Add tests for evalShardedByInnerDim contraction + fix bugs

Commit: 7c1b47840a
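Background for the change: evalShardedByInnerDim splits the contraction (inner "k") dimension across threads, lets each thread accumulate a partial GEMM into its own buffer, and then sums the partial buffers into the final result. Below is a minimal sketch of that idea, assuming a hand-rolled column-major matrix type and a hypothetical gemm_partial helper; it is not Eigen's internal API.

#include <cassert>
#include <cmath>
#include <vector>

// Illustrative column-major matrix stored in a flat vector.
struct Mat {
  int rows, cols;
  std::vector<float> data;
  Mat(int r, int c) : rows(r), cols(c), data(r * c, 0.0f) {}
  float& operator()(int i, int j) { return data[i + j * rows]; }
  float operator()(int i, int j) const { return data[i + j * rows]; }
};

// Accumulate C += A(:, k_start..k_end) * B(k_start..k_end, :).
// Each shard runs this over its own [k_start, k_end) slice; because the
// partial products are summed into C, the result equals the full A * B.
void gemm_partial(const Mat& A, const Mat& B, Mat& C, int k_start, int k_end) {
  assert(A.cols == B.rows && A.rows == C.rows && B.cols == C.cols);
  for (int j = 0; j < C.cols; ++j)
    for (int k = k_start; k < k_end; ++k)
      for (int i = 0; i < C.rows; ++i)
        C(i, j) += A(i, k) * B(k, j);
}

int main() {
  const int m = 2, k = 8, n = 3;
  Mat A(m, k), B(k, n), C_full(m, n), C_sharded(m, n);
  for (int i = 0; i < m * k; ++i) A.data[i] = 0.5f * i;
  for (int i = 0; i < k * n; ++i) B.data[i] = 1.0f - 0.25f * i;

  gemm_partial(A, B, C_full, 0, k);         // single shard: the whole k range
  gemm_partial(A, B, C_sharded, 0, k / 2);  // shard 1: k in [0, 4)
  gemm_partial(A, B, C_sharded, k / 2, k);  // shard 2: k in [4, 8)

  for (int i = 0; i < m * n; ++i)
    assert(std::fabs(C_full.data[i] - C_sharded.data[i]) < 1e-5f);
  return 0;
}

Because a user-supplied output kernel is in general non-linear, it has to run once on the summed result rather than once per shard; the hunks below fix exactly that.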
@@ -167,61 +167,61 @@ struct TensorBlockCopyOp {
     }

     if (src_stride == 1) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i];
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i];
         }
       }
     } else if (src_stride == 0) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = *src;
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = *src;
         }
       }
     } else {
       if (dst_stride == 1) {
         // GATHER
-        const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pgather<Scalar, Packet>(src + i * src_stride, src_stride);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i * src_stride];
         }
       } else {
         // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i * src_stride];
         }
       }
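The hunk above only switches the loop index type from Index to StorageIndex; the surrounding TensorBlockCopyOp logic picks a copy strategy from the source and destination strides. The following is a simplified, non-vectorized sketch of that dispatch, using plain pointers and a hypothetical block_copy helper instead of Eigen's packet intrinsics, just to make the LINEAR / SCATTER / GATHER / RANDOM cases explicit.

#include <cstddef>

// Copies n coefficients from src (read stride s_src) to dst (write stride
// s_dst). Mirrors the case split in TensorBlockCopyOp; the real code also
// processes a vectorized head with ploadu/pload1/pgather + pstoreu/pscatter
// before falling back to the scalar tail loop.
void block_copy(float* dst, std::ptrdiff_t s_dst,
                const float* src, std::ptrdiff_t s_src, std::size_t n) {
  if (s_src == 1 && s_dst == 1) {   // LINEAR: contiguous read and write
    for (std::size_t i = 0; i < n; ++i) dst[i] = src[i];
  } else if (s_src == 1) {          // SCATTER: contiguous read, strided write
    for (std::size_t i = 0; i < n; ++i) dst[i * s_dst] = src[i];
  } else if (s_src == 0) {          // broadcast one value (linear or scattered)
    for (std::size_t i = 0; i < n; ++i) dst[i * s_dst] = *src;
  } else if (s_dst == 1) {          // GATHER: strided read, contiguous write
    for (std::size_t i = 0; i < n; ++i) dst[i] = src[i * s_src];
  } else {                          // RANDOM: strided read and write
    for (std::size_t i = 0; i < n; ++i) dst[i * s_dst] = src[i * s_src];
  }
}

int main() {
  const float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float dst[8] = {0.0f};
  block_copy(dst, /*s_dst=*/2, src, /*s_src=*/1, /*n=*/4);  // SCATTER case
  return 0;
}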
@@ -671,7 +671,17 @@ struct TensorContractionEvaluatorBase
       0, k, 1);
   }

-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                                 num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel = true>
   EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
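The new evalGemmPartialWithoutOutputKernel wrapper forwards to evalGemmPartial with a compile-time flag that suppresses the output-kernel epilogue, so the sharded path can defer it until all partial buffers are summed. A minimal sketch of the same pattern, assuming made-up names (Worker, run_partial) rather than the actual Eigen signatures:

#include <cstdio>

struct Worker {
  // Defaulting use_output_kernel to true keeps existing call sites unchanged.
  template <bool use_output_kernel = true>
  void run_partial(float* buffer, int k_start, int k_end) const {
    (void)buffer;  // a real implementation would accumulate into this buffer
    if (use_output_kernel) {
      // Epilogue that must only ever see a finalized buffer.
      std::printf("applying output kernel on k range [%d, %d)\n", k_start, k_end);
    }
  }

  // Thin wrapper for callers that apply the epilogue later themselves.
  void run_partial_without_output_kernel(float* buffer, int k_start,
                                         int k_end) const {
    run_partial</*use_output_kernel=*/false>(buffer, k_start, k_end);
  }
};

int main() {
  Worker w;
  float buffer[16] = {0.0f};
  w.run_partial(buffer, 0, 16);                        // epilogue runs here
  w.run_partial_without_output_kernel(buffer, 0, 16);  // epilogue deferred
  return 0;
}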
@@ -740,7 +750,7 @@ struct TensorContractionEvaluatorBase
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
       for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
         TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
                                          actual_kc, actual_mc);

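The actual_kc fix matters because evalGemmPartial only owns the slice [k_start, k_end) of the contraction dimension; clamping the panel against the full k would let the last panel of a shard read past k_end into the next shard's range. A small self-contained illustration with assumed sizes:

#include <algorithm>
#include <cstdio>

int main() {
  const int k = 100;      // full contraction dimension
  const int k_start = 0;  // this shard owns [0, 48)
  const int k_end = 48;
  const int kc = 32;      // panel size along k

  for (int k2 = k_start; k2 < k_end; k2 += kc) {
    const int clamped_to_k = std::min(k2 + kc, k) - k2;          // old behaviour
    const int clamped_to_k_end = std::min(k2 + kc, k_end) - k2;  // fixed behaviour
    std::printf("k2=%d  old actual_kc=%d  fixed actual_kc=%d\n",
                k2, clamped_to_k, clamped_to_k_end);
  }
  // Second panel (k2 = 32): the old clamp gives 32 and would cover [32, 64),
  // overlapping the next shard; the fixed clamp gives 16 and stops at k_end.
  return 0;
}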
@@ -759,7 +769,7 @@ struct TensorContractionEvaluatorBase
                                           Scalar(1));

           // We are done with this [i2, j2] output block.
-          if (k2 + kc >= k) {
+          if (use_output_kernel && k2 + kc >= k_end) {
             m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
                             actual_mc, actual_nc);
           }
@@ -798,14 +798,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     auto process_block = [=, &barrier](Scalar* buf, Index first, Index last) {
       ::memset(buf, 0, m * n * sizeof(Scalar));
       TENSOR_CONTRACTION_DISPATCH(
-          this->template evalGemmPartial, Alignment,
+          this->template evalGemmPartialWithoutOutputKernel, Alignment,
           (buf, first, last, this->m_device.numThreads()));
       barrier.Notify();
     };
     Index start = 0;
     for (int blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
-      // The underlying GEMM kernel assumes that k is a multiple of 8 and
-      // subtle breakage occurs if this is violated.
+      // The underlying GEMM kernel assumes that k is a multiple of packet size
+      // (currently largest packet size is 8) and subtle breakage occurs if
+      // this is violated.
       block_size = 8 * divup<Index>(k - start, 8 * blocks_left);
       Scalar* buf;
       if (start == 0) {
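The reworded comment documents the constraint behind the block_size formula: every interior shard boundary has to stay a multiple of 8 (the largest packet size) or the underlying GEMM kernel breaks subtly. A standalone sketch of the split, writing divup as the usual round-up division and assuming k = 100 split into 3 blocks:

#include <algorithm>
#include <cstdio>

// Round-up integer division, analogous to Eigen's internal divup helper.
static int divup(int x, int y) { return (x + y - 1) / y; }

int main() {
  const int k = 100;         // contraction dimension to shard (assumed)
  const int num_blocks = 3;  // number of shards / partial buffers (assumed)

  int start = 0;
  for (int blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
    // Round each shard up to a multiple of 8 so interior boundaries stay
    // aligned with the GEMM kernel's packet size.
    const int block_size = 8 * divup(k - start, 8 * blocks_left);
    const int end = std::min(start + block_size, k);
    std::printf("shard [%d, %d), width %d\n", start, end, end - start);
    start = end;
  }
  // Prints shards [0, 40), [40, 72), [72, 100): only the final (tail) shard
  // may have a width that is not a multiple of 8.
  return 0;
}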
@@ -830,6 +831,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       addToBuffer<Alignment>(m * n, buf, result);
       this->m_device.deallocate(buf);
     }
+
+    // Finally call output kernel with finalized output buffer.
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    this->m_output_kernel(OutputMapper(result, m),
+                          this->m_tensor_contraction_params,
+                          static_cast<Eigen::Index>(0),
+                          static_cast<Eigen::Index>(0),
+                          m, n);
   }

   TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
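The added m_output_kernel call is the core bug fix: with the per-shard epilogue disabled via evalGemmPartialWithoutOutputKernel, the output kernel now runs exactly once, on the fully accumulated buffer. A tiny numeric illustration of why a non-linear kernel (sqrt, as in the new tests) must not be applied per partial buffer:

#include <cmath>
#include <cstdio>

int main() {
  // Two k-shards contribute partial sums to the same output coefficient;
  // the desired result is sqrt(partial0 + partial1).
  const float partial0 = 9.0f;
  const float partial1 = 16.0f;

  const float per_shard = std::sqrt(partial0) + std::sqrt(partial1);  // 3 + 4 = 7
  const float once_at_end = std::sqrt(partial0 + partial1);           // sqrt(25) = 5

  std::printf("per-shard kernel: %g, kernel applied once: %g\n",
              per_shard, once_at_end);
  return 0;
}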
@@ -306,6 +306,86 @@ static void test_multithread_contraction_with_output_kernel() {
   }
 }

+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
 template<int DataLayout>
 void test_full_contraction() {
   int contract_size1 = internal::random<int>(1, 500);
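SqrtOutputKernel is defined earlier in the same test file and is not part of this hunk. For readers unfamiliar with contraction output kernels, here is a sketch of what such a functor can look like, following the block interface the contraction evaluator calls (OutputMapper, TensorContractionParams, block origin and size); the real test helper may differ in detail, so treat the body as illustrative only.

#include <cmath>
#include <unsupported/Eigen/CXX11/Tensor>

// Applies sqrt in place to one finalized output block.
struct SqrtOutputKernelSketch {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>&
          output_mapper,
      const Eigen::TensorContractionParams& /*params*/, Index /*i*/,
      Index /*j*/, Index num_rows, Index num_cols) const {
    for (Index c = 0; c < num_cols; ++c)
      for (Index r = 0; r < num_rows; ++r)
        output_mapper(r, c) = std::sqrt(output_mapper(r, c));
  }
};

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
  a.setConstant(2.0f);
  b.setConstant(1.5f);
  Eigen::array<Eigen::IndexPair<int>, 1> dims{{Eigen::IndexPair<int>(1, 0)}};
  // Each coefficient becomes sqrt(3 * 2.0 * 1.5) = 3.
  Eigen::Tensor<float, 2> c = a.contract(b, dims, SqrtOutputKernelSketch());
  return 0;
}

The kernel is passed as the third argument to contract(), exactly as the new tests pass SqrtOutputKernel().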
@@ -446,21 +526,26 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());

+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_5(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_5(test_contraction_corner_cases<RowMajor>());

-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_full_contraction<RowMajor>());

-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  CALL_SUBTEST_7(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_7(test_multithreaded_reductions<RowMajor>());

-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
+  CALL_SUBTEST_7(test_memcpy());
+  CALL_SUBTEST_7(test_multithread_random());

   TestAllocator test_allocator;
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>(NULL));
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>(&test_allocator));
-  CALL_SUBTEST_6(test_threadpool_allocate(&test_allocator));
+  CALL_SUBTEST_7(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_7(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_7(test_threadpool_allocate(&test_allocator));
 }