mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-29 16:22:03 +08:00
[Eigen] Vectorize evaluation of coefficient-wise functions over tensor blocks if the strides are known to be 1. Provides up to 20-25% speedup of the TF cross entropy op with AVX.
A few benchmark numbers: name old time/op new time/op delta BM_Xent_16_10000_cpu 448µs ± 3% 389µs ± 2% -13.21% (p=0.008 n=5+5) BM_Xent_32_10000_cpu 575µs ± 6% 454µs ± 3% -21.00% (p=0.008 n=5+5) BM_Xent_64_10000_cpu 933µs ± 4% 712µs ± 1% -23.71% (p=0.008 n=5+5)
This commit is contained in:
parent
0987126165
commit
eab7e52db2
@ -483,6 +483,7 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
|
|||||||
* result of the cwise unary op to the strided output array.
|
* result of the cwise unary op to the strided output array.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
template <bool Vectorizable>
|
||||||
struct TensorBlockCwiseUnaryOp {
|
struct TensorBlockCwiseUnaryOp {
|
||||||
template <typename StorageIndex, typename UnaryFunctor,
|
template <typename StorageIndex, typename UnaryFunctor,
|
||||||
typename OutputScalar, typename InputScalar>
|
typename OutputScalar, typename InputScalar>
|
||||||
@ -507,6 +508,31 @@ struct TensorBlockCwiseUnaryOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct TensorBlockCwiseUnaryOp<true> {
|
||||||
|
template <typename StorageIndex, typename UnaryFunctor,
|
||||||
|
typename OutputScalar, typename InputScalar>
|
||||||
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||||
|
const UnaryFunctor& functor, const StorageIndex num_coeff,
|
||||||
|
const StorageIndex output_index, const StorageIndex output_stride,
|
||||||
|
OutputScalar* output_data, const StorageIndex input_index,
|
||||||
|
const StorageIndex input_stride, const InputScalar* input_data) {
|
||||||
|
if (input_stride == 1 && output_stride == 1) {
|
||||||
|
typedef const Array<InputScalar, Dynamic, 1> Input;
|
||||||
|
typedef Array<OutputScalar, Dynamic, 1> Output;
|
||||||
|
|
||||||
|
const Map<Input> input(&input_data[input_index], num_coeff);
|
||||||
|
Map<Output> output(&output_data[output_index], num_coeff);
|
||||||
|
|
||||||
|
output = CwiseUnaryOp<UnaryFunctor, Map<Input> >(input, functor);
|
||||||
|
} else {
|
||||||
|
TensorBlockCwiseUnaryOp<false>::Run(
|
||||||
|
functor, num_coeff, output_index, output_stride, output_data,
|
||||||
|
input_index, input_stride, input_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \class TensorBlockCwiseUnaryIO
|
* \class TensorBlockCwiseUnaryIO
|
||||||
* \ingroup CXX11_Tensor_Module
|
* \ingroup CXX11_Tensor_Module
|
||||||
@ -521,6 +547,11 @@ struct TensorBlockCwiseUnaryIO {
|
|||||||
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
|
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims,
|
||||||
Layout>::Dimensions Dimensions;
|
Layout>::Dimensions Dimensions;
|
||||||
|
|
||||||
|
typedef TensorBlockCwiseUnaryOp<
|
||||||
|
packet_traits<OutputScalar>::Vectorizable &&
|
||||||
|
functor_traits<UnaryFunctor>::PacketAccess>
|
||||||
|
TensorBlockCwiseUnaryOpImpl;
|
||||||
|
|
||||||
struct BlockIteratorState {
|
struct BlockIteratorState {
|
||||||
StorageIndex output_stride, output_span;
|
StorageIndex output_stride, output_span;
|
||||||
StorageIndex input_stride, input_span;
|
StorageIndex input_stride, input_span;
|
||||||
@ -595,7 +626,7 @@ struct TensorBlockCwiseUnaryIO {
|
|||||||
const StorageIndex block_total_size =
|
const StorageIndex block_total_size =
|
||||||
NumDims == 0 ? 1 : block_sizes.TotalSize();
|
NumDims == 0 ? 1 : block_sizes.TotalSize();
|
||||||
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
|
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
|
||||||
TensorBlockCwiseUnaryOp::Run(functor, inner_dim_size, output_index,
|
TensorBlockCwiseUnaryOpImpl::Run(functor, inner_dim_size, output_index,
|
||||||
output_stride, output_data, input_index,
|
output_stride, output_data, input_index,
|
||||||
input_stride, input_data);
|
input_stride, input_data);
|
||||||
// Update index.
|
// Update index.
|
||||||
@ -624,6 +655,7 @@ struct TensorBlockCwiseUnaryIO {
|
|||||||
* result of the cwise binary op to the strided output array.
|
* result of the cwise binary op to the strided output array.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
template<bool Vectorizable>
|
||||||
struct TensorBlockCwiseBinaryOp {
|
struct TensorBlockCwiseBinaryOp {
|
||||||
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
|
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
|
||||||
typename LeftScalar, typename RightScalar>
|
typename LeftScalar, typename RightScalar>
|
||||||
@ -654,6 +686,40 @@ struct TensorBlockCwiseBinaryOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct TensorBlockCwiseBinaryOp<true> {
|
||||||
|
template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
|
||||||
|
typename LeftScalar, typename RightScalar>
|
||||||
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||||
|
const BinaryFunctor& functor, const StorageIndex num_coeff,
|
||||||
|
const StorageIndex output_index, const StorageIndex output_stride,
|
||||||
|
OutputScalar* output_data, const StorageIndex left_index,
|
||||||
|
const StorageIndex left_stride, const LeftScalar* left_data,
|
||||||
|
const StorageIndex right_index, const StorageIndex right_stride,
|
||||||
|
const RightScalar* right_data) {
|
||||||
|
if (left_stride == 1 && right_stride == 1 && output_stride == 1) {
|
||||||
|
typedef const Array<LeftScalar, Dynamic, 1> Lhs;
|
||||||
|
typedef const Array<RightScalar, Dynamic, 1> Rhs;
|
||||||
|
typedef Array<OutputScalar, Dynamic, 1> Out;
|
||||||
|
|
||||||
|
const LeftScalar* lhs_base = &left_data[left_index];
|
||||||
|
const RightScalar* rhs_base = &right_data[right_index];
|
||||||
|
OutputScalar* out_base = &output_data[output_index];
|
||||||
|
|
||||||
|
const Map<Lhs> lhs(lhs_base, num_coeff);
|
||||||
|
const Map<Rhs> rhs(rhs_base, num_coeff);
|
||||||
|
Map<Out> out(out_base, num_coeff);
|
||||||
|
|
||||||
|
out = CwiseBinaryOp<BinaryFunctor, Map<Lhs>, Map<Rhs> >(lhs, rhs, functor);
|
||||||
|
} else {
|
||||||
|
TensorBlockCwiseBinaryOp<false>::Run(
|
||||||
|
functor, num_coeff, output_index, output_stride, output_data,
|
||||||
|
left_index, left_stride, left_data, right_index, right_stride,
|
||||||
|
right_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \class TensorBlockCwiseBinaryIO
|
* \class TensorBlockCwiseBinaryIO
|
||||||
* \ingroup CXX11_Tensor_Module
|
* \ingroup CXX11_Tensor_Module
|
||||||
@ -668,6 +734,11 @@ template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
|
|||||||
struct TensorBlockCwiseBinaryIO {
|
struct TensorBlockCwiseBinaryIO {
|
||||||
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
|
typedef typename TensorBlock<OutputScalar, StorageIndex, NumDims, Layout>::Dimensions Dimensions;
|
||||||
|
|
||||||
|
typedef TensorBlockCwiseBinaryOp<
|
||||||
|
packet_traits<OutputScalar>::Vectorizable &&
|
||||||
|
functor_traits<BinaryFunctor>::PacketAccess>
|
||||||
|
TensorBlockCwiseBinaryOpImpl;
|
||||||
|
|
||||||
struct BlockIteratorState {
|
struct BlockIteratorState {
|
||||||
StorageIndex output_stride, output_span;
|
StorageIndex output_stride, output_span;
|
||||||
StorageIndex left_stride, left_span;
|
StorageIndex left_stride, left_span;
|
||||||
@ -748,7 +819,7 @@ struct TensorBlockCwiseBinaryIO {
|
|||||||
const StorageIndex block_total_size =
|
const StorageIndex block_total_size =
|
||||||
NumDims == 0 ? 1 : block_sizes.TotalSize();
|
NumDims == 0 ? 1 : block_sizes.TotalSize();
|
||||||
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
|
for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
|
||||||
TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
|
TensorBlockCwiseBinaryOpImpl::Run(functor, inner_dim_size, output_index,
|
||||||
output_stride, output_data, left_index,
|
output_stride, output_data, left_index,
|
||||||
left_stride, left_data, right_index,
|
left_stride, left_data, right_index,
|
||||||
right_stride, right_data);
|
right_stride, right_data);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user