mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-13 04:09:10 +08:00
Choose TensorBlock StridedLinearCopy type statically
This commit is contained in:
parent
c97b208468
commit
c64396b4c6
@ -459,6 +459,16 @@ class StridedLinearBufferCopy {
|
|||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
// Specifying linear copy kind statically gives ~30% speedup for small sizes.
|
||||||
|
enum Kind {
|
||||||
|
Linear = 0, // src_stride == 1 && dst_stride == 1
|
||||||
|
Scatter = 1, // src_stride == 1 && dst_stride != 1
|
||||||
|
FillLinear = 2, // src_stride == 0 && dst_stride == 1
|
||||||
|
FillScatter = 3, // src_stride == 0 && dst_stride != 1
|
||||||
|
Gather = 4, // dst_stride == 1
|
||||||
|
Random = 5 // everything else
|
||||||
|
};
|
||||||
|
|
||||||
struct Dst {
|
struct Dst {
|
||||||
Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
|
Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
|
||||||
|
|
||||||
@ -476,14 +486,16 @@ class StridedLinearBufferCopy {
|
|||||||
const Scalar* data;
|
const Scalar* data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <StridedLinearBufferCopy::Kind kind>
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
|
||||||
const Src& src,
|
const Src& src,
|
||||||
const size_t count) {
|
const size_t count) {
|
||||||
Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
|
Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
|
||||||
src.data);
|
src.data);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
template <StridedLinearBufferCopy::Kind kind>
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
|
||||||
const IndexType count, const IndexType dst_offset,
|
const IndexType count, const IndexType dst_offset,
|
||||||
const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
|
const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
|
||||||
@ -499,13 +511,14 @@ class StridedLinearBufferCopy {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const IndexType unrolled_size = count - 4 * PacketSize;
|
|
||||||
const IndexType vectorized_size = count - PacketSize;
|
const IndexType vectorized_size = count - PacketSize;
|
||||||
IndexType i = 0;
|
IndexType i = 0;
|
||||||
|
|
||||||
if (src_stride == 1 && dst_stride == 1) {
|
if (kind == Linear) {
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
// Linear copy from `src` to `dst`.
|
// Linear copy from `src` to `dst`.
|
||||||
|
const IndexType unrolled_size = count - 4 * PacketSize;
|
||||||
|
eigen_assert(src_stride == 1 && dst_stride == 1);
|
||||||
for (; i <= unrolled_size; i += 4 * PacketSize) {
|
for (; i <= unrolled_size; i += 4 * PacketSize) {
|
||||||
for (int j = 0; j < 4; ++j) {
|
for (int j = 0; j < 4; ++j) {
|
||||||
Packet p = ploadu<Packet>(src + i + j * PacketSize);
|
Packet p = ploadu<Packet>(src + i + j * PacketSize);
|
||||||
@ -520,8 +533,9 @@ class StridedLinearBufferCopy {
|
|||||||
dst[i] = src[i];
|
dst[i] = src[i];
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (src_stride == 1 && dst_stride != 1) {
|
} else if (kind == Scatter) {
|
||||||
// Scatter from `src` to `dst`.
|
// Scatter from `src` to `dst`.
|
||||||
|
eigen_assert(src_stride == 1 && dst_stride != 1);
|
||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
Packet p = ploadu<Packet>(src + i);
|
Packet p = ploadu<Packet>(src + i);
|
||||||
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
||||||
@ -530,14 +544,10 @@ class StridedLinearBufferCopy {
|
|||||||
dst[i * dst_stride] = src[i];
|
dst[i * dst_stride] = src[i];
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (src_stride == 0 && dst_stride == 1) {
|
} else if (kind == FillLinear) {
|
||||||
// Fill `dst` with value at `*src`.
|
// Fill `dst` with value at `*src`.
|
||||||
|
eigen_assert(src_stride == 0 && dst_stride == 1);
|
||||||
Packet p = pload1<Packet>(src);
|
Packet p = pload1<Packet>(src);
|
||||||
for (; i <= unrolled_size; i += 4 * PacketSize) {
|
|
||||||
for (int j = 0; j < 4; ++j) {
|
|
||||||
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
pstoreu<Scalar, Packet>(dst + i, p);
|
pstoreu<Scalar, Packet>(dst + i, p);
|
||||||
}
|
}
|
||||||
@ -545,8 +555,9 @@ class StridedLinearBufferCopy {
|
|||||||
dst[i] = *src;
|
dst[i] = *src;
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (src_stride == 0 && dst_stride != 1) {
|
} else if (kind == FillScatter) {
|
||||||
// Scatter `*src` into `dst`.
|
// Scatter `*src` into `dst`.
|
||||||
|
eigen_assert(src_stride == 0 && dst_stride != 1);
|
||||||
Packet p = pload1<Packet>(src);
|
Packet p = pload1<Packet>(src);
|
||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
||||||
@ -555,8 +566,9 @@ class StridedLinearBufferCopy {
|
|||||||
dst[i * dst_stride] = *src;
|
dst[i * dst_stride] = *src;
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (dst_stride == 1) {
|
} else if (kind == Gather) {
|
||||||
// Gather from `src` into `dst`.
|
// Gather from `src` into `dst`.
|
||||||
|
eigen_assert(dst_stride == 1);
|
||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
|
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
|
||||||
pstoreu<Scalar, Packet>(dst + i, p);
|
pstoreu<Scalar, Packet>(dst + i, p);
|
||||||
@ -565,11 +577,13 @@ class StridedLinearBufferCopy {
|
|||||||
dst[i] = src[i * src_stride];
|
dst[i] = src[i * src_stride];
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else {
|
} else if (kind == Random) {
|
||||||
// Random.
|
// Random.
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i * dst_stride] = src[i * src_stride];
|
dst[i * dst_stride] = src[i * src_stride];
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
eigen_assert(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -716,25 +730,40 @@ class TensorBlockIOV2 {
|
|||||||
// Iterate copying data from src to dst.
|
// Iterate copying data from src to dst.
|
||||||
const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
|
const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
|
||||||
|
|
||||||
for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) {
|
#define COPY_INNER_DIM(KIND) \
|
||||||
// Copy data for the innermost dimension.
|
for (IndexType i = 0; i < block_total_size; i += dst_inner_dim_size) { \
|
||||||
LinCopy::Run(
|
LinCopy::template Run<KIND>( \
|
||||||
typename LinCopy::Dst(output_offset, output_stride, dst.data),
|
typename LinCopy::Dst(output_offset, output_stride, dst.data), \
|
||||||
typename LinCopy::Src(input_offset, input_stride, src.data),
|
typename LinCopy::Src(input_offset, input_stride, src.data), \
|
||||||
dst_inner_dim_size);
|
dst_inner_dim_size); \
|
||||||
|
\
|
||||||
|
for (int j = 0; j < idx; ++j) { \
|
||||||
|
if (++it[j].count < it[j].size) { \
|
||||||
|
input_offset += it[j].input_stride; \
|
||||||
|
output_offset += it[j].output_stride; \
|
||||||
|
break; \
|
||||||
|
} \
|
||||||
|
it[j].count = 0; \
|
||||||
|
input_offset -= it[j].input_span; \
|
||||||
|
output_offset -= it[j].output_span; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
// Update offsets (idx is the number of initialize block iterators).
|
if (input_stride == 1 && output_stride == 1) {
|
||||||
for (int j = 0; j < idx; ++j) {
|
COPY_INNER_DIM(LinCopy::Linear);
|
||||||
if (++it[j].count < it[j].size) {
|
} else if (input_stride == 1 && output_stride != 1) {
|
||||||
input_offset += it[j].input_stride;
|
COPY_INNER_DIM(LinCopy::Scatter);
|
||||||
output_offset += it[j].output_stride;
|
} else if (input_stride == 0 && output_stride == 1) {
|
||||||
break;
|
COPY_INNER_DIM(LinCopy::FillLinear);
|
||||||
}
|
} else if (input_stride == 0 && output_stride != 1) {
|
||||||
it[j].count = 0;
|
COPY_INNER_DIM(LinCopy::FillScatter);
|
||||||
input_offset -= it[j].input_span;
|
} else if (output_stride == 1) {
|
||||||
output_offset -= it[j].output_span;
|
COPY_INNER_DIM(LinCopy::Gather);
|
||||||
}
|
} else {
|
||||||
|
COPY_INNER_DIM(LinCopy::Random);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef COPY_INNER_DIM
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy from `src` to `dst` with an identity src->dst dimension map.
|
// Copy from `src` to `dst` with an identity src->dst dimension map.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user