mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-10 18:59:01 +08:00
Add half-Packet operations to StridedLinearBufferCopy.
This commit is contained in:
parent
c9a14f48d9
commit
874f5947f4
@ -994,9 +994,12 @@ class TensorTernaryExprBlock {
|
|||||||
template <typename Scalar, typename IndexType>
|
template <typename Scalar, typename IndexType>
|
||||||
class StridedLinearBufferCopy {
|
class StridedLinearBufferCopy {
|
||||||
typedef typename packet_traits<Scalar>::type Packet;
|
typedef typename packet_traits<Scalar>::type Packet;
|
||||||
|
typedef typename unpacket_traits<Packet>::half HalfPacket;
|
||||||
enum {
|
enum {
|
||||||
Vectorizable = packet_traits<Scalar>::Vectorizable,
|
Vectorizable = packet_traits<Scalar>::Vectorizable,
|
||||||
PacketSize = packet_traits<Scalar>::size
|
PacketSize = packet_traits<Scalar>::size,
|
||||||
|
HasHalfPacket = unpacket_traits<HalfPacket>::size < PacketSize,
|
||||||
|
HalfPacketSize = unpacket_traits<HalfPacket>::size,
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -1070,6 +1073,14 @@ class StridedLinearBufferCopy {
|
|||||||
Packet p = ploadu<Packet>(src + i);
|
Packet p = ploadu<Packet>(src + i);
|
||||||
pstoreu<Scalar, Packet>(dst + i, p);
|
pstoreu<Scalar, Packet>(dst + i, p);
|
||||||
}
|
}
|
||||||
|
if (HasHalfPacket) {
|
||||||
|
const IndexType vectorized_half_size = count - HalfPacketSize;
|
||||||
|
if (i <= vectorized_half_size) {
|
||||||
|
HalfPacket p = ploadu<HalfPacket>(src + i);
|
||||||
|
pstoreu<Scalar, HalfPacket>(dst + i, p);
|
||||||
|
i += HalfPacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i] = src[i];
|
dst[i] = src[i];
|
||||||
}
|
}
|
||||||
@ -1081,6 +1092,14 @@ class StridedLinearBufferCopy {
|
|||||||
Packet p = ploadu<Packet>(src + i);
|
Packet p = ploadu<Packet>(src + i);
|
||||||
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
||||||
}
|
}
|
||||||
|
if (HasHalfPacket) {
|
||||||
|
const IndexType vectorized_half_size = count - HalfPacketSize;
|
||||||
|
if (i <= vectorized_half_size) {
|
||||||
|
HalfPacket p = ploadu<HalfPacket>(src + i);
|
||||||
|
pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
|
||||||
|
i += HalfPacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i * dst_stride] = src[i];
|
dst[i * dst_stride] = src[i];
|
||||||
}
|
}
|
||||||
@ -1089,7 +1108,8 @@ class StridedLinearBufferCopy {
|
|||||||
// Fill `dst` with value at `*src`.
|
// Fill `dst` with value at `*src`.
|
||||||
eigen_assert(src_stride == 0 && dst_stride == 1);
|
eigen_assert(src_stride == 0 && dst_stride == 1);
|
||||||
const IndexType unrolled_size = count - 4 * PacketSize;
|
const IndexType unrolled_size = count - 4 * PacketSize;
|
||||||
Packet p = pload1<Packet>(src);
|
Scalar s = *src;
|
||||||
|
Packet p = pset1<Packet>(s);
|
||||||
for (; i <= unrolled_size; i += 4 * PacketSize) {
|
for (; i <= unrolled_size; i += 4 * PacketSize) {
|
||||||
for (int j = 0; j < 4; ++j) {
|
for (int j = 0; j < 4; ++j) {
|
||||||
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
|
pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
|
||||||
@ -1098,19 +1118,36 @@ class StridedLinearBufferCopy {
|
|||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
pstoreu<Scalar, Packet>(dst + i, p);
|
pstoreu<Scalar, Packet>(dst + i, p);
|
||||||
}
|
}
|
||||||
|
if (HasHalfPacket) {
|
||||||
|
const IndexType vectorized_half_size = count - HalfPacketSize;
|
||||||
|
if (i <= vectorized_half_size) {
|
||||||
|
HalfPacket hp = pset1<HalfPacket>(s);
|
||||||
|
pstoreu<Scalar, HalfPacket>(dst + i, hp);
|
||||||
|
i += HalfPacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i] = *src;
|
dst[i] = s;
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
|
} else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
|
||||||
// Scatter `*src` into `dst`.
|
// Scatter `*src` into `dst`.
|
||||||
eigen_assert(src_stride == 0 && dst_stride != 1);
|
eigen_assert(src_stride == 0 && dst_stride != 1);
|
||||||
Packet p = pload1<Packet>(src);
|
Scalar s = *src;
|
||||||
|
Packet p = pset1<Packet>(s);
|
||||||
for (; i <= vectorized_size; i += PacketSize) {
|
for (; i <= vectorized_size; i += PacketSize) {
|
||||||
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
|
||||||
}
|
}
|
||||||
|
if (HasHalfPacket) {
|
||||||
|
const IndexType vectorized_half_size = count - HalfPacketSize;
|
||||||
|
if (i <= vectorized_half_size) {
|
||||||
|
HalfPacket hp = pset1<HalfPacket>(s);
|
||||||
|
pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
|
||||||
|
i += HalfPacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i * dst_stride] = *src;
|
dst[i * dst_stride] = s;
|
||||||
}
|
}
|
||||||
// ******************************************************************** //
|
// ******************************************************************** //
|
||||||
} else if (kind == StridedLinearBufferCopy::Kind::Gather) {
|
} else if (kind == StridedLinearBufferCopy::Kind::Gather) {
|
||||||
@ -1120,6 +1157,15 @@ class StridedLinearBufferCopy {
|
|||||||
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
|
Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
|
||||||
pstoreu<Scalar, Packet>(dst + i, p);
|
pstoreu<Scalar, Packet>(dst + i, p);
|
||||||
}
|
}
|
||||||
|
if (HasHalfPacket) {
|
||||||
|
const IndexType vectorized_half_size = count - HalfPacketSize;
|
||||||
|
if (i <= vectorized_half_size) {
|
||||||
|
HalfPacket p =
|
||||||
|
pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
|
||||||
|
pstoreu<Scalar, HalfPacket>(dst + i, p);
|
||||||
|
i += HalfPacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
dst[i] = src[i * src_stride];
|
dst[i] = src[i * src_stride];
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user