mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-22 22:33:15 +08:00
Made the Tensor code compile with clang 3.9
This commit is contained in:
parent
09ae0e6586
commit
a71943b9a4
@ -529,7 +529,6 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
|||||||
float2 rhs_shmem2[][8], const Index m_size,
|
float2 rhs_shmem2[][8], const Index m_size,
|
||||||
const Index n_size, const Index k_size,
|
const Index n_size, const Index k_size,
|
||||||
const Index base_m, const Index base_n) {
|
const Index base_m, const Index base_n) {
|
||||||
typedef float Scalar;
|
|
||||||
|
|
||||||
// prefetch registers
|
// prefetch registers
|
||||||
float4 lhs_pf0, rhs_pf0;
|
float4 lhs_pf0, rhs_pf0;
|
||||||
@ -543,12 +542,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
|||||||
#define prefetch_lhs(reg, row, col) \
|
#define prefetch_lhs(reg, row, col) \
|
||||||
if (!CHECK_LHS_BOUNDARY) { \
|
if (!CHECK_LHS_BOUNDARY) { \
|
||||||
if (col < k_size) { \
|
if (col < k_size) { \
|
||||||
reg =lhs.loadPacket<Unaligned>(row, col); \
|
reg =lhs.template loadPacket<Unaligned>(row, col); \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
if (col < k_size) { \
|
if (col < k_size) { \
|
||||||
if (row + 3 < m_size) { \
|
if (row + 3 < m_size) { \
|
||||||
reg =lhs.loadPacket<Unaligned>(row, col); \
|
reg =lhs.template loadPacket<Unaligned>(row, col); \
|
||||||
} else if (row + 2 < m_size) { \
|
} else if (row + 2 < m_size) { \
|
||||||
reg.x =lhs(row + 0, col); \
|
reg.x =lhs(row + 0, col); \
|
||||||
reg.y =lhs(row + 1, col); \
|
reg.y =lhs(row + 1, col); \
|
||||||
@ -578,7 +577,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
|||||||
if (!CHECK_RHS_BOUNDARY) {
|
if (!CHECK_RHS_BOUNDARY) {
|
||||||
if ((rhs_vert + 3) < k_size) {
|
if ((rhs_vert + 3) < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
||||||
} else if (rhs_vert + 2 < k_size) {
|
} else if (rhs_vert + 2 < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
||||||
@ -593,7 +592,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
|||||||
} else {
|
} else {
|
||||||
if (rhs_horiz0 < n_size) {
|
if (rhs_horiz0 < n_size) {
|
||||||
if ((rhs_vert + 3) < k_size) {
|
if ((rhs_vert + 3) < k_size) {
|
||||||
rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
||||||
} else if ((rhs_vert + 2) < k_size) {
|
} else if ((rhs_vert + 2) < k_size) {
|
||||||
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
||||||
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
|
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
|
||||||
@ -766,7 +765,6 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
float2 rhs_shmem2[][8], const Index m_size,
|
float2 rhs_shmem2[][8], const Index m_size,
|
||||||
const Index n_size, const Index k_size,
|
const Index n_size, const Index k_size,
|
||||||
const Index base_m, const Index base_n) {
|
const Index base_m, const Index base_n) {
|
||||||
typedef float Scalar;
|
|
||||||
|
|
||||||
// prefetch registers
|
// prefetch registers
|
||||||
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
|
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
|
||||||
@ -790,37 +788,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
|
|
||||||
if (!CHECK_LHS_BOUNDARY) {
|
if (!CHECK_LHS_BOUNDARY) {
|
||||||
if ((threadIdx.y/4+k+24) < k_size) {
|
if ((threadIdx.y/4+k+24) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
||||||
lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
|
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
|
||||||
} else if ((threadIdx.y/4+k+16) < k_size) {
|
} else if ((threadIdx.y/4+k+16) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
||||||
} else if ((threadIdx.y/4+k+8) < k_size) {
|
} else if ((threadIdx.y/4+k+8) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
} else if ((threadIdx.y/4+k) < k_size) {
|
} else if ((threadIdx.y/4+k) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// just CHECK_LHS_BOUNDARY
|
// just CHECK_LHS_BOUNDARY
|
||||||
if (lhs_vert + 3 < m_size) {
|
if (lhs_vert + 3 < m_size) {
|
||||||
if ((threadIdx.y/4+k+24) < k_size) {
|
if ((threadIdx.y/4+k+24) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
||||||
lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
|
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
|
||||||
} else if ((threadIdx.y/4+k+16) < k_size) {
|
} else if ((threadIdx.y/4+k+16) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
|
||||||
} else if ((threadIdx.y/4+k+8) < k_size) {
|
} else if ((threadIdx.y/4+k+8) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
|
||||||
} else if ((threadIdx.y/4+k) < k_size) {
|
} else if ((threadIdx.y/4+k) < k_size) {
|
||||||
lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
|
||||||
}
|
}
|
||||||
} else if (lhs_vert + 2 < m_size) {
|
} else if (lhs_vert + 2 < m_size) {
|
||||||
if ((threadIdx.y/4+k+24) < k_size) {
|
if ((threadIdx.y/4+k+24) < k_size) {
|
||||||
@ -909,8 +907,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
if (!CHECK_RHS_BOUNDARY) {
|
if (!CHECK_RHS_BOUNDARY) {
|
||||||
if ((rhs_vert + 3) < k_size) {
|
if ((rhs_vert + 3) < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
||||||
rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
|
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
|
||||||
} else if (rhs_vert + 2 < k_size) {
|
} else if (rhs_vert + 2 < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
||||||
@ -932,8 +930,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
if (rhs_horiz1 < n_size) {
|
if (rhs_horiz1 < n_size) {
|
||||||
if ((rhs_vert + 3) < k_size) {
|
if ((rhs_vert + 3) < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
||||||
rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
|
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
|
||||||
} else if (rhs_vert + 2 < k_size) {
|
} else if (rhs_vert + 2 < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
||||||
@ -954,7 +952,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
} else if (rhs_horiz0 < n_size) {
|
} else if (rhs_horiz0 < n_size) {
|
||||||
if ((rhs_vert + 3) < k_size) {
|
if ((rhs_vert + 3) < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
|
||||||
} else if ((rhs_vert + 2) < k_size) {
|
} else if ((rhs_vert + 2) < k_size) {
|
||||||
// just CHECK_RHS_BOUNDARY
|
// just CHECK_RHS_BOUNDARY
|
||||||
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
|
||||||
@ -1137,9 +1135,6 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
|
|||||||
typedef float2 LHS_MEM[64][32];
|
typedef float2 LHS_MEM[64][32];
|
||||||
typedef float2 RHS_MEM[128][8];
|
typedef float2 RHS_MEM[128][8];
|
||||||
|
|
||||||
typedef float2 LHS_MEM16x16[32][16];
|
|
||||||
typedef float2 RHS_MEM16x16[64][8];
|
|
||||||
|
|
||||||
const Index m_block_idx = blockIdx.x;
|
const Index m_block_idx = blockIdx.x;
|
||||||
const Index n_block_idx = blockIdx.y;
|
const Index n_block_idx = blockIdx.y;
|
||||||
|
|
||||||
|
@ -287,7 +287,6 @@ struct FullReductionLauncher<
|
|||||||
void>::type> {
|
void>::type> {
|
||||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
|
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
|
||||||
typedef typename Self::Index Index;
|
typedef typename Self::Index Index;
|
||||||
typedef typename Self::CoeffReturnType Scalar;
|
|
||||||
const int block_size = 256;
|
const int block_size = 256;
|
||||||
const int num_per_thread = 128;
|
const int num_per_thread = 128;
|
||||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user