Commit 91e1375ba9 (merge), from the Eigen mirror at https://gitlab.com/libeigen/eigen.git
@@ -52,14 +52,19 @@ __device__ half operator /= (half& a, const half& b) {
   a = a / b;
   return a;
 }
-__device__ half __shfl_xor(half a, int) {
-  assert(false && "tbd");
-  return a;
+namespace std {
+__device__ half abs(const half& a) {
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
 }
 
 namespace Eigen {
 namespace internal {
 
+template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
 template<> struct packet_traits<half> : default_packet_traits
@@ -214,17 +219,20 @@ template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
 }
 
 template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
-  assert(false && "tbd");
-  return half2();
+  half2 result;
+  result.x = a.x & 0x7FFF7FFF;
+  return result;
 }
 
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<half2,2>& kernel) {
-  assert(false && "tbd");
-  // half tmp = kernel.packet[0].y;
-  // kernel.packet[0].y = kernel.packet[1].x;
-  // kernel.packet[1].x = tmp;
+  half a1 = __low2half(kernel.packet[0]);
+  half a2 = __high2half(kernel.packet[0]);
+  half b1 = __low2half(kernel.packet[1]);
+  half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
 }
 
 } // end namespace internal
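
Note (added for this writeup, not part of the commit): the new std::abs overload and the half2 pabs specialization both rely on the IEEE-754 binary16 layout, in which the top bit of the 16-bit pattern is the sign. Clearing it with a 0x7FFF mask (0x7FFF7FFF for a packed pair) turns negative values into their positive counterparts and leaves positives and zeros untouched. A host-only sketch of the same trick, using a plain uint16_t stand-in for the CUDA half payload:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the 16-bit payload of a CUDA half (the .x member above).
    struct half_bits { uint16_t x; };

    // Absolute value by clearing the sign bit, mirroring result.x = a.x & 0x7FFF in the diff.
    half_bits half_abs(half_bits a) {
      half_bits r;
      r.x = a.x & 0x7FFF;
      return r;
    }

    int main() {
      half_bits minus_two = {0xC000};                  // -2.0 in binary16
      std::printf("0x%04X\n", half_abs(minus_two).x);  // prints 0x4000, i.e. +2.0
      return 0;
    }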
@@ -15,7 +15,7 @@ using Eigen::TensorMap
 
 // TODO(bsteiner): also templatize on the input type since we have users
 // for int8 as well as floats.
-template <typename Device> class BenchmarkSuite {
+template <typename Device, typename T> class BenchmarkSuite {
  public:
   BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
     : m_(m), k_(k), n_(n), device_(device) {
@@ -37,7 +37,7 @@ template <typename Device> class BenchmarkSuite {
     eigen_assert(m_ == k_ && k_ == n_);
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
     }
     // Record the number of values copied per second
     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
@@ -45,13 +45,15 @@ template <typename Device> class BenchmarkSuite {
 
   void typeCasting(int num_iters) {
     eigen_assert(m_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = k_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
     TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
-      B.device(device_) = A.cast<int>();
+      B.device(device_) = A.template cast<int>();
     }
     // Record the number of values copied per second
     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
@@ -59,8 +61,10 @@ template <typename Device> class BenchmarkSuite {
 
   void random(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -72,10 +76,12 @@ template <typename Device> class BenchmarkSuite {
 
   void slicing(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
     const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
@@ -100,10 +106,13 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void rowChip(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
-    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -114,10 +123,13 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void colChip(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
-    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 1> output_size;
+    output_size[0] = n_;
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -129,12 +141,18 @@ template <typename Device> class BenchmarkSuite {
 
   void shuffling(int num_iters) {
     eigen_assert(m_ == n_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
 
-    const Eigen::array<int, 2> shuffle = {{1, 0}};
+    Eigen::array<int, 2> shuffle;
+    shuffle[0] = 1;
+    shuffle[1] = 0;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -146,10 +164,14 @@ template <typename Device> class BenchmarkSuite {
 
   void padding(int num_iters) {
     eigen_assert(m_ == k_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_-3}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{k_, m_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_-3;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = k_;
+    size_b[1] = m_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
 
     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
@@ -165,12 +187,18 @@ template <typename Device> class BenchmarkSuite {
 
   void striding(int num_iters) {
     eigen_assert(m_ == k_);
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_b = {{m_, k_ / 2}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = k_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_b;
+    size_b[0] = m_;
+    size_b[1] = k_/2;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
 
-    const Eigen::array<TensorIndex, 2> strides = {{1, 2}};
+    Eigen::array<TensorIndex, 2> strides;
+    strides[0] = 1;
+    strides[1] = 2;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -181,13 +209,19 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void broadcasting(int num_iters) {
-    const Eigen::array<TensorIndex, 2> size_a = {{m_, 1}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
-    const Eigen::array<TensorIndex, 2> size_c = {{m_, n_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
+    Eigen::array<TensorIndex, 2> size_a;
+    size_a[0] = m_;
+    size_a[1] = 1;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
+    Eigen::array<TensorIndex, 2> size_c;
+    size_c[0] = m_;
+    size_c[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<int, 2> broadcast = {{1, n_}};
+    Eigen::array<int, 2> broadcast;
+    broadcast[0] = 1;
+    broadcast[1] = n_;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -205,10 +239,12 @@ template <typename Device> class BenchmarkSuite {
 
   void coeffWiseOp(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -221,10 +257,12 @@ template <typename Device> class BenchmarkSuite {
 
   void algebraicFunc(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -237,10 +275,12 @@ template <typename Device> class BenchmarkSuite {
 
   void transcendentalFunc(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
-    const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
+    Eigen::array<TensorIndex, 2> sizes;
+    sizes[0] = m_;
+    sizes[1] = m_;
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -253,13 +293,16 @@ template <typename Device> class BenchmarkSuite {
 
   // Row reduction
   void rowReduction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
     const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim[0] = 0;
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -277,15 +320,18 @@ template <typename Device> class BenchmarkSuite {
 
   // Column reduction
   void colReduction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
+    Eigen::array<TensorIndex, 2> input_size;
+    input_size[0] = k_;
+    input_size[1] = n_;
+    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
         b_, input_size);
     const Eigen::array<TensorIndex, 1> output_size = {{k_}};
-    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
+    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
         c_, output_size);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
+    Eigen::array<TensorIndex, 1> sum_along_dim;
+    sum_along_dim = 1;
#else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
@@ -303,16 +349,23 @@ template <typename Device> class BenchmarkSuite {
 
   // do a contraction which is equivalent to a matrix multiplication
   void contraction(int num_iters) {
-    const Eigen::array<TensorIndex, 2> sizeA = {{m_, k_}};
-    const Eigen::array<TensorIndex, 2> sizeB = {{k_, n_}};
-    const Eigen::array<TensorIndex, 2> sizeC = {{m_, n_}};
+    Eigen::array<TensorIndex, 2> sizeA;
+    sizeA[0] = m_;
+    sizeA[1] = k_;
+    Eigen::array<TensorIndex, 2> sizeB;
+    sizeB[0] = k_;
+    sizeB[1] = n_;
+    Eigen::array<TensorIndex, 2> sizeC;
+    sizeC[0] = m_;
+    sizeC[1] = n_;
 
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
+    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
 
-    typedef typename Tensor<float, 2>::DimensionPair DimPair;
-    const Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    typedef typename Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -324,14 +377,21 @@ template <typename Device> class BenchmarkSuite {
   }
 
   void convolution(int num_iters, int kernel_x, int kernel_y) {
-    const Eigen::array<TensorIndex, 2> input_sizes = {{m_, n_}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
-    const Eigen::array<TensorIndex, 2> kernel_sizes = {{kernel_x, kernel_y}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
-    const Eigen::array<TensorIndex, 2> result_sizes =
-        {{m_ - kernel_x + 1, n_ - kernel_y + 1}};
-    TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
-    Eigen::array<Tensor<float, 2>::Index, 2> dims = {{0, 1}};
+    Eigen::array<TensorIndex, 2> input_sizes;
+    input_sizes[0] = m_;
+    input_sizes[1] = n_;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
+    Eigen::array<TensorIndex, 2> kernel_sizes;
+    kernel_sizes[0] = kernel_x;
+    kernel_sizes[1] = kernel_y;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
+    Eigen::array<TensorIndex, 2> result_sizes;
+    result_sizes[0] = m_ - kernel_x + 1;
+    result_sizes[1] = n_ - kernel_y + 1;
+    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
+    Eigen::array<TensorIndex, 2> dims;
+    dims[0] = 0;
+    dims[1] = 1;
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -345,15 +405,15 @@ template <typename Device> class BenchmarkSuite {
 
  private:
   void initialize() {
-    a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
-    b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
-    c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
+    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
+    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
+    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
 
     // Initialize the content of the memory pools to prevent asan from
     // complaining.
-    device_.memset(a_, 12, m_ * k_ * sizeof(float));
-    device_.memset(b_, 23, k_ * n_ * sizeof(float));
-    device_.memset(c_, 31, m_ * n_ * sizeof(float));
+    device_.memset(a_, 12, m_ * k_ * sizeof(T));
+    device_.memset(b_, 23, k_ * n_ * sizeof(T));
+    device_.memset(c_, 31, m_ * n_ * sizeof(T));
 
     //BenchmarkUseRealTime();
   }
@@ -372,9 +432,9 @@ template <typename Device> class BenchmarkSuite {
   TensorIndex m_;
   TensorIndex k_;
   TensorIndex n_;
-  float* a_;
-  float* b_;
-  float* c_;
+  T* a_;
+  T* b_;
+  T* c_;
   Device device_;
 };
 
 #endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
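
Note (added for this writeup, not part of the commit): every caller of BenchmarkSuite now has to name the scalar type explicitly, which is what the ", float" additions in the benchmark drivers below do. A minimal sketch of a direct instantiation, assuming the header above is available as tensor_benchmarks.h (the header name and wrapper function are assumptions; the suite API comes from the diff):

    #include "tensor_benchmarks.h"

    void run_coeff_wise_float(int iters, int n) {
      Eigen::DefaultDevice device;
      // Scalar type is the new second template argument.
      BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, n, n, n);
      suite.coeffWiseOp(iters);
    }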
@@ -13,7 +13,7 @@ Eigen::ThreadPoolDevice device(&pool, threads);
 static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
   StopBenchmarkTiming(); \
   CREATE_THREAD_POOL(THREADS); \
-  BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+  BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
   suite.FUNC(iters); \
 } \
 BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
@@ -81,15 +81,15 @@ BM_FuncCPU(colReduction, 12);
 
 // Contractions
 #define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
-static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
+static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) { \
   StopBenchmarkTiming(); \
   if (THREADS == 1) { \
     Eigen::DefaultDevice device; \
-    BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
+    BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, D1, D2, D3); \
     suite.FUNC(iters); \
   } else { \
     CREATE_THREAD_POOL(THREADS); \
-    BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
+    BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, D1, D2, D3); \
     suite.FUNC(iters); \
   } \
 } \
@@ -138,7 +138,7 @@ BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
 static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
   StopBenchmarkTiming(); \
   CREATE_THREAD_POOL(THREADS); \
-  BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
+  BenchmarkSuite<Eigen::ThreadPoolDevice, float> suite(device, N); \
   suite.FUNC(iters, DIM1, DIM2); \
 } \
 BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
@@ -12,7 +12,7 @@
   StopBenchmarkTiming(); \
   Eigen::CudaStreamDevice stream; \
   Eigen::GpuDevice device(&stream); \
-  BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+  BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
   cudaDeviceSynchronize(); \
   suite.FUNC(iters); \
 } \
@@ -41,7 +41,7 @@ BM_FuncGPU(colReduction);
   StopBenchmarkTiming(); \
   Eigen::CudaStreamDevice stream; \
   Eigen::GpuDevice device(&stream); \
-  BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
+  BenchmarkSuite<Eigen::GpuDevice, float> suite(device, D1, D2, D3); \
   cudaDeviceSynchronize(); \
   suite.FUNC(iters); \
 } \
@@ -60,7 +60,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64);
   StopBenchmarkTiming(); \
   Eigen::CudaStreamDevice stream; \
   Eigen::GpuDevice device(&stream); \
-  BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
+  BenchmarkSuite<Eigen::GpuDevice, float> suite(device, N); \
   cudaDeviceSynchronize(); \
   suite.FUNC(iters, DIM1, DIM2); \
 } \
@@ -58,6 +58,7 @@ typedef unsigned __int64 uint64_t;
 #endif
 
 #ifdef EIGEN_USE_GPU
+#include <iostream>
 #include <cuda_runtime.h>
 #if defined(__CUDACC__)
 #include <curand_kernel.h>
@@ -165,6 +165,18 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
   typename XprType::Nested m_xpr;
 };
 
+template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) {
+    impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+};
+
+template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) {
+    return impl.evalSubExprsIfNeeded(data);
+  }
+};
 
 
 
@@ -195,10 +207,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
   {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return true;
+    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
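
Note (added for this writeup, not part of the commit): ConversionSubExprEval is compile-time dispatch on a bool template parameter. When the source and target scalar types match, evalSubExprsIfNeeded can forward the caller's destination buffer to the nested evaluator; when they differ, the nested expression evaluates into its own storage and coefficients are converted on access. A self-contained sketch of the same dispatch, with hypothetical names:

    #include <iostream>

    // Primary template: types differ, so the nested evaluator keeps its own buffer.
    template <bool SameType, typename Eval, typename Scalar>
    struct SubExprEval {
      static bool run(Eval& impl, Scalar*) {
        impl.evalSubExprsIfNeeded(static_cast<Scalar*>(0));
        return true;  // caller still pulls coefficients through the converting accessor
      }
    };

    // Specialization: same scalar type, hand the destination buffer straight through.
    template <typename Eval, typename Scalar>
    struct SubExprEval<true, Eval, Scalar> {
      static bool run(Eval& impl, Scalar* data) {
        return impl.evalSubExprsIfNeeded(data);
      }
    };

    // Minimal evaluator stub so the sketch is runnable.
    struct DummyEval {
      bool evalSubExprsIfNeeded(float* dst) {
        std::cout << (dst ? "writing into caller buffer\n" : "evaluating into own buffer\n");
        return dst == 0;
      }
    };

    int main() {
      DummyEval e;
      float buf[4];
      SubExprEval<true, DummyEval, float>::run(e, buf);   // same-type path
      SubExprEval<false, DummyEval, float>::run(e, buf);  // conversion path
      return 0;
    }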
@@ -230,10 +230,10 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
       pos_j_base_powered[1] = pos_j_base;
       if (line_len > 2) {
         const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-        for (int i = 2; i < line_len + 1; ++i) {
-          pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
-                                  pos_j_base_powered[i - 1] /
-                                  pos_j_base_powered[i - 2] * pos_j_base_sq;
+        for (int j = 2; j < line_len + 1; ++j) {
+          pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
+                                  pos_j_base_powered[j - 1] /
+                                  pos_j_base_powered[j - 2] * pos_j_base_sq;
         }
       }
     }
@@ -468,7 +468,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
   template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     // Original code:
     // RealScalar wtemp = std::sin(M_PI/n);
     // RealScalar wpi = -std::sin(2 * M_PI/n);
@@ -482,9 +482,9 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     const ComplexScalar wp_one_2 = wp_one * wp_one;
     const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
     const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
-    const int n2 = n / 2;
+    const Index n2 = n / 2;
     ComplexScalar w(1.0, 0.0);
-    for (int i = 0; i < n2; i += 4) {
+    for (Index i = 0; i < n2; i += 4) {
       ComplexScalar temp0(data[i + n2] * w);
       ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
       ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
@@ -507,7 +507,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
   template <int Dir>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
-      ComplexScalar* data, int n, int n_power_of_2) {
+      ComplexScalar* data, Index n, Index n_power_of_2) {
     eigen_assert(isPowerOfTwo(n));
     if (n > 8) {
       compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
@@ -167,7 +167,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 5,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -437,59 +437,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // Location of the first element of the patch.
-    // ColMajor
-    // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches
-    // RowMajor
-    // 0: number of batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: d
-    const Index patch2DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 1];
-
-    array<Index, NumDims-1> inputCoords;
-    Index input_col_idx = patch2DIndex / m_fastInputColsEff;
-    Index inputCol = input_col_idx + coords[1] * m_in_row_strides - m_rowPaddingTop;
-    Index inputRow = patch2DIndex - input_col_idx * m_input_cols_eff + coords[2] * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0]; // depth
-      inputCoords[1] = origInputCol;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = coords[4]; // batch
-    } else {
-      inputCoords[3] = coords[4]; // depth
-      inputCoords[2] = origInputCol;
-      inputCoords[1] = origInputRow;
-      inputCoords[0] = coords[0]; // batch
-    }
-    // If the computed coordinates are outside the original image perimeter, return 0.
-    if (inputCol < 0 || inputCol >= m_input_cols_eff || inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides)) ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[3] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[1] * m_rowInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[1] * m_patchInputStride +
-          inputCoords[2] * m_colInputStride +
-          inputCoords[3] * m_rowInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
@@ -318,7 +318,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -457,15 +457,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
     }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords)
-  {
-    array<Index, NumDims> inputCoords;
-    for (int i = 0; i < NumDims; ++i) {
-      inputCoords = coords[i] + this->m_offsets[i];
-    }
-    return m_impl.coeff(inputCoords);
-  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
     Scalar* result = m_impl.data();
     if (result) {
@@ -547,7 +538,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -608,15 +599,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       }
     }
   }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords)
-  {
-    array<Index, NumDims> inputCoords;
-    for (int i = 0; i < NumDims; ++i) {
-      inputCoords = coords[i] + this->m_offsets[i];
-    }
-    return this->m_impl.coeffRef(inputCoords);
-  }
 };
 
 
@@ -151,27 +151,27 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex += (index - m_padding[0].first);
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-          return Scalar(0);
+          return internal::scalar_cast_op<int, Scalar>()(0);
         }
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
         index -= idx * m_outputStrides[i+1];
       }
       if (index < m_padding[NumDims-1].first ||
           index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex += (index - m_padding[NumDims-1].first);
     }
@@ -194,14 +194,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     {
       const Index idx = coords[0];
       if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex = idx - m_padding[0].first;
     }
     for (int i = 1; i < NumDims; ++i) {
       const Index idx = coords[i];
       if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
     }
@@ -209,14 +209,14 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     {
       const Index idx = coords[NumDims-1];
       if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
       }
       inputIndex = idx - m_padding[NumDims-1].first;
     }
     for (int i = NumDims - 2; i >= 0; --i) {
       const Index idx = coords[i];
       if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
-        return Scalar(0);
+        return internal::scalar_cast_op<int, Scalar>()(0);
      }
       inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
     }
@@ -245,11 +245,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
@@ -271,11 +271,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
@@ -304,11 +304,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
@@ -330,11 +330,11 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     if (last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
-      return internal::pset1<PacketReturnType>(Scalar(0));
+      return internal::pset1<PacketReturnType>(internal::scalar_cast_op<int, Scalar>()(0));
     }
     else if (first >= lastPaddedLeft && last < firstPaddedRight) {
       // all the coefficient are between the 2 padding zones.
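
Note (added for this writeup, not part of the commit): the padding evaluator used to build its zero with Scalar(0), which requires the scalar type to be constructible from an int; the plain CUDA half struct targeted by this series presumably provides no such constructor, so the literal is now routed through Eigen's internal::scalar_cast_op<int, Scalar> functor instead. A self-contained sketch of the same idea with a hypothetical cast functor and scalar type:

    #include <cstdint>

    // Hypothetical 16-bit scalar with no constructor from int, standing in for CUDA's half.
    struct TinyFloat { uint16_t bits; };

    // Generic cast functor in the spirit of internal::scalar_cast_op.
    template <typename From, typename To>
    struct cast_op {
      To operator()(const From& v) const { return static_cast<To>(v); }
    };

    // Specialization that knows how to build a TinyFloat from an int (only 0 and 1 handled here).
    template <>
    struct cast_op<int, TinyFloat> {
      TinyFloat operator()(const int& v) const {
        TinyFloat t;
        t.bits = (v == 0) ? 0x0000 : 0x3C00;  // 0x3C00 is 1.0 in binary16
        return t;
      }
    };

    // Mirrors the replacement made in the diff: a typed zero without relying on Scalar(int).
    template <typename Scalar>
    Scalar typed_zero() { return cast_op<int, Scalar>()(0); }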
@@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -248,56 +248,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
       }
     }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    Index patch_coord_idx = Layout == ColMajor ? NumDims - 1 : 0;
-    // Location of the first element of the patch.
-    const Index patchIndex = coords[patch_coord_idx];
-
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      array<Index, NumDims-1> inputCoords;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputCoords[i] = coords[i] + patchIdx;
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputCoords[i] = coords[i+1] + patchIdx;
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputCoords[0] = (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputCoords);
-    }
-    else {
-      Index inputIndex = 0;
-      if (Layout == ColMajor) {
-        for (int i = NumDims - 2; i > 0; --i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      } else {
-        for (int i = 0; i < NumDims - 2; ++i) {
-          const Index patchIdx = patchIndex / m_patchStrides[i];
-          patchIndex -= patchIdx * m_patchStrides[i];
-          const Index offsetIdx = coords[i+1];
-          inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
-        }
-      }
-      Index coords_idx = Layout == ColMajor ? 0 : NumDims - 1;
-      inputIndex += (patchIndex + coords[coords_idx]);
-      return m_impl.coeff(inputIndex);
-    }
-  }
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
@@ -180,7 +180,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = NumDims == 6,
+    CoordAccess = false,
     RawAccess = false
   };
 
@@ -518,79 +518,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
-  {
-    // ColMajor
-    // 0: depth, 1: patch_planes, 2: patch_rows, 3: patch_cols, 4: number of patches, 5: batches
-    // RowMajor
-    // 0: batches, 1: number of patches, 2: patch_cols , 3: patch_rows, 4: patch_planes, 5: depth
-    const Index patch3DIndex = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 4 : 1];
-    const Index colOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 3 : 2];
-    const Index rowOffset= coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 3];
-    const Index planeOffset = coords[static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 4];
-
-    array<Index, NumDims-1> inputCoords;
-
-    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
-    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
-    const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
-        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
-    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
-    const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
-        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    const Index planeIndex = patch3DIndex - colIndex * m_outputPlanesRows - rowIndex * m_outputRows;
-    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
-    const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
-    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
-        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
-      return Scalar(m_paddingValue);
-    }
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputCoords[0] = coords[0]; // depth
-      inputCoords[1] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[3] = origInputCol;
-      inputCoords[4] = coords[5]; // batch
-    } else {
-      inputCoords[4] = coords[5]; // depth
-      inputCoords[3] = origInputPlane;
-      inputCoords[2] = origInputRow;
-      inputCoords[1] = origInputCol;
-      inputCoords[0] = coords[0]; // batch
-    }
-    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
-      return m_impl.coeff(inputCoords);
-    } else {
-      Index inputIndex;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        inputIndex =
-          inputCoords[4] * m_otherInputStride +
-          inputCoords[3] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[1] * m_planeInputStride +
-          inputCoords[0];
-      } else {
-        inputIndex =
-          inputCoords[0] * m_otherInputStride +
-          inputCoords[1] * m_colInputStride +
-          inputCoords[2] * m_rowInputStride +
-          inputCoords[3] * m_planeInputStride +
-          inputCoords[4];
-      }
-      return m_impl.coeff(inputIndex);
-    }
-  }
-
 protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
@@ -55,6 +55,44 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_conv);
 }
 
+
+void test_cuda_unary() {
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
 
 void test_cuda_elementwise() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
@@ -201,7 +239,10 @@ void test_cxx11_tensor_of_float16_cuda()
   Eigen::GpuDevice device(&stream);
   if (device.majorDeviceVersion() > 5 ||
       (device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
+    std::cout << "Running test on device with capability " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << std::endl;
+
     CALL_SUBTEST_1(test_cuda_conversion());
+    CALL_SUBTEST_1(test_cuda_unary());
     CALL_SUBTEST_1(test_cuda_elementwise());
     // CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
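
Note (added for this writeup, not part of the commit): test_cuda_unary follows the pattern of the other float16 tests in this file: evaluate the same expression in full float precision and through a half round trip, then compare element by element. A host-only sketch of that structure, with no CUDA or Eigen dependencies (the round-trip helper and tolerance are crude assumptions):

    #include <cassert>
    #include <cmath>
    #include <cstdlib>
    #include <vector>

    // Crude stand-in for a float -> half -> float round trip: quantize to steps of 1/1024,
    // which roughly matches binary16 resolution for values in [-1, 1].
    float round_trip_through_half(float x) {
      return std::ldexp(std::round(std::ldexp(x, 10)), -10);
    }

    int main() {
      std::vector<float> input(101);
      for (size_t i = 0; i < input.size(); ++i)
        input[i] = -1.0f + 2.0f * static_cast<float>(std::rand()) / RAND_MAX;

      for (size_t i = 0; i < input.size(); ++i) {
        float full_prec = std::fabs(input[i]);
        float half_prec = std::fabs(round_trip_through_half(input[i]));
        assert(std::fabs(full_prec - half_prec) < 1e-2f);  // assumed tolerance
      }
      return 0;
    }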