mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-23 10:09:36 +08:00
Added extra tensor benchmarks
This commit is contained in:
parent
7b3044d086
commit
c8d5f21941
@ -45,6 +45,20 @@ template <typename Device> class BenchmarkSuite {
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void typeCasting(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
|
||||
TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.cast<int>();
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void random(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
|
||||
@ -87,6 +101,34 @@ template <typename Device> class BenchmarkSuite {
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void rowChip(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % k_, 0);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(n_ * num_iters);
|
||||
}
|
||||
|
||||
void colChip(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.chip(iter % n_, 1);
|
||||
}
|
||||
// Record the number of values copied from the rhs chip to the lhs.
|
||||
finalizeBenchmark(n_ * num_iters);
|
||||
}
|
||||
|
||||
void shuffling(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
|
||||
@ -147,7 +189,6 @@ template <typename Device> class BenchmarkSuite {
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
// nvcc doesn't support cxx11
|
||||
const Eigen::array<int, 2> broadcast = {{1, n_}};
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
@ -212,14 +253,20 @@ template <typename Device> class BenchmarkSuite {
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
// Simple reduction
|
||||
void reduction(int num_iters) {
|
||||
// Row reduction
|
||||
void rowReduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
|
||||
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
@ -227,7 +274,33 @@ template <typename Device> class BenchmarkSuite {
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
finalizeBenchmark(k_ * n_ * num_iters);
|
||||
}
|
||||
|
||||
// Column reduction
|
||||
void colReduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
|
||||
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
|
||||
b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size = {{k_}};
|
||||
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
|
||||
c_, output_size);
|
||||
|
||||
#ifndef EIGEN_HAS_INDEX_LIST
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(k_ * n_ * num_iters);
|
||||
}
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
|
@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
|
||||
BM_FuncCPU(memcpy, 8);
|
||||
BM_FuncCPU(memcpy, 12);
|
||||
|
||||
// Register the typeCasting benchmark on the CPU device three times, with the
// second macro argument set to 4, 8 and 12 (presumably the thread count --
// confirm against the BM_FuncCPU definition).
BM_FuncCPU(typeCasting, 4);
|
||||
BM_FuncCPU(typeCasting, 8);
|
||||
BM_FuncCPU(typeCasting, 12);
|
||||
|
||||
BM_FuncCPU(random, 4);
|
||||
BM_FuncCPU(random, 8);
|
||||
BM_FuncCPU(random, 12);
|
||||
@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4);
|
||||
BM_FuncCPU(slicing, 8);
|
||||
BM_FuncCPU(slicing, 12);
|
||||
|
||||
// Register the rowChip and colChip benchmarks on the CPU device, each with
// the second macro argument set to 4, 8 and 12 (presumably the thread count
// -- confirm against the BM_FuncCPU definition).
BM_FuncCPU(rowChip, 4);
|
||||
BM_FuncCPU(rowChip, 8);
|
||||
BM_FuncCPU(rowChip, 12);
|
||||
|
||||
BM_FuncCPU(colChip, 4);
|
||||
BM_FuncCPU(colChip, 8);
|
||||
BM_FuncCPU(colChip, 12);
|
||||
|
||||
BM_FuncCPU(shuffling, 4);
|
||||
BM_FuncCPU(shuffling, 8);
|
||||
BM_FuncCPU(shuffling, 12);
|
||||
@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
|
||||
BM_FuncCPU(transcendentalFunc, 8);
|
||||
BM_FuncCPU(transcendentalFunc, 12);
|
||||
|
||||
BM_FuncCPU(reduction, 4);
|
||||
BM_FuncCPU(reduction, 8);
|
||||
BM_FuncCPU(reduction, 12);
|
||||
// Register the rowReduction and colReduction benchmarks on the CPU device,
// each with the second macro argument set to 4, 8 and 12 (presumably the
// thread count -- confirm against the BM_FuncCPU definition).
BM_FuncCPU(rowReduction, 4);
|
||||
BM_FuncCPU(rowReduction, 8);
|
||||
BM_FuncCPU(rowReduction, 12);
|
||||
|
||||
BM_FuncCPU(colReduction, 4);
|
||||
BM_FuncCPU(colReduction, 8);
|
||||
BM_FuncCPU(colReduction, 12);
|
||||
|
||||
|
||||
// Contractions
|
||||
@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
|
||||
|
||||
// Contraction benchmarks with the third dimension argument pinned to 64 and
// the other two left at N. The last argument takes the values 1, 4, 8, 12
// and 16 (presumably the thread count -- confirm against the
// BM_FuncWithInputDimsCPU definition, which also fixes how the three
// dimension arguments map onto the suite's sizes).
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
|
||||
|
@ -19,6 +19,7 @@
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(typeCasting);
|
||||
BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(shuffling);
|
||||
@ -26,7 +27,10 @@ BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(reduction);
|
||||
BM_FuncGPU(algebraicFunc);
|
||||
BM_FuncGPU(transcendentalFunc);
|
||||
BM_FuncGPU(rowReduction);
|
||||
BM_FuncGPU(colReduction);
|
||||
|
||||
|
||||
// Contractions
|
||||
@ -45,6 +49,7 @@ BM_FuncGPU(reduction);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
|
||||
|
||||
|
||||
// Convolutions
|
||||
|
Loading…
x
Reference in New Issue
Block a user