Added extra tensor benchmarks

This commit is contained in:
Benoit Steiner 2016-01-28 16:20:36 -08:00
parent 7b3044d086
commit c8d5f21941
3 changed files with 111 additions and 11 deletions

View File

@ -45,6 +45,20 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
// Benchmarks an element-wise float -> int conversion.
//
// A float tensor mapped over a_ is cast to int and written into the storage
// behind b_, which is reinterpreted as an int buffer for the occasion.
//
// num_iters: number of times the cast-and-assign expression is evaluated.
void typeCasting(int num_iters) {
  // NOTE(review): presumably required because b_ is mapped as k_ x n_
  // elsewhere in this suite while it is viewed as m_ x k_ here - confirm.
  eigen_assert(m_ == n_);
  const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
  const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
  // Named cast instead of a C-style cast: the float storage is deliberately
  // reused as int storage, and reinterpret_cast makes that explicit.
  TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B(
      reinterpret_cast<int*>(b_), sizes);

  StartBenchmarkTiming();
  for (int iter = 0; iter < num_iters; ++iter) {
    B.device(device_) = A.cast<int>();
  }
  // Record the number of values converted per second.
  // NOTE(review): the product is evaluated in the operands' own integer type;
  // confirm it cannot overflow for the largest benchmarked sizes.
  finalizeBenchmark(m_ * k_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
@ -87,6 +101,34 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
void rowChip(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.chip(iter % k_, 0);
}
// Record the number of values copied from the rhs chip to the lhs.
finalizeBenchmark(n_ * num_iters);
}
void colChip(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size= {{k_, n_}};
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.chip(iter % n_, 1);
}
// Record the number of values copied from the rhs chip to the lhs.
finalizeBenchmark(n_ * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
@ -147,7 +189,6 @@ template <typename Device> class BenchmarkSuite {
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
#ifndef EIGEN_HAS_INDEX_LIST
// nvcc doesn't support cxx11
const Eigen::array<int, 2> broadcast = {{1, n_}};
#else
// Take advantage of cxx11 to give the compiler information it can use to
@ -212,14 +253,20 @@ template <typename Device> class BenchmarkSuite {
finalizeBenchmark(m_ * m_ * num_iters);
}
// Simple reduction
void reduction(int num_iters) {
// Row reduction
void rowReduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size = {{n_}};
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
#ifndef EIGEN_HAS_INDEX_LIST
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
@ -227,7 +274,33 @@ template <typename Device> class BenchmarkSuite {
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
finalizeBenchmark(k_ * n_ * num_iters);
}
// Column reduction
void colReduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
b_, input_size);
const Eigen::array<TensorIndex, 1> output_size = {{k_}};
TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
c_, output_size);
#ifndef EIGEN_HAS_INDEX_LIST
const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(k_ * n_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication

View File

@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
// CPU instantiations for the memcpy, typeCasting and random benchmarks.
// NOTE(review): the second macro argument (4 / 8 / 12) is presumably a thread
// count - confirm against the BM_FuncCPU definition, which is not shown here.
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
BM_FuncCPU(typeCasting, 4);
BM_FuncCPU(typeCasting, 8);
BM_FuncCPU(typeCasting, 12);
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4);
// CPU instantiations for the slicing, rowChip, colChip and shuffling
// benchmarks, each with the second macro argument set to 4, 8 and 12.
// NOTE(review): that argument is presumably a thread count - confirm against
// the BM_FuncCPU definition.
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
BM_FuncCPU(rowChip, 4);
BM_FuncCPU(rowChip, 8);
BM_FuncCPU(rowChip, 12);
BM_FuncCPU(colChip, 4);
BM_FuncCPU(colChip, 8);
BM_FuncCPU(colChip, 12);
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
// CPU instantiations for the transcendentalFunc and reduction benchmarks.
// NOTE(review): `reduction` is instantiated next to `rowReduction` and
// `colReduction`; confirm that a `reduction` method still exists in the
// benchmark suite before keeping those three lines.
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
BM_FuncCPU(reduction, 4);
BM_FuncCPU(reduction, 8);
BM_FuncCPU(reduction, 12);
BM_FuncCPU(rowReduction, 4);
BM_FuncCPU(rowReduction, 8);
BM_FuncCPU(rowReduction, 12);
BM_FuncCPU(colReduction, 4);
BM_FuncCPU(colReduction, 8);
BM_FuncCPU(colReduction, 12);
// Contractions
@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
// CPU contraction instantiations in which one of the three dimension
// arguments is pinned to a constant (64 or 1) and the others are left as N.
// NOTE(review): the three middle arguments presumably map to the suite's
// dimensions and the last one to a thread count - confirm against the
// BM_FuncWithInputDimsCPU definition, which is not shown here.
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);

View File

@ -19,6 +19,7 @@
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
// GPU instantiations for the data-movement style benchmarks; each line names
// one benchmark method passed to the BM_FuncGPU macro.
BM_FuncGPU(memcpy);
BM_FuncGPU(typeCasting);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(shuffling);
@ -26,7 +27,10 @@ BM_FuncGPU(padding);
// GPU instantiations for striding, broadcasting, coefficient-wise, function
// and reduction benchmarks.
// NOTE(review): `reduction` is instantiated next to `rowReduction` and
// `colReduction`; confirm that a `reduction` method still exists in the
// benchmark suite before keeping that line.
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(reduction);
BM_FuncGPU(algebraicFunc);
BM_FuncGPU(transcendentalFunc);
BM_FuncGPU(rowReduction);
BM_FuncGPU(colReduction);
// Contractions
@ -45,6 +49,7 @@ BM_FuncGPU(reduction);
// GPU contraction instantiations: all three dimension arguments left as N,
// then each argument in turn pinned to 64.
// NOTE(review): the mapping of the three arguments onto the suite's
// dimensions is defined by BM_FuncWithInputDimsGPU, which is not shown here.
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
BM_FuncWithInputDimsGPU(contraction, N, N, 64);
// Convolutions