mirror of https://gitlab.com/libeigen/eigen.git
synced 2025-07-19 11:24:26 +08:00

commit c8d5f21941 (parent 7b3044d086)

Added extra tensor benchmarks
@@ -45,6 +45,20 @@ template <typename Device> class BenchmarkSuite {
     finalizeBenchmark(m_ * m_ * num_iters);
   }
 
+  void typeCasting(int num_iters) {
+    eigen_assert(m_ == n_);
+    const Eigen::array<TensorIndex, 2> sizes = {{m_, k_}};
+    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> A(a_, sizes);
+    TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> B((int*)b_, sizes);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      B.device(device_) = A.cast<int>();
+    }
+    // Record the number of values copied per second
+    finalizeBenchmark(m_ * k_ * num_iters);
+  }
+
   void random(int num_iters) {
     eigen_assert(m_ == k_ && k_ == n_);
     const Eigen::array<TensorIndex, 2> sizes = {{m_, m_}};
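The new typeCasting benchmark times an element-wise float-to-int conversion
through the device. As a minimal standalone sketch of the same expression
(not part of the commit; it only assumes Eigen's unsupported Tensor module is
on the include path):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(4, 4);
      a.setRandom();
      // Element-wise float -> int cast, the expression the benchmark
      // evaluates through device_ on aligned TensorMaps.
      Eigen::Tensor<int, 2> b = a.cast<int>();
      return b(0, 0) == static_cast<int>(a(0, 0)) ? 0 : 1;
    }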
@@ -87,6 +101,34 @@ template <typename Device> class BenchmarkSuite {
     finalizeBenchmark(m_ * m_ * num_iters);
   }
 
+  void rowChip(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(n_ * num_iters);
+  }
+
+  void colChip(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
+    const Eigen::array<TensorIndex, 1> output_size = {{n_}};
+    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+    // Record the number of values copied from the rhs chip to the lhs.
+    finalizeBenchmark(n_ * num_iters);
+  }
+
   void shuffling(int num_iters) {
     eigen_assert(m_ == n_);
     const Eigen::array<TensorIndex, 2> size_a = {{m_, k_}};
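The chip benchmarks exercise Eigen's chipping operation: B.chip(i, d) fixes
index i along dimension d and yields a view with one fewer dimension. A small
sketch of the semantics, independent of the benchmark harness:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> b(3, 5);  // shape (k, n) = (3, 5)
      b.setRandom();
      // Fix dimension 0 at index 1: a length-n slice, as in rowChip.
      Eigen::Tensor<float, 1> row = b.chip(1, 0);
      // Fix dimension 1 at index 2: a length-k slice, as in colChip.
      Eigen::Tensor<float, 1> col = b.chip(2, 1);
      return (row.dimension(0) == 5 && col.dimension(0) == 3) ? 0 : 1;
    }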
@@ -147,7 +189,6 @@ template <typename Device> class BenchmarkSuite {
     TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
 
 #ifndef EIGEN_HAS_INDEX_LIST
-    // nvcc doesn't support cxx11
     const Eigen::array<int, 2> broadcast = {{1, n_}};
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
@@ -212,14 +253,20 @@ template <typename Device> class BenchmarkSuite {
     finalizeBenchmark(m_ * m_ * num_iters);
   }
 
-  // Simple reduction
-  void reduction(int num_iters) {
+  // Row reduction
+  void rowReduction(int num_iters) {
     const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
-    const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
+    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
     const Eigen::array<TensorIndex, 1> output_size = {{n_}};
-    TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
+    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
 
-    const Eigen::array<TensorIndex, 1> sum_along_dim = {{0}};
+#ifndef EIGEN_HAS_INDEX_LIST
+    const Eigen::array<TensorIndex, 1> sum_along_dim(0);
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
+#endif
 
     StartBenchmarkTiming();
     for (int iter = 0; iter < num_iters; ++iter) {
@@ -227,7 +274,33 @@ template <typename Device> class BenchmarkSuite {
     }
     // Record the number of FLOP executed per second (assuming one operation
     // per value)
-    finalizeBenchmark(m_ * m_ * num_iters);
+    finalizeBenchmark(k_ * n_ * num_iters);
+  }
+
+  // Column reduction
+  void colReduction(int num_iters) {
+    const Eigen::array<TensorIndex, 2> input_size = {{k_, n_}};
+    const TensorMap<Tensor<float, 2, 0, TensorIndex>, Eigen::Aligned> B(
+        b_, input_size);
+    const Eigen::array<TensorIndex, 1> output_size = {{k_}};
+    TensorMap<Tensor<float, 1, 0, TensorIndex>, Eigen::Aligned> C(
+        c_, output_size);
+
+#ifndef EIGEN_HAS_INDEX_LIST
+    const Eigen::array<TensorIndex, 1> sum_along_dim = {{1}};
+#else
+    // Take advantage of cxx11 to give the compiler information it can use to
+    // optimize the code.
+    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
+#endif
+
+    StartBenchmarkTiming();
+    for (int iter = 0; iter < num_iters; ++iter) {
+      C.device(device_) = B.sum(sum_along_dim);
+    }
+    // Record the number of FLOP executed per second (assuming one operation
+    // per value)
+    finalizeBenchmark(k_ * n_ * num_iters);
   }
 
   // do a contraction which is equivalent to a matrix multiplication
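When EIGEN_HAS_INDEX_LIST is defined, both new reductions encode the reduced
dimension in the type via Eigen::IndexList and type2index, so the evaluator
sees it at compile time; otherwise a runtime Eigen::array is used. A sketch
of the two forms (assuming a C++11 build of the Tensor module):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> b(3, 5);  // shape (k, n)
      b.setRandom();

      // Runtime-specified reduction dimension: sum over dim 0 -> length n.
      Eigen::array<int, 1> dim0 = {{0}};
      Eigen::Tensor<float, 1> row_sums = b.sum(dim0);

      // Compile-time reduction dimension: sum over dim 1 -> length k.
      Eigen::IndexList<Eigen::type2index<1>> dim1;
      Eigen::Tensor<float, 1> col_sums = b.sum(dim1);

      return (row_sums.dimension(0) == 5 && col_sums.dimension(0) == 3) ? 0 : 1;
    }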
@@ -22,6 +22,10 @@ BM_FuncCPU(memcpy, 4);
 BM_FuncCPU(memcpy, 8);
 BM_FuncCPU(memcpy, 12);
 
+BM_FuncCPU(typeCasting, 4);
+BM_FuncCPU(typeCasting, 8);
+BM_FuncCPU(typeCasting, 12);
+
 BM_FuncCPU(random, 4);
 BM_FuncCPU(random, 8);
 BM_FuncCPU(random, 12);
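The BM_FuncCPU(FUNC, THREADS) helper these lines instantiate is defined
earlier in the driver file and is not shown in this diff. As a rough, assumed
sketch of the pattern only (the real macro body and device setup may differ),
each instantiation defines one benchmark per thread count and registers it
over a range of tensor sizes:

    // Assumed shape of the registration macro; not part of the commit.
    #define BM_FuncCPU(FUNC, THREADS)                                \
      static void BM_##FUNC##_##THREADS##T(int iters, int N) {       \
        Eigen::ThreadPool pool(THREADS);                             \
        Eigen::ThreadPoolDevice device(&pool, THREADS);              \
        BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);    \
        suite.FUNC(iters);                                           \
      }                                                              \
      BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);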
@@ -30,6 +34,14 @@ BM_FuncCPU(slicing, 4);
 BM_FuncCPU(slicing, 8);
 BM_FuncCPU(slicing, 12);
 
+BM_FuncCPU(rowChip, 4);
+BM_FuncCPU(rowChip, 8);
+BM_FuncCPU(rowChip, 12);
+
+BM_FuncCPU(colChip, 4);
+BM_FuncCPU(colChip, 8);
+BM_FuncCPU(colChip, 12);
+
 BM_FuncCPU(shuffling, 4);
 BM_FuncCPU(shuffling, 8);
 BM_FuncCPU(shuffling, 12);
@@ -58,9 +70,13 @@ BM_FuncCPU(transcendentalFunc, 4);
 BM_FuncCPU(transcendentalFunc, 8);
 BM_FuncCPU(transcendentalFunc, 12);
 
-BM_FuncCPU(reduction, 4);
-BM_FuncCPU(reduction, 8);
-BM_FuncCPU(reduction, 12);
+BM_FuncCPU(rowReduction, 4);
+BM_FuncCPU(rowReduction, 8);
+BM_FuncCPU(rowReduction, 12);
+
+BM_FuncCPU(colReduction, 4);
+BM_FuncCPU(colReduction, 8);
+BM_FuncCPU(colReduction, 12);
 
 
 // Contractions
@@ -98,6 +114,12 @@ BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
 BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
 
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 1);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 4);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 8);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 12);
+BM_FuncWithInputDimsCPU(contraction, N, N, 64, 16);
+
 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
 BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
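As the header's comment notes, the contraction benchmark performs a tensor
contraction equivalent to a matrix product; the new registrations cover the
(N, N, 64) input-dimension combinations. For reference, a minimal sketch of
that equivalence with the Tensor module's contract API:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
      a.setRandom();
      b.setRandom();
      // Contract a's dimension 1 with b's dimension 0: C = A * B for
      // matrices, which the benchmark evaluates for various (m, k, n).
      Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
      Eigen::Tensor<float, 2> c = a.contract(b, dims);
      return (c.dimension(0) == 2 && c.dimension(1) == 4) ? 0 : 1;
    }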
@@ -19,6 +19,7 @@
 BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
 
 BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
 BM_FuncGPU(random);
 BM_FuncGPU(slicing);
 BM_FuncGPU(shuffling);
@@ -26,7 +27,10 @@ BM_FuncGPU(padding);
 BM_FuncGPU(striding);
 BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
-BM_FuncGPU(reduction);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
 
 
 // Contractions
@@ -45,6 +49,7 @@ BM_FuncGPU(reduction);
 BM_FuncWithInputDimsGPU(contraction, N, N, N);
 BM_FuncWithInputDimsGPU(contraction, 64, N, N);
 BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
 
 
 // Convolutions