eigen/bench/tensors/tensor_contract_sycl_bench.cc

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_BENCH_CONTRACT_SYCL
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <CL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
#include <ctime>

#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
std::ofstream out("Result.txt");

std::chrono::time_point<std::chrono::system_clock> get_time() {
  std::chrono::time_point<std::chrono::system_clock> start, end;
  return std::chrono::system_clock::now();
}

template <typename Start, typename End, typename TensorIndex>
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_, TensorIndex num_iters,
                       std::string name) {
  std::chrono::duration<double> elapsed_seconds = end - start;
  std::cout << "Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : "
            << static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters) / elapsed_seconds.count()) * 1e-9
            << "\n";
  out << "Kernel Name : " << name << ", M : " << m_ << ",  N : " << n_ << ", K : " << k_ << " GFLOP/s : "
      << static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters) / elapsed_seconds.count()) * 1e-9
      << "\n";
}

// do a contraction which is equivalent to a matrix multiplication
template <typename T, typename Device, typename TensorIndex>
void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
  T* a_;
  T* b_;
  T* c_;
  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.fill(a_, a_ + (m_ * k_), T(12));
  device_.fill(b_, b_ + (k_ * n_), T(23));
  device_.fill(c_, c_ + (m_ * n_), T(31));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = m_;
  sizeA[1] = k_;
  Eigen::array<TensorIndex, 2> sizeB;
  sizeB[0] = k_;
  sizeB[1] = n_;
  Eigen::array<TensorIndex, 2> sizeC;
  sizeC[0] = m_;
  sizeC[1] = n_;

  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
  TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);

  typedef typename Tensor<T, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL  // warmup for sycl
  for (int iter = 0; iter < 10; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
#endif
  auto start = get_time();
  for (int iter = 0; iter < num_iters; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
  auto end = get_time();
  // Record the number of FLOPs executed per second (size_ multiplications and
  // additions for each value in the resulting tensor)
  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
  device_.deallocate(a_);
  device_.deallocate(b_);
  device_.deallocate(c_);
  device_.synchronize();
}

// do a contraction which is equivalent to a matrix multiplication
template <typename T, typename Device, typename TensorIndex>
void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
  T* a_;
  T* b_;
  T* c_;
  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
  device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
  device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = m_;
  sizeA[1] = k_;
  Eigen::array<TensorIndex, 2> sizeB;
  sizeB[0] = k_;
  sizeB[1] = n_;
  Eigen::array<TensorIndex, 2> sizeC;
  sizeC[0] = m_;
  sizeC[1] = n_;

  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);

  typedef typename Tensor<T, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(1, 0);
#ifdef EIGEN_USE_SYCL  // warmup for sycl
  for (int iter = 0; iter < 10; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
#endif
  auto start = get_time();
  for (int iter = 0; iter < num_iters; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
  auto end = get_time();
  // Record the number of FLOPs executed per second (size_ multiplications and
  // additions for each value in the resulting tensor)
  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
  device_.deallocate(a_);
  device_.deallocate(b_);
  device_.deallocate(c_);
  device_.synchronize();
}

template <typename T, typename Device, typename TensorIndex>
void contractionAT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
  T* a_;
  T* b_;
  T* c_;
  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.memset(a_, 12, m_ * k_ * sizeof(T));
  device_.memset(b_, 23, k_ * n_ * sizeof(T));
  device_.memset(c_, 31, m_ * n_ * sizeof(T));
  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = k_;
  sizeA[1] = m_;
  Eigen::array<TensorIndex, 2> sizeB;
  sizeB[0] = k_;
  sizeB[1] = n_;
  Eigen::array<TensorIndex, 2> sizeC;
  sizeC[0] = m_;
  sizeC[1] = n_;

  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);

  typedef typename Tensor<T, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(0, 0);
#ifdef EIGEN_USE_SYCL  // warmup for sycl
  for (int iter = 0; iter < 10; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
#endif
  auto start = get_time();
  for (int iter = 0; iter < num_iters; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
  auto end = get_time();
  // Record the number of FLOPs executed per second (size_ multiplications and
  // additions for each value in the resulting tensor)
  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
  device_.deallocate(a_);
  device_.deallocate(b_);
  device_.deallocate(c_);
  device_.synchronize();
}

template <typename T, typename Device, typename TensorIndex>
void contractionBT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
  T* a_;
  T* b_;
  T* c_;
  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.memset(a_, 12, m_ * k_ * sizeof(T));
  device_.memset(b_, 23, k_ * n_ * sizeof(T));
  device_.memset(c_, 31, m_ * n_ * sizeof(T));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = m_;
  sizeA[1] = k_;
  Eigen::array<TensorIndex, 2> sizeB;
  sizeB[0] = n_;
  sizeB[1] = k_;
  Eigen::array<TensorIndex, 2> sizeC;
  sizeC[0] = m_;
  sizeC[1] = n_;

  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);

  typedef typename Tensor<T, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(1, 1);
#ifdef EIGEN_USE_SYCL  // warmup for sycl
  for (int iter = 0; iter < 10; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
#endif
  auto start = get_time();
  for (int iter = 0; iter < num_iters; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
  auto end = get_time();
  // Record the number of FLOPs executed per second (size_ multiplications and
  // additions for each value in the resulting tensor)
  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
  device_.deallocate(a_);
  device_.deallocate(b_);
  device_.deallocate(c_);
  device_.synchronize();
}

template <typename T, typename Device, typename TensorIndex>
void contractionABT(const Device& device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_) {
  T* a_;
  T* b_;
  T* c_;
  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.memset(a_, 12, m_ * k_ * sizeof(T));
  device_.memset(b_, 23, k_ * n_ * sizeof(T));
  device_.memset(c_, 31, m_ * n_ * sizeof(T));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = k_;
  sizeA[1] = m_;
  Eigen::array<TensorIndex, 2> sizeB;
  sizeB[0] = n_;
  sizeB[1] = k_;
  Eigen::array<TensorIndex, 2> sizeC;
  sizeC[0] = m_;
  sizeC[1] = n_;

  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> A(a_, sizeA);
  const TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> B(b_, sizeB);
  TensorMap<Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> C(c_, sizeC);

  typedef typename Tensor<T, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  dims[0] = DimPair(0, 1);
#ifdef EIGEN_USE_SYCL  // warmup for sycl
  for (int iter = 0; iter < 10; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
#endif
  auto start = get_time();
  for (int iter = 0; iter < num_iters; ++iter) {
    C.device(device_) = A.contract(B, dims);
  }
  auto end = get_time();
  // Record the number of FLOPs executed per second (size_ multiplications and
  // additions for each value in the resulting tensor)
  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
  device_.deallocate(a_);
  device_.deallocate(b_);
  device_.deallocate(c_);
  device_.synchronize();
}

int main() {
  cl::sycl::gpu_selector selector;
  Eigen::QueueInterface queue(selector);
  Eigen::SyclDevice device(&queue);
  int64_t num_iters = 20;
  for (int64_t m = 32; m <= 4096; m *= 2)
    for (int64_t k = 32; k <= 4096; k *= 2)
      for (int64_t n = 32; n <= 4096; n *= 2) {
        (contraction<float>(device, num_iters, m, k, n));
        (contractionRowMajor<float>(device, num_iters, m, k, n));
        (contractionAT<float>(device, num_iters, m, k, n));
        (contractionBT<float>(device, num_iters, m, k, n));
        (contractionABT<float>(device, num_iters, m, k, n));
      }
  return 0;
}

#endif  // EIGEN_BENCH_CONTRACT_SYCL