Remove XSMM support from Tensor module

Eugene Zhulenev 2019-08-19 11:44:25 -07:00
parent d55d392e7b
commit 071311821e
7 changed files with 0 additions and 664 deletions

cmake/FindXsmm.cmake

@@ -1,25 +0,0 @@
# libxsmm support.
# libxsmm provides matrix multiplication kernels optimized for
# the latest Intel architectures.
# Download the library from https://github.com/hfp/libxsmm
# Compile with make BLAS=0
if (LIBXSMM)
set(XSMM_FIND_QUIETLY TRUE)
set(XSMM_INCLUDES ${LIBXSMM}/include)
set(XSMM_LIBRARIES ${LIBXSMM}/lib)
endif (LIBXSMM)
find_path(LIBXSMM
NAMES
libxsmm.h
PATHS
$ENV{XSMMDIR}/include
${INCLUDE_INSTALL_DIR}
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(XSMM DEFAULT_MSG
LIBXSMM)
mark_as_advanced(LIBXSMM)

unsupported/Eigen/CXX11/Tensor

@@ -73,10 +73,6 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
#if defined(EIGEN_USE_LIBXSMM)
#include "libxsmm.h"
#endif
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif

unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h

@@ -20,70 +20,6 @@ namespace Eigen {
*
*/
namespace internal {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
template<typename Scalar, typename Index>
void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
size_t psize = packet_traits<Scalar>::size; // Packet size
typedef typename packet_traits<Scalar>::type Packet; // Packet type
size_t alignment = psize*sizeof(Scalar); // Needed alignment
if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
(ldsrc*sizeof(Scalar)) % alignment == 0 &&
reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
// Optimized version using packets
size_t num_packets = rows / psize;
for (Index col = 0; col < cols; ++col) {
EIGEN_ASM_COMMENT("begin pack_simple inner copy");
// Unrolled manually 4 times.
for (size_t i=0; i < num_packets/4; ++i) {
internal::pstore(dst, internal::pload<Packet>(src));
dst += psize; src += psize;
internal::pstore(dst, internal::pload<Packet>(src));
dst += psize; src += psize;
internal::pstore(dst, internal::pload<Packet>(src));
dst += psize; src += psize;
internal::pstore(dst, internal::pload<Packet>(src));
dst += psize; src += psize;
}
for (size_t i=0; i < num_packets%4; ++i) {
internal::pstore(dst, internal::pload<Packet>(src));
dst += psize; src += psize;
}
dst += lddst - num_packets*psize;
src += ldsrc - num_packets*psize;
EIGEN_ASM_COMMENT("end pack_simple inner copy");
}
} else {
// Naive memcpy calls
for (Index col = 0; col < cols; ++col) {
memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
}
}
}
template<typename LhsScalar, typename RhsScalar, typename Scalar>
struct libxsmm_wrapper {
libxsmm_wrapper() {}
libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
};
template<>
struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
libxsmm_wrapper(): libxsmm_mmfunction() {}
libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
};
template<>
struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
libxsmm_wrapper(): libxsmm_mmfunction() {}
libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
};
#endif
template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
@@ -640,8 +576,6 @@ struct TensorContractionEvaluatorBase
}
}
EnableXSMMIfPossible(eval_op_indices);
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -780,13 +714,6 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC
#endif
void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) {
evalGemmXSMM(buffer);
return;
}
#endif
// columns in left side, rows in right side
const Index k = this->m_k_size;
@@ -942,213 +869,6 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
m_can_use_xsmm = false;
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
if (!std::is_same<Scalar, LhsScalar>::value ||
!std::is_same<Scalar, RhsScalar>::value ||
!(std::is_same<Scalar, float>::value ||
std::is_same<Scalar, double>::value) ||
m_leftImpl.data() == NULL ||
m_rightImpl.data() == NULL) {
return;
}
// Check if we can use faster matmul algorithms. For contraction to be
// equivalent to matmul, we need both lhs and rhs contracting dims sequences
// to be either a prefix or suffix of all dims. Also, the order of both
// must be the same, so we don't have to do reordering.
// For example:
// * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
// * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
// * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
// * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
// Depending if contraction dims are prefix or suffix of all dims we need to
// pre-transpose matrices in matmul algorithm:
// lhs: prefix -> transpose, suffix -> no transpose
// rhs: prefix -> no transpose, suffix -> transpose
// For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
// non-transposed matmul.
if (ContractDims == 0) {
// This case is totally uninteresting, filter it out to avoid problems
// with iterations in further tests.
return;
}
// Check if RHS dims list is increasing. LHS already is, so if not, the
// order is different and we cannot do matmul.
for (int i = 1; i < ContractDims; i++) {
if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
return;
}
}
// Check if no holes.
int diff;
for (int i = 1; i < ContractDims; i++) {
// LHS contract dims are sorted to form an increasing seq.
diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
if (diff != 1) {
return;
}
// Now we may already assume RHS contract dims seq is increasing too.
diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
if (diff != 1) {
return;
}
}
// Check if suffix or prefix.
if (eval_op_indices[0].first != 0 &&
eval_op_indices[ContractDims-1].first != LDims-1) {
return;
}
if (eval_op_indices[0].second != 0 &&
eval_op_indices[ContractDims-1].second != RDims-1) {
return;
}
m_can_use_xsmm = true;
#else
EIGEN_UNUSED_VARIABLE(eval_op_indices);
#endif
}
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
// rows in left side
const Index m = this->m_i_size;
// columns in right side
const Index n = this->m_j_size;
const bool transposeA = !m_lhs_inner_dim_contiguous;
const bool transposeB = !m_rhs_inner_dim_contiguous;
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
k, m, n, 1, transposeA, transposeB);
// Outer blocks sizes
const Index mc_outer = blocking.outer_m();
const Index nc_outer = blocking.outer_n();
const Index kc_outer = blocking.outer_k();
// Inner blocks sizes
const Index mc = blocking.mc();
const Index nc = blocking.nc();
const Index kc = blocking.kc();
// Decisions whether we should copy parts of matrices
const bool copyA = blocking.copyA();
const bool copyB = blocking.copyB();
const LhsScalar* leftData = m_leftImpl.data();
const RhsScalar* rightData = m_rightImpl.data();
const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
// Use bigger stride to avoid hitting same cache line too often.
// This consistently gives +~0.5 Gflops.
const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
kc % 32 == 0 ? kc + 16 : kc
);
// Kernel for the general case (not edges)
internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
LhsScalar* blockA = NULL;
RhsScalar* panelB = NULL;
if (copyA) {
blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
}
if (copyB) {
panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
}
const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
// Outer blocking
for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
using numext::mini;
Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
// Inner blocking
for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
const float beta = ki == 0 ? 0 : 1;
if (copyB) {
if (transposeB) {
libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
} else {
internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
}
}
for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
leftData + ki*stride_A + mi;
if (copyA) {
if (transposeA) {
libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
} else {
internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
}
}
const LhsScalar* actual_a = copyA ? blockA : a;
for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
const RhsScalar* b = rightData + ni*stride_B + ki;
Scalar* c = buffer + ni*stride_C + mi;
const Scalar* cp = c + nc*stride_C;
const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
// Most used, cached kernel.
kernel(actual_a, actual_b, c, actual_a, bp, cp);
} else {
// Edges - use libxsmm kernel cache.
internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
}
}
}
}
}
}
}
if (copyA) {
this->m_device.deallocate(blockA);
}
if (copyB) {
this->m_device.deallocate(panelB);
}
}
#endif
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
@@ -1177,7 +897,6 @@ protected:
const Device EIGEN_DEVICE_REF m_device;
OutputKernelType m_output_kernel;
EvaluatorPointerType m_result;
bool m_can_use_xsmm;
};
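
For reference, the eligibility test that the removed EnableXSMMIfPossible performed can be stated compactly: the contracting dimension pairs must form an increasing, hole-free run that sits at the front or the back of each operand's dimensions. The sketch below restates that rule outside of Eigen; the function name and the std::pair representation are illustrative only, not Eigen API.

#include <cstddef>
#include <utility>
#include <vector>

// Standalone restatement of the removed check: a contraction maps to a plain
// GEMM only if the contracted dims of lhs and rhs are contiguous, appear in
// the same order, and form a prefix or suffix of each operand's dims.
bool contraction_maps_to_gemm(const std::vector<std::pair<int, int> >& pairs,
                              int lhs_rank, int rhs_rank) {
  if (pairs.empty()) return false;  // 0-dim contraction: not interesting.
  for (std::size_t i = 1; i < pairs.size(); ++i) {
    // Both index sequences must increase by exactly 1 (same order, no holes).
    if (pairs[i].first - pairs[i - 1].first != 1) return false;
    if (pairs[i].second - pairs[i - 1].second != 1) return false;
  }
  const int n = static_cast<int>(pairs.size());
  // Prefix or suffix of the lhs dims, and prefix or suffix of the rhs dims.
  const bool lhs_ok = pairs[0].first == 0 || pairs[n - 1].first == lhs_rank - 1;
  const bool rhs_ok = pairs[0].second == 0 || pairs[n - 1].second == rhs_rank - 1;
  return lhs_ok && rhs_ok;
}

With the examples from the removed comments, contraction_maps_to_gemm({{0, 2}, {1, 3}}, 4, 4) is true, while {{1, 1}} on rank-3 operands and {{0, 0}, {2, 2}} both fail (not a prefix/suffix, and a hole in the run, respectively).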

unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h

@@ -67,141 +67,6 @@ class TensorContractionBlocking {
StorageIndex nc_;
};
#if defined(EIGEN_USE_LIBXSMM)
template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
class TensorXsmmContractionBlocking {
public:
TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
size_t max_num_threads = 1, bool transposeA = false,
bool transposeB = false):
k_(k), m_(m), n_(n), transposeA_(transposeA),
transposeB_(transposeB), num_threads_(max_num_threads) {
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
outer_m_ = outer_m_ != 0 ? outer_m_ : m;
outer_k_ = outer_k_ != 0 ? outer_k_ : k;
outer_n_ = outer_n_ != 0 ? outer_n_ : n;
}
#else
// Defaults, possibly overridden per-platform.
copyA_ = true;
copyB_ = false;
// If the matrix is small enough, don't do blocking, just call single xsmm
// kernel.
if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
mc_ = m; kc_ = k; nc_ = n;
outer_m_ = m; outer_k_ = k; outer_n_ = n;
copyA_ = false; copyB_ = false;
} else {
int arch = libxsmm_cpuid_x86();
if (arch == LIBXSMM_X86_AVX512_CORE) {
// skylake
mc_ = 64; kc_ = 64; nc_ = 24;
outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
// Hack to use this kernel architecture as the other one has performance
// issues (no hardware prefetching).
// TODO(nishantpatil): This should be removed if the issues are fixed,
// or this one becomes the default.
setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
} else if (arch == LIBXSMM_X86_AVX2) {
// haswell
mc_ = 32; kc_ = 192; nc_ = 33;
outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
} else if (arch == LIBXSMM_X86_AVX) {
// ivybridge
mc_ = 32; kc_ = 192; nc_ = 48;
outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
} else {
// generic kernel size, usually performing well
mc_ = 32; kc_ = 128; nc_ = 32;
outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
}
// Only copy if it makes the stride smaller.
copyA_ = copyA_ && (m > mc_);
copyB_ = copyB_ && (k > kc_);
}
// We need to copy anyway if transposing
copyA_ = copyA_ || transposeA;
copyB_ = copyB_ || transposeB;
// See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
#endif
mc_ = mc_ > m ? m : mc_;
nc_ = nc_ > n ? n : nc_;
kc_ = kc_ > k ? k : kc_;
size_t compute_parallelism = (m / mc_) * (n / nc_);
size_t pack_parallelism = 0;
if (copyA_) {
pack_parallelism += (m / mc_) * (k / kc_);
}
if (copyB_) {
pack_parallelism += (n / nc_) * (k / kc_);
}
size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
num_threads_ = numext::mini<size_t>(num_threads_,
parallelism / MIN_JOBS_PER_THREAD);
num_threads_ = numext::maxi<size_t>(num_threads_, 1);
// For optimal performance, outer block sizes should be multiples of kernel
// sizes, or bigger than the matrix size (= no outer blocking).
eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
}
EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
return prefetch_;
}
private:
StorageIndex k_, m_, n_;
StorageIndex kc_, mc_, nc_;
StorageIndex outer_k_, outer_m_, outer_n_;
bool copyA_, copyB_, transposeA_, transposeB_;
size_t num_threads_;
// Threshold for m*k*n to skip blocking and just call libxsmm
const double LIBXSMM_THRESHOLD = 80*80*80;
// For computing the optimal number of threads: each thread should get at
// least this many jobs.
const double MIN_JOBS_PER_THREAD = 3;
libxsmm_gemm_prefetch_type prefetch_;
};
#endif // EIGEN_USE_LIBXSMM
} // end namespace internal
} // end namespace Eigen
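
The thread-count heuristic at the end of the removed constructor is easy to lift out on its own: count the independent compute and packing jobs, then cap the thread count so that each thread gets at least MIN_JOBS_PER_THREAD of them. The sketch below is a plain-C++ restatement (hypothetical helper name, std::min/std::max instead of Eigen's numext helpers), not the removed code itself.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Job-counting arithmetic from the removed TensorXsmmContractionBlocking:
// compute jobs shard over (m, n) blocks, packing jobs over (m, k) and (n, k).
std::size_t xsmm_num_threads(std::size_t m, std::size_t n, std::size_t k,
                             std::size_t mc, std::size_t nc, std::size_t kc,
                             bool copyA, bool copyB, std::size_t max_threads) {
  const double min_jobs_per_thread = 3;  // MIN_JOBS_PER_THREAD in the removed code
  std::size_t compute_jobs = (m / mc) * (n / nc);
  std::size_t pack_jobs = 0;
  if (copyA) pack_jobs += (m / mc) * (k / kc);
  if (copyB) pack_jobs += (n / nc) * (k / kc);
  const std::size_t parallelism = std::max(compute_jobs, pack_jobs);
  std::size_t threads = std::min<std::size_t>(
      max_threads, static_cast<std::size_t>(parallelism / min_jobs_per_thread));
  return std::max<std::size_t>(threads, 1);
}

int main() {
  // Haswell-style block sizes from the removed defaults: mc=32, kc=192, nc=33.
  // A 512 x 512 x 512 contraction with copyA=true, copyB=false and 16 threads
  // requested yields 240 compute jobs, so all 16 threads are kept.
  std::printf("%zu\n", xsmm_num_threads(512, 512, 512, 32, 192, 33, true, false, 16));
  return 0;
}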

unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h

@@ -78,23 +78,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index k = this->m_k_size;
if (m == 0 || n == 0 || k == 0) return;
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (this->m_can_use_xsmm) {
bool transposeA = !this->m_lhs_inner_dim_contiguous;
bool transposeB = !this->m_rhs_inner_dim_contiguous;
internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
blocking(k, m, n, this->m_device.numThreads(), transposeA,
transposeB);
if (blocking.num_threads() == 1) {
this->evalGemmXSMM(buffer);
} else {
ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
}
return;
}
#endif
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
// - task grain sizes (number of kernels executed per task: gm, gn)
@@ -1227,190 +1210,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return computeBandwidth;
}
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
// TODO(ezhulenev): Add support for output kernels and LIBXSMM.
static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
"XSMM does not support contraction output kernels.");
template<int Alignment>
class ContextXsmm {
public:
ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
const internal::TensorXsmmContractionBlocking<LhsScalar,
RhsScalar, Index>& blocking):
device(self->m_device),
m(m), k(k), n(n),
stride_a(blocking.transposeA() ? k : m),
stride_b(blocking.transposeB() ? n : k),
stride_c(m),
bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
blocks_n(blocking.blocks_n()),
copyA(blocking.copyA()), copyB(blocking.copyB()),
transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
num_threads(blocking.num_threads()),
buffer(buffer),
leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
workers_done(blocking.num_threads()),
packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
void worker() {
// Pack
if (copyA) {
while (true) {
uint32_t mk = packingA_jobs++;
Index mi = mk / blocks_k;
Index ki = mk % blocks_k;
if (mi >= blocks_m) break;
LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
if (transposeA) {
const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
actual_bm(mi), stride_a, bm);
} else {
const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
internal::pack_simple<LhsScalar, Index>(blockA, current_a,
actual_bk(ki), actual_bm(mi), bm, stride_a);
}
packingA_done.at(mi)++;
}
}
if (copyB) {
while (true) {
uint32_t nk = packingB_jobs++;
Index ni = nk / blocks_k;
Index ki = nk % blocks_k;
if (ni >= blocks_n) break;
RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
if (transposeB) {
const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
(ni*bn);
libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
actual_bk(ki), stride_b, bk);
} else {
const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
(ki*bk);
internal::pack_simple<RhsScalar, Index>(blockB, current_b,
actual_bn(ni), actual_bk(ki), bk, stride_b);
}
packingB_done.at(ni)++;
}
}
// Compute
while (true) {
uint32_t mn = compute_jobs++;
Index mi = mn / blocks_n;
Index ni = mn % blocks_n;
if (mi >= blocks_m) break;
// Wait for mi, ni packings to be done. This is more fine-grained than
// waiting for all workers to finish packing.
while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
(copyB && (packingB_done.at(ni) < blocks_k)))
{}
for (Index ki=0; ki < blocks_k; ++ki) {
const LhsScalar * current_a = copyA ?
blocksA + (bk*bm) * (mi*blocks_k+ki) :
leftData + (bk*ki)*stride_a + (bm*mi);
const RhsScalar * current_b = copyB ?
blocksB + (bk*bn) * (ni*blocks_k+ki) :
rightData + (ni*bn)*stride_b + (bk*ki);
Index current_stride_a = copyA ? bm : stride_a;
Index current_stride_b = copyB ? bk : stride_b;
// Memory may not be zeroed, overwrite instead of adding in first
// iteration.
float beta = ki == 0 ? 0 : 1;
Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
current_stride_a, current_stride_b, stride_c, 1, beta, 0)
(current_a, current_b, current_c);
}
}
workers_done.Notify();
}
void run() {
// Parallelization strategy.
//
// First pack A into blocks (sharding by m, k) and B (sharding by n,k),
// then shard by m, n.
//
// Do not use advanced ThreadPool queuing, just run a single long-standing
// function in each thread.
if (copyA) {
blocksA = static_cast<LhsScalar*>(device.allocate(
(blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
}
if (copyB) {
blocksB = static_cast<RhsScalar*>(device.allocate(
(blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
}
for (Index i = 0; i < num_threads; ++i) {
device.enqueueNoNotification([=]() { worker(); });
}
workers_done.Wait();
if (copyA) {
device.deallocate(blocksA);
}
if (copyB) {
device.deallocate(blocksB);
}
}
private:
// real block size for block index in [0, ..., blocks - 1].
Index actual_bm(Index mi) const {
return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
}
Index actual_bk(Index ki) const {
return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
}
Index actual_bn(Index ni) const {
return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
}
const Device& device;
Index m, k, n;
Index stride_a, stride_b, stride_c;
Index bm, bk, bn; // Block sizes.
Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension.
bool copyA, copyB, transposeA, transposeB;
Index num_threads;
Scalar *buffer;
const LhsScalar *leftData;
const RhsScalar *rightData;
LhsScalar *blocksA;
RhsScalar *blocksB;
// barrier for joining all threads after all done.
Barrier workers_done;
// "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
std::atomic<uint32_t> packingA_jobs;
std::atomic<uint32_t> packingB_jobs;
std::atomic<uint32_t> compute_jobs;
// already packed blocks for each mi-panel in A and ni-panel in B.
std::vector<std::atomic<uint8_t>> packingA_done;
std::vector<std::atomic<uint8_t>> packingB_done;
};
#endif
};
} // end namespace Eigen
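
The scheduling in the removed ContextXsmm is worth noting: instead of queueing tasks through the thread pool, each worker claims flattened (mi, ki), (ni, ki) and (mi, ni) jobs from shared atomic counters and decodes them with / and %. Below is a minimal, self-contained sketch of that claim-and-decode pattern using std::thread; the grid sizes and the job body are placeholders, not Eigen code.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const unsigned blocks_m = 4, blocks_k = 3;   // illustrative grid
  std::atomic<unsigned> next_job(0);           // plays the role of packingA_jobs
  std::atomic<unsigned> processed(0);

  auto worker = [&]() {
    while (true) {
      const unsigned mk = next_job++;          // atomically claim a flattened job
      const unsigned mi = mk / blocks_k;       // decode block row ...
      const unsigned ki = mk % blocks_k;       // ... and block column
      if (mi >= blocks_m) break;               // grid exhausted, stop this worker
      // ... pack or multiply block (mi, ki) here ...
      (void)ki;
      ++processed;
    }
  };

  std::vector<std::thread> pool;
  for (int i = 0; i < 4; ++i) pool.emplace_back(worker);
  for (std::thread& t : pool) t.join();
  std::printf("processed %u of %u jobs\n", processed.load(), blocks_m * blocks_k);
  return 0;
}

The removed worker() used the same encoding for packingA_jobs, packingB_jobs and compute_jobs; the per-panel packingA_done / packingB_done counters then let a compute job spin only until its own row and column panels are packed, rather than waiting for all packing to finish.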

unsupported/test/CMakeLists.txt

@@ -12,17 +12,6 @@ include_directories(../../test ../../unsupported ../../Eigen
find_package (Threads)
find_package(Xsmm)
if(XSMM_FOUND)
add_definitions("-DEIGEN_USE_LIBXSMM")
include_directories(${XSMM_INCLUDES})
link_directories(${XSMM_LIBRARIES})
set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm)
ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ")
else(XSMM_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
endif(XSMM_FOUND)
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")

unsupported/test/cxx11_tensor_contraction.cpp

@@ -511,8 +511,6 @@ static void test_const_inputs()
VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
}
#if !defined(EIGEN_USE_LIBXSMM)
// Apply Sqrt to all output elements.
struct SqrtOutputKernel {
template <typename Index, typename Scalar>
@@ -562,9 +560,6 @@ static void test_large_contraction_with_output_kernel() {
}
}
#endif // !defined(EIGEN_USE_LIBXSMM)
EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
{
CALL_SUBTEST(test_evals<ColMajor>());
@@ -597,8 +592,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
CALL_SUBTEST(test_tensor_product<RowMajor>());
CALL_SUBTEST(test_const_inputs<ColMajor>());
CALL_SUBTEST(test_const_inputs<RowMajor>());
#if !defined(EIGEN_USE_LIBXSMM)
CALL_SUBTEST(test_large_contraction_with_output_kernel<ColMajor>());
CALL_SUBTEST(test_large_contraction_with_output_kernel<RowMajor>());
#endif
}