Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-12 03:39:01 +08:00)
Adding SYCL backend for TensorCustomOp; fixing the partial LHS modification issue on SYCL when the RHS is a TensorContraction, reduction, or convolution; fixing the partial modification for memset when the SYCL backend is used.
This commit is contained in:
parent
e0bd6f5738
commit
8296b87d7b
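The common thread of the fixes: the SYCL device keys its buffers by allocation base pointer, so an output pointer that refers to the middle of an allocation (a partially assigned lhs, or a memset on a sub-range) was previously resolved to the start of the buffer and kernels overwrote data from offset 0. The commit threads the byte offset of the output pointer (SyclDevice::get_offset) into every kernel and converts it to an element offset with ConvertToActualSyclOffset(Scalar, offset). A minimal sketch of that arithmetic is shown below; it is not part of the diff and the names are hypothetical.

// Illustrative sketch only (not part of the diff); names are hypothetical.
// It mirrors what SyclDevice::get_offset and ConvertToActualSyclOffset do:
// the byte distance between a sub-tensor pointer and the start of its backing
// buffer becomes an element offset that every kernel adds to its index.
#include <cassert>
#include <cstddef>
#include <cstdint>

std::ptrdiff_t byte_offset(const void* ptr, const void* buffer_start) {
  // get_offset(): pointer into the buffer minus the buffer's base address
  return static_cast<const std::uint8_t*>(ptr) - static_cast<const std::uint8_t*>(buffer_start);
}

template <typename Scalar>
std::ptrdiff_t element_offset(std::ptrdiff_t bytes) {
  // ConvertToActualSyclOffset(Scalar, offset): offset / sizeof(Scalar)
  return bytes / static_cast<std::ptrdiff_t>(sizeof(Scalar));
}

int main() {
  float storage[16] = {};            // stands in for the SYCL buffer
  float* sub_tensor = storage + 5;   // lhs that starts mid-buffer
  std::ptrdiff_t off = element_offset<float>(byte_offset(sub_tensor, storage));
  assert(off == 5);                  // kernels write out[gid + off] instead of out[gid]
  return 0;
}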
@@ -84,7 +84,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
 this->m_leftImpl.evalSubExprsIfNeeded(NULL);
 this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-if (data) {
+if (data) {
 evalTo(data);
 return false;
 } else {
@@ -173,6 +173,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 LhsLocalAcc localLhs;
 RhsLocalAcc localRhs;
 OutAccessor out_res;
+size_t out_offset;
 Index roundUpK, M, N, K;
 ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
 LeftNocontractT m_i_strides, m_left_nocontract_strides;
@@ -182,11 +183,12 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 Device dev;


-KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
+KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, size_t out_offset_,
 Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
 ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
 LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
-:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
+:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_),
+out_offset(out_offset_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
 m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
 m_right_contracting_strides(m_right_contracting_strides_),
 m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
@@ -316,7 +318,7 @@ typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadS
 for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
 Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
 if(globalCol<N)
-out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
+out_ptr[globalCol*M + globalRow +ConvertToActualSyclOffset(OutScalar, out_offset)] = privateRes[wLPTM][wLPTN];
 }
 }
 }
@@ -356,12 +358,12 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
 // extract lhs functor list
 LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
 // extract rhs functor list
-RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
+RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());

 Index roundUpK = RoundUp(K, TileSizeDimK);
 Index roundUpM = RoundUp(M, TileSizeDimM);
 Index roundUpN = RoundUp(N, TileSizeDimN);

+ptrdiff_t out_offset = self.device().get_offset(buffer);
 self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
 /// work-around for gcc bug
 typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
@@ -379,17 +381,16 @@ template< typename Self, typename OutScalar, typename ContractT, typename LeftNo
 typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
 RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);

-typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
+typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer> OutAccessor;
 //OutScalar memory
-OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
+OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::read_write>(cgh, buffer);
 // sycl parallel for
 cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
 cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
 KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
 RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
 WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
-localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
+localLhs, localRhs, out_res, out_offset, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
 m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
 });
 self.device().asynchronousExec();
@@ -32,14 +32,15 @@ internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize, range_x, range_y;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_),local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<2> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -75,7 +76,7 @@ EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }
 }
 };
@@ -89,14 +90,15 @@ internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<3> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -141,7 +143,7 @@ EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index +ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }
 }
 };
@@ -156,16 +158,17 @@ internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::La
 Kernel_accessor kernel_filter;
 const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
 Buffer_accessor buffer_acc;
+ptrdiff_t out_offset;
 Local_accessor local_acc;
 FunctorExpr functors;
 TupleType tuple_of_accessors;
 EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
 Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
 const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
-Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
+Buffer_accessor buffer_acc_, ptrdiff_t out_offset_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
 :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
 kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
-buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
+buffer_acc(buffer_acc_), out_offset(out_offset_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}

 void operator()(cl::sycl::nd_item<3> itemID) {
 typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
@@ -215,7 +218,7 @@ EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::inter
 }
 const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
 +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
-buffer_ptr[tensor_index] = result;
+buffer_ptr[tensor_index+ConvertToActualSyclOffset(CoeffReturnType, out_offset)] = result;
 }

 itemID.barrier(cl::sycl::access::fence_space::local_space);
@@ -307,7 +310,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 m_kernel = in_place;
 m_local_kernel = false;
 } else {
-size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
 typedef TensorEvalToOp<const KernelArgType> EvalTo;
 EvalTo evalToTmp(local, m_kernelArg);
@@ -325,6 +328,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
 // extract input functor list
 InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
+ptrdiff_t out_offset = m_device.get_offset(data);


 m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
@@ -358,7 +362,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
 EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+indexMapper,kernel_acc, kernel_size, numX, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -383,7 +387,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
 EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
-indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -412,7 +416,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
 InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
 indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
-numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
+numZ, numP, out_res, out_offset, local_acc, input_functors, tuple_of_accessors));
 break;
 }

@@ -140,6 +140,9 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi

 EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

+/// used by sycl in order to build the sycl buffer
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
 protected:
 EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
 TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
@@ -295,6 +298,9 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,

 EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }

+/// used by sycl in order to build the sycl buffer
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+
 protected:
 EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
 TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
@@ -18,6 +18,8 @@
 namespace Eigen {

 #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
+#define ConvertToActualSyclOffset(Scalar, offset) offset/sizeof(Scalar)
+

 template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
 public:
@@ -43,11 +45,12 @@ namespace Eigen {
 struct memsetkernelFunctor{
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
 AccType m_acc;
+const ptrdiff_t buff_offset;
 const size_t m_rng, m_c;
-memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
+memsetkernelFunctor(AccType acc, const ptrdiff_t buff_offset_, const size_t rng, const size_t c):m_acc(acc), buff_offset(buff_offset_), m_rng(rng), m_c(c){}
 void operator()(cl::sycl::nd_item<1> itemID) {
 auto globalid=itemID.get_global_linear_id();
-if (globalid< m_rng) m_acc[globalid] = m_c;
+if (globalid< m_rng) m_acc[globalid + buff_offset] = m_c;
 }

 };
@@ -305,6 +308,11 @@ struct SyclDevice {
 synchronize();
 }

+EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+auto it = m_queue_stream->find_buffer(ptr);
+return (static_cast<const uint8_t*>(ptr))-it->first;
+
+}
 /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
 /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
 /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
@@ -343,20 +351,23 @@ struct SyclDevice {
 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
 size_t rng, GRange, tileSize;
 parallel_for_setup(n, tileSize, rng, GRange);
-sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
+auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(data));
+ptrdiff_t buff_offset= (static_cast<const uint8_t*>(data)) - it1->first;
+sycl_queue().submit(memsetCghFunctor(it1->second, buff_offset, rng, GRange, tileSize, c ));
 synchronize();
 }

 struct memsetCghFunctor{
 cl::sycl::buffer<uint8_t, 1>& m_buf;
+const ptrdiff_t& buff_offset;
 const size_t& rng , GRange, tileSize;
 const int &c;
-memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
-:m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
+memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const ptrdiff_t& buff_offset_, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
+:m_buf(buff), buff_offset(buff_offset_), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}

 void operator()(cl::sycl::handler &cgh) const {
 auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
-cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, buff_offset, rng, c));
 }
 };

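The memset hunks above show the motivating case: the destination pointer can lie inside a larger device allocation, and before this change the kernel wrote m_acc[globalid], clearing the start of the buffer rather than the requested sub-range. A hedged usage sketch follows; it assumes the SyclDevice API shown in the diff (allocate/memset/deallocate), and the sizes and the "second half" pointer are made up for illustration.

// Hedged usage sketch only; not part of the diff.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void zero_second_half(const Eigen::SyclDevice& dev) {
  const size_t n = 64;
  float* buf = static_cast<float*>(dev.allocate(n * sizeof(float)));
  // Pointer into the middle of the same allocation: the buffer lookup now
  // returns (buffer, byte offset) and the kernel adds buff_offset, so only
  // the second half is cleared instead of the first n/2 elements.
  float* second_half = buf + n / 2;
  dev.memset(second_half, 0, (n / 2) * sizeof(float));
  dev.deallocate(buf);
}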
@@ -27,9 +27,9 @@ namespace internal {

 template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
 template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(OP op, BufferTOut& bufOut, ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
 do {
-auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+auto f = [length, local, op, out_offset, &bufOut, &bufI](cl::sycl::handler& h) mutable {
 cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
 cl::sycl::range<1>{std::min(length, local)}};
 /* Two accessors are used: one to the buffer that is being reduced,
@@ -43,7 +43,7 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev

 /* The parallel_for invocation chosen is the variant with an nd_item
 * parameter, since the code requires barriers for correctness. */
-h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, out_offset, aI, scratch, length, local));
 };
 dev.sycl_queue().submit(f);
 dev.asynchronousExec();
@@ -60,9 +60,9 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev

 template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
 template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut,ptrdiff_t out_offset, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
 syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
-bufOut, bufI, dev, length, local);
+bufOut, out_offset, bufI, dev, length, local);
 }
 };

@@ -127,8 +127,9 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

 // getting final out buffer at the moment the created buffer is true because there is no need for assign
 auto out_buffer =dev.get_sycl_buffer(output);
+ptrdiff_t out_offset = dev.get_offset(output);
 /// This is used to recursively reduce the tmp value to an element of 1;
-syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, out_offset, temp_global_buffer,dev, GRange, outTileSize);
 }

 };
@@ -158,10 +159,11 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
 // create a tuple of accessors from Evaluator
 Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
 auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
+ptrdiff_t out_offset = dev.get_offset(output);
 Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
 cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
 TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
-(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+(output_accessor, out_offset, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));

 });
 dev.asynchronousExec();
@@ -109,6 +109,7 @@ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > {\
 typedef CVQual ExprNode< typename ConvertToDeviceExpression<Expr>::Type> Type;\
 };


 // TensorForcedEvalOp
 KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(const,TensorForcedEvalOp)
 KERNELBROKERCONVERTFORCEDEVALLAYOUTSWAP(,TensorForcedEvalOp)
@@ -236,8 +236,12 @@ EVALTO()
 template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
 CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
-typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
-TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+typedef TensorForcedEvalOp<OrigExpr> XprType;\
+typedef CVQual TensorMap<\
+Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+Eigen::internal::traits<XprType>::Layout, \
+MakeGlobalPointer\
+> Type;\
 Type expr;\
 template <typename FuncDetector>\
 ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
@@ -248,6 +252,28 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL



+#define TENSORCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr>,\
+CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, DevExpr>, N>, Params...> {\
+typedef TensorCustomUnaryOp<CustomUnaryFunc, OrigExpr> XprType;\
+typedef CVQual TensorMap<\
+Tensor<typename XprType::Scalar,XprType::NumDimensions, Eigen::internal::traits<XprType>::Layout,typename XprType::Index>,\
+Eigen::internal::traits<XprType>::Layout, \
+MakeGlobalPointer\
+> Type;\
+Type expr;\
+template <typename FuncDetector>\
+ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+: expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+};
+
+TENSORCUSTOMUNARYOP(const)
+TENSORCUSTOMUNARYOP()
+#undef TENSORCUSTOMUNARYOP

 template <bool Conds, size_t X , size_t Y > struct ValueCondition {
 static const size_t Res =X;
 };
@@ -260,7 +286,7 @@ template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
 template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
 struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
 CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
-static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+static const auto NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
 typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
 NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
 Type expr;\
@@ -275,28 +301,31 @@ SYCLREDUCTIONEXPR()


 /// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
+/// TensorContractionOp, TensorConvolutionOp TensorCustomBinaryOp
+#define SYCLCONTRACTCONVCUSBIOPS(CVQual, ExprNode)\
 template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
 struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
 CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
-static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
-typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
-NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
-typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
-Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
+typedef ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> XprTyp;\
+static const auto NumIndices= Eigen::internal::traits<XprTyp>::NumDimensions;\
+typedef CVQual TensorMap<\
+Tensor<typename XprTyp::Scalar,NumIndices, Eigen::internal::traits<XprTyp>::Layout, typename XprTyp::Index>,\
+Eigen::internal::traits<XprTyp>::Layout, \
+MakeGlobalPointer\
+> Type;\
 Type expr;\
 template <typename FuncDetector>\
 ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
 :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
 };

-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION

+SYCLCONTRACTCONVCUSBIOPS(const, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorContractionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorConvolutionOp)
+SYCLCONTRACTCONVCUSBIOPS(const, TensorCustomBinaryOp)
+SYCLCONTRACTCONVCUSBIOPS(, TensorCustomBinaryOp)
+#undef SYCLCONTRACTCONVCUSBIOPS


 #define SYCLSLICEOPEXPR(CVQual)\
@@ -148,6 +148,33 @@ SYCLFORCEDEVALEXTACC()
 #undef SYCLFORCEDEVALEXTACC


+#define SYCLCUSTOMUNARYOPEXTACC(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, typename Dev >\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev> > {\
+static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Dev>& eval)\
+RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+
+SYCLCUSTOMUNARYOPEXTACC(const)
+SYCLCUSTOMUNARYOPEXTACC()
+#undef SYCLCUSTOMUNARYOPEXTACC
+
+
+#define SYCLCUSTOMBINARYOPEXTACC(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType , typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev> > {\
+static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Dev>& eval)\
+RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+};
+
+SYCLCUSTOMBINARYOPEXTACC(const)
+SYCLCUSTOMBINARYOPEXTACC()
+#undef SYCLCUSTOMBIBARYOPEXTACC
+
+

 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
 #define SYCLEVALTOEXTACC(CVQual)\
 template <typename Expr, typename Dev>\
@@ -33,14 +33,17 @@ namespace internal {
 /// re-instantiate them on the device.
 /// We have to pass instantiated functors to the device.
 // This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
+#define DEFALTACTION(Evaluator)\
+typedef typename Evaluator::Dimensions Dimensions;\
+const Dimensions m_dimensions;\
+EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
+FunctorExtractor(const Evaluator& expr): m_dimensions(expr.dimensions()) {}
+
 template <typename Evaluator> struct FunctorExtractor{
-typedef typename Evaluator::Dimensions Dimensions;
-const Dimensions m_dimensions;
-EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-FunctorExtractor(const Evaluator& expr)
-: m_dimensions(expr.dimensions()) {}
+DEFALTACTION(Evaluator)
 };


 /// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
 ///TensorConversionOp
 #define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
@@ -112,6 +115,36 @@ SYCLEXTRFUNCTERNARY(const)
 SYCLEXTRFUNCTERNARY()
 #undef SYCLEXTRFUNCTERNARY



+//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different
+//from the UnaryCategory and it is similar to the general FunctorExtractor.
+/// specialisation of TensorCustomOp
+#define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\
+template <typename CustomUnaryFunc, typename ArgType, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> > {\
+typedef TensorEvaluator<CVQual TensorCustomUnaryOp<CustomUnaryFunc, ArgType>, Dev> Evaluator;\
+DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMUNARYOP(const)
+SYCLEXTRFUNCCUSTOMUNARYOP()
+#undef SYCLEXTRFUNCCUSTOMUNARYOP
+
+
+#define SYCLEXTRFUNCCUSTOMBIBARYOP(CVQual)\
+template <typename CustomBinaryFunc, typename ArgType1, typename ArgType2, typename Dev >\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> > {\
+typedef TensorEvaluator<CVQual TensorCustomBinaryOp<CustomBinaryFunc, ArgType1, ArgType2>, Dev> Evaluator;\
+DEFALTACTION(Evaluator)\
+};
+
+SYCLEXTRFUNCCUSTOMBIBARYOP(const)
+SYCLEXTRFUNCCUSTOMBIBARYOP()
+#undef SYCLEXTRFUNCCUSTOMBIBARYOP
+
+

 /// specialisation of the \ref FunctorExtractor struct when the node type is
 /// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
 #define SYCLEXTRFUNCSELECTOP(CVQual)\
@@ -21,11 +21,12 @@ namespace internal {
 template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
 OP op;
 OutputAccessor aOut;
+ptrdiff_t out_offset;
 InputAccessor aI;
 LocalAccessor scratch;
 size_t length, local;
-GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
-: op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
+GenericKernelReducer(OP op_, OutputAccessor aOut_, ptrdiff_t out_offset_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
+: op(op_), aOut(aOut_), out_offset(out_offset_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
 void operator()(cl::sycl::nd_item<1> itemID) {
 size_t globalid = itemID.get_global(0);
 size_t localid = itemID.get_local(0);
@@ -59,7 +60,7 @@ namespace internal {
 aI[itemID.get_group(0)] = scratch[localid];
 if((length<=local) && globalid ==0){
 auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
-aOutPtr[0]=scratch[0];
+aOutPtr[0 + ConvertToActualSyclOffset(CoeffReturnType, out_offset)]=scratch[0];
 }
 }
 }
@@ -72,8 +73,8 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
 public:
 typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
-ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
-:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
+ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
+:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
 void operator()(cl::sycl::nd_item<1> itemID) {

 typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -93,11 +94,12 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
 typename DeviceSelf::CoeffReturnType accum = functor.initialize();
 Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
 functor.finalize(accum);
-output_accessor_ptr[globalid]= accum;
+output_accessor_ptr[globalid + ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum;
 }
 }
 private:
 write_accessor output_accessor;
+ptrdiff_t out_offset;
 FunctorExpr functors;
 Tuple_of_Acc tuple_of_accessors;
 Dims dims;
@@ -111,9 +113,9 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
 typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
 typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
 typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
-ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
+ReductionFunctor(write_accessor output_accessor_, ptrdiff_t out_offset_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
 Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
-:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
+:output_accessor(output_accessor_), out_offset(out_offset_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
 void operator()(cl::sycl::nd_item<1> itemID) {

 typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
@@ -133,11 +135,12 @@ class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::interna
 typename DeviceSelf::CoeffReturnType accum = functor.initialize();
 Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
 functor.finalize(accum);
-output_accessor_ptr[globalid]= accum/num_values_to_reduce;
+output_accessor_ptr[globalid+ ConvertToActualSyclOffset(typename DeviceSelf::CoeffReturnType, out_offset)]= accum/num_values_to_reduce;
 }
 }
 private:
 write_accessor output_accessor;
+ptrdiff_t out_offset;
 FunctorExpr functors;
 Tuple_of_Acc tuple_of_accessors;
 Dims dims;
@@ -93,6 +93,26 @@ SYCLFORCEDEVALLEAFCOUNT(const)
 SYCLFORCEDEVALLEAFCOUNT()
 #undef SYCLFORCEDEVALLEAFCOUNT

+#define SYCLCUSTOMUNARYOPLEAFCOUNT(CVQual)\
+template <typename CustomUnaryFunc, typename XprType>\
+struct LeafCount<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {\
+static const size_t Count =1;\
+};
+
+SYCLCUSTOMUNARYOPLEAFCOUNT(const)
+SYCLCUSTOMUNARYOPLEAFCOUNT()
+#undef SYCLCUSTOMUNARYOPLEAFCOUNT
+
+
+#define SYCLCUSTOMBINARYOPLEAFCOUNT(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>\
+struct LeafCount<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {\
+static const size_t Count =1;\
+};
+SYCLCUSTOMBINARYOPLEAFCOUNT( const)
+SYCLCUSTOMBINARYOPLEAFCOUNT()
+#undef SYCLCUSTOMBINARYOPLEAFCOUNT
+
 /// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
 #define EVALTOLAYOUTSWAPLEAFCOUNT(CVQual , ExprNode, Num)\
 template <typename Expr>\
@@ -143,6 +143,33 @@ FORCEDEVAL(const)
 FORCEDEVAL()
 #undef FORCEDEVAL


+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMUNARYOPEVAL(CVQual)\
+template <typename CustomUnaryFunc, typename XprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> {\
+typedef CVQual PlaceHolder<CVQual TensorCustomUnaryOp<CustomUnaryFunc, XprType>, N> Type;\
+};
+
+CUSTOMUNARYOPEVAL(const)
+CUSTOMUNARYOPEVAL()
+#undef CUSTOMUNARYOPEVAL
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define CUSTOMBINARYOPEVAL(CVQual)\
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, size_t N>\
+struct PlaceHolderExpression<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> {\
+typedef CVQual PlaceHolder<CVQual TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, N> Type;\
+};
+
+CUSTOMBINARYOPEVAL(const)
+CUSTOMBINARYOPEVAL()
+#undef CUSTOMBINARYOPEVAL
+
+
 /// specialisation of the \ref PlaceHolderExpression when the node is
 /// TensorEvalToOp, TensorLayoutSwapOp
 #define EVALTOLAYOUTSWAP(CVQual, ExprNode)\
@@ -173,6 +173,7 @@ if(EIGEN_TEST_CXX11)
 ei_add_test_sycl(cxx11_tensor_patch_sycl "-std=c++11")
 ei_add_test_sycl(cxx11_tensor_image_patchOP_sycl "-std=c++11")
 ei_add_test_sycl(cxx11_tensor_volume_patchOP_sycl "-std=c++11")
+ei_add_test_sycl(cxx11_tensor_custom_op_sycl "-std=c++11")
 endif(EIGEN_TEST_SYCL)
 # It should be safe to always run these tests as there is some fallback code for
 # older compiler that don't support cxx11.
unsupported/test/cxx11_tensor_custom_op_sycl.cpp (new file, 165 lines)
@@ -0,0 +1,165 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_custom_op_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
template<typename TensorType>
struct InsertZeros {
  DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
    DSizes<DenseIndex, 2> result;
    result[0] = input.dimension(0) * 2;
    result[1] = input.dimension(1) * 2;
    return result;
  }

  template <typename Output, typename Device>
  void eval(const TensorType& input, Output& output, const Device& device) const
  {
    array<DenseIndex, 2> strides;
    strides[0] = 2;
    strides[1] = 2;
    output.stride(strides).device(device) = input;

    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
  }
};

template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
  IndexType sizeDim1 = 3;
  IndexType sizeDim2 = 5;
  Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
  Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};

  Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
  Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);

  DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));

  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
  TensorType gpu_in1(gpu_in1_data, tensorRange);
  TensorType gpu_out(gpu_out_data, tensorResultRange);

  in1.setRandom();
  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
  gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));

  VERIFY_IS_EQUAL(out.dimension(0), 6);
  VERIFY_IS_EQUAL(out.dimension(1), 10);

  for (int i = 0; i < 6; i+=2) {
    for (int j = 0; j < 10; j+=2) {
      VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
    }
  }
  for (int i = 1; i < 6; i+=2) {
    for (int j = 1; j < 10; j+=2) {
      VERIFY_IS_EQUAL(out(i, j), 0);
    }
  }
}

template<typename TensorType>
struct BatchMatMul {
  DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
    DSizes<DenseIndex, 3> result;
    result[0] = input1.dimension(0);
    result[1] = input2.dimension(1);
    result[2] = input2.dimension(2);
    return result;
  }

  template <typename Output, typename Device>
  void eval(const TensorType& input1, const TensorType& input2,
            Output& output, const Device& device) const
  {
    typedef typename TensorType::DimensionPair DimPair;
    array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);
    for (int64_t i = 0; i < output.dimension(2); ++i) {
      output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
    }
  }
};

template<typename DataType, int DataLayout, typename IndexType>
static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
{
  Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
  Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
  Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}};

  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);

  DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
  DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));

  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
  TensorType gpu_in1(gpu_in1_data, tensorRange1);
  TensorType gpu_in2(gpu_in2_data, tensorRange2);
  TensorType gpu_out(gpu_out_data, tensorResultRange);

  in1.setRandom();
  in2.setRandom();

  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));

  gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));

  for (IndexType i = 0; i < 5; ++i) {
    typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
    array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);
    Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
    TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
    for (IndexType j = 0; j < 2; ++j) {
      for (IndexType k = 0; k < 7; ++k) {
        VERIFY_IS_APPROX(val(j, k), reference(j, k));
      }
    }
  }
}

template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);

}
void test_cxx11_tensor_custom_op_sycl() {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(custom_op_perDevice<float>(device));
  }
}
@@ -44,7 +44,7 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
 Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
 sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
-sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
 /// c=(a+b)*b
 gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
 sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));