mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-26 06:44:27 +08:00
Adding mean to TensorReductionSycl.h
This commit is contained in:
parent
42bd5c4e7b
commit
0ee97b60c2
@ -25,8 +25,8 @@
|
|||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
template<typename CoeffReturnType> struct syclGenericBufferReducer{
|
template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
|
||||||
template<typename OP, typename BufferTOut, typename BufferTIn>
|
template<typename BufferTOut, typename BufferTIn>
|
||||||
static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||||
do {
|
do {
|
||||||
auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
|
auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
|
||||||
@ -54,13 +54,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
|
|||||||
length = length / local;
|
length = length / local;
|
||||||
|
|
||||||
} while (length > 1);
|
} while (length > 1);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
|
||||||
|
template<typename BufferTOut, typename BufferTIn>
|
||||||
|
static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
|
||||||
|
syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
|
||||||
|
bufOut, bufI, dev, length, local);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
|
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
|
||||||
/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
|
/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
|
||||||
/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
|
/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
|
||||||
@ -123,7 +128,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
|||||||
// getting final out buffer at the moment the created buffer is true because there is no need for assign
|
// getting final out buffer at the moment the created buffer is true because there is no need for assign
|
||||||
auto out_buffer =dev.get_sycl_buffer(output);
|
auto out_buffer =dev.get_sycl_buffer(output);
|
||||||
/// This is used to recursively reduce the tmp value to an element of 1;
|
/// This is used to recursively reduce the tmp value to an element of 1;
|
||||||
syclGenericBufferReducer<CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -135,7 +140,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
|||||||
typedef typename Self::CoeffReturnType CoeffReturnType;
|
typedef typename Self::CoeffReturnType CoeffReturnType;
|
||||||
static const bool HasOptimizedImplementation = false;
|
static const bool HasOptimizedImplementation = false;
|
||||||
|
|
||||||
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
|
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
|
||||||
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
|
||||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
|
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
|
||||||
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
|
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
|
||||||
@ -153,10 +158,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
|||||||
// create a tuple of accessors from Evaluator
|
// create a tuple of accessors from Evaluator
|
||||||
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||||
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
|
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
|
||||||
|
Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
|
||||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
|
||||||
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
|
TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
|
||||||
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
|
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
|
||||||
|
|
||||||
});
|
});
|
||||||
dev.asynchronousExec();
|
dev.asynchronousExec();
|
||||||
|
@ -72,7 +72,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
|
|||||||
public:
|
public:
|
||||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||||
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_)
|
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
|
||||||
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
|
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
|
||||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||||
|
|
||||||
@ -105,6 +105,46 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen
|
|||||||
Index range;
|
Index range;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
|
||||||
|
class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
|
||||||
|
public:
|
||||||
|
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||||
|
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||||
|
typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
|
||||||
|
ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
|
||||||
|
Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
|
||||||
|
:output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
|
||||||
|
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||||
|
|
||||||
|
typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||||
|
auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||||
|
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
|
||||||
|
/// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
|
||||||
|
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
|
||||||
|
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
|
||||||
|
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
|
||||||
|
/// the device_evaluator is detectable and recognisable on the device.
|
||||||
|
typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
|
||||||
|
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||||
|
auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
|
||||||
|
/// const cast added as a naive solution to solve the qualifier drop error
|
||||||
|
auto globalid=static_cast<Index>(itemID.get_global_linear_id());
|
||||||
|
if (globalid< range) {
|
||||||
|
typename DeviceSelf::CoeffReturnType accum = functor.initialize();
|
||||||
|
Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
|
||||||
|
functor.finalize(accum);
|
||||||
|
output_accessor_ptr[globalid]= accum/num_values_to_reduce;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
write_accessor output_accessor;
|
||||||
|
FunctorExpr functors;
|
||||||
|
Tuple_of_Acc tuple_of_accessors;
|
||||||
|
Dims dims;
|
||||||
|
Op functor;
|
||||||
|
Index range;
|
||||||
|
Index num_values_to_reduce;
|
||||||
|
};
|
||||||
|
|
||||||
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
|
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
|
||||||
class FullReductionKernelFunctor{
|
class FullReductionKernelFunctor{
|
||||||
@ -134,14 +174,11 @@ public:
|
|||||||
/// const cast added as a naive solution to solve the qualifier drop error
|
/// const cast added as a naive solution to solve the qualifier drop error
|
||||||
auto globalid=itemID.get_global_linear_id();
|
auto globalid=itemID.get_global_linear_id();
|
||||||
|
|
||||||
if(globalid<rng)
|
tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
|
||||||
tmp_global_accessor.get_pointer()[globalid]=Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op));
|
: static_cast<CoeffReturnType>(op.initialize());
|
||||||
else
|
|
||||||
tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(op.initialize());
|
|
||||||
|
|
||||||
if(remaining!=0 && globalid==0 ){
|
if(remaining!=0 && globalid==0 ){
|
||||||
// this will add the rest of input buffer when the input size is not devidable to red_factor.
|
// this will add the rest of input buffer when the input size is not devidable to red_factor.
|
||||||
// tmp_global_accessor.get_pointer()[0]+=
|
|
||||||
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
|
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
|
||||||
reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||||
auto accum = op.initialize();
|
auto accum = op.initialize();
|
||||||
@ -150,13 +187,58 @@ public:
|
|||||||
op.finalize(accum);
|
op.finalize(accum);
|
||||||
tmp_global_accessor.get_pointer()[0]=accum;
|
tmp_global_accessor.get_pointer()[0]=accum;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
|
||||||
|
class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
|
||||||
|
public:
|
||||||
|
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||||
|
typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
|
||||||
|
|
||||||
|
OutAccessor tmp_global_accessor;
|
||||||
|
Index rng , remaining, red_factor;
|
||||||
|
Op op;
|
||||||
|
Dims dims;
|
||||||
|
FunctorExpr functors;
|
||||||
|
TupleType tuple_of_accessors;
|
||||||
|
|
||||||
|
FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
|
||||||
|
:tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
|
||||||
|
|
||||||
|
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||||
|
|
||||||
|
typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
|
||||||
|
auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||||
|
/// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
|
||||||
|
/// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
|
||||||
|
/// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
|
||||||
|
const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
|
||||||
|
/// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
|
||||||
|
/// the device_evaluator is detectable and recognisable on the device.
|
||||||
|
auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
|
||||||
|
/// const cast added as a naive solution to solve the qualifier drop error
|
||||||
|
auto globalid=itemID.get_global_linear_id();
|
||||||
|
auto scale = (rng*red_factor) + remaining;
|
||||||
|
|
||||||
|
tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
|
||||||
|
:static_cast<CoeffReturnType>(op.initialize())/scale;
|
||||||
|
|
||||||
|
if(remaining!=0 && globalid==0 ){
|
||||||
|
// this will add the rest of input buffer when the input size is not devidable to red_factor.
|
||||||
|
auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
|
||||||
|
auto accum = op.initialize();
|
||||||
|
tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
|
||||||
|
op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
|
||||||
|
op.reduce(remaining_reduce, &accum);
|
||||||
|
op.finalize(accum);
|
||||||
|
tmp_global_accessor.get_pointer()[0]=accum/scale;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@
|
|||||||
|
|
||||||
|
|
||||||
template <typename DataType, int DataLayout, typename IndexType>
|
template <typename DataType, int DataLayout, typename IndexType>
|
||||||
static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
|
static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) {
|
||||||
|
|
||||||
const IndexType num_rows = 452;
|
const IndexType num_rows = 452;
|
||||||
const IndexType num_cols = 765;
|
const IndexType num_cols = 765;
|
||||||
@ -34,6 +34,37 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
|
|||||||
|
|
||||||
in.setRandom();
|
in.setRandom();
|
||||||
|
|
||||||
|
full_redux = in.mean();
|
||||||
|
|
||||||
|
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
|
||||||
|
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
|
||||||
|
|
||||||
|
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
|
||||||
|
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
|
||||||
|
|
||||||
|
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
|
||||||
|
out_gpu.device(sycl_device) = in_gpu.mean();
|
||||||
|
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
|
||||||
|
// Check that the CPU and GPU reductions return the same result.
|
||||||
|
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
|
||||||
|
sycl_device.deallocate(gpu_in_data);
|
||||||
|
sycl_device.deallocate(gpu_out_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <typename DataType, int DataLayout, typename IndexType>
|
||||||
|
static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) {
|
||||||
|
|
||||||
|
const IndexType num_rows = 876;
|
||||||
|
const IndexType num_cols = 953;
|
||||||
|
array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
|
||||||
|
|
||||||
|
Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
|
||||||
|
Tensor<DataType, 0, DataLayout, IndexType> full_redux;
|
||||||
|
Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
|
||||||
|
|
||||||
|
in.setRandom();
|
||||||
|
|
||||||
full_redux = in.minimum();
|
full_redux = in.minimum();
|
||||||
|
|
||||||
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
|
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
|
||||||
@ -50,8 +81,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
|
|||||||
sycl_device.deallocate(gpu_in_data);
|
sycl_device.deallocate(gpu_in_data);
|
||||||
sycl_device.deallocate(gpu_out_data);
|
sycl_device.deallocate(gpu_out_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename DataType, int DataLayout, typename IndexType>
|
template <typename DataType, int DataLayout, typename IndexType>
|
||||||
static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
|
static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) {
|
||||||
|
|
||||||
IndexType dim_x = 145;
|
IndexType dim_x = 145;
|
||||||
IndexType dim_y = 1;
|
IndexType dim_y = 1;
|
||||||
@ -90,7 +123,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename DataType, int DataLayout, typename IndexType>
|
template <typename DataType, int DataLayout, typename IndexType>
|
||||||
static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
|
static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) {
|
||||||
|
|
||||||
IndexType dim_x = 567;
|
IndexType dim_x = 567;
|
||||||
IndexType dim_y = 1;
|
IndexType dim_y = 1;
|
||||||
@ -132,12 +165,14 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
|
|||||||
QueueInterface queueInterface(d);
|
QueueInterface queueInterface(d);
|
||||||
auto sycl_device = Eigen::SyclDevice(&queueInterface);
|
auto sycl_device = Eigen::SyclDevice(&queueInterface);
|
||||||
|
|
||||||
test_full_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
||||||
test_first_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
||||||
test_last_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
||||||
test_full_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
|
||||||
test_first_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
||||||
test_last_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
||||||
|
test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
||||||
|
test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
|
||||||
}
|
}
|
||||||
void test_cxx11_tensor_reduction_sycl() {
|
void test_cxx11_tensor_reduction_sycl() {
|
||||||
for (const auto& device :Eigen::get_sycl_supported_devices()) {
|
for (const auto& device :Eigen::get_sycl_supported_devices()) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user