mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-13 20:26:03 +08:00
Adding asynchronous execution as it improves the performance.
This commit is contained in:
parent
2d4a091beb
commit
730eb9fe1c
@ -347,7 +347,7 @@ template< typename Self, typename Output, typename Index, typename ContractT, ty
|
|||||||
/// End the kernel
|
/// End the kernel
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
self.device().synchronize();
|
self.device().asynchronousExec();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -214,7 +214,7 @@ struct SyclDevice {
|
|||||||
auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
|
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
|
||||||
});
|
});
|
||||||
synchronize();
|
asynchronousExec();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
|
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
|
||||||
@ -245,7 +245,7 @@ struct SyclDevice {
|
|||||||
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0));
|
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0));
|
||||||
});
|
});
|
||||||
synchronize();
|
asynchronousExec();
|
||||||
}
|
}
|
||||||
/// returning the sycl queue
|
/// returning the sycl queue
|
||||||
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
|
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
|
||||||
@ -263,7 +263,7 @@ struct SyclDevice {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
synchronize();
|
asynchronousExec();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||||
@ -282,6 +282,10 @@ struct SyclDevice {
|
|||||||
EIGEN_STRONG_INLINE void synchronize() const {
|
EIGEN_STRONG_INLINE void synchronize() const {
|
||||||
sycl_queue().wait_and_throw(); //pass
|
sycl_queue().wait_and_throw(); //pass
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void asynchronousExec() const {
|
||||||
|
sycl_queue().throw_asynchronous();//pass
|
||||||
|
}
|
||||||
// This function checks if the runtime recorded an error for the
|
// This function checks if the runtime recorded an error for the
|
||||||
// underlying stream device.
|
// underlying stream device.
|
||||||
EIGEN_STRONG_INLINE bool ok() const {
|
EIGEN_STRONG_INLINE bool ok() const {
|
||||||
|
@ -81,7 +81,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
dev.sycl_queue().submit(f);
|
dev.sycl_queue().submit(f);
|
||||||
dev.synchronize();
|
dev.asynchronousExec();
|
||||||
|
|
||||||
/* At this point, you could queue::wait_and_throw() to ensure that
|
/* At this point, you could queue::wait_and_throw() to ensure that
|
||||||
* errors are caught quickly. However, this would likely impact
|
* errors are caught quickly. However, this would likely impact
|
||||||
@ -173,7 +173,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
|||||||
tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
|
tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
dev.synchronize();
|
dev.asynchronousExec();
|
||||||
|
|
||||||
/// This is used to recursively reduce the tmp value to an element of 1;
|
/// This is used to recursively reduce the tmp value to an element of 1;
|
||||||
syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
|
||||||
@ -212,7 +212,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
|||||||
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
|
(output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
|
||||||
|
|
||||||
});
|
});
|
||||||
dev.synchronize();
|
dev.asynchronousExec();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -54,7 +54,7 @@ void run(Expr &expr, Dev &dev) {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
dev.synchronize();
|
dev.asynchronousExec();
|
||||||
}
|
}
|
||||||
evaluator.cleanup();
|
evaluator.cleanup();
|
||||||
}
|
}
|
||||||
|
@ -137,8 +137,6 @@ static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
|
|||||||
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
|
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
|
||||||
|
|
||||||
TEST_UNARY_BUILTINS(float)
|
TEST_UNARY_BUILTINS(float)
|
||||||
/// your GPU must support double. Otherwise, disable the double test.
|
|
||||||
TEST_UNARY_BUILTINS(double)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace std {
|
namespace std {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user