Make it possible for a vectorized tensor expression to be executed in a CUDA kernel.

This commit is contained in:
Benoit Steiner 2015-11-11 15:22:50 -08:00
parent 4f471146fb
commit 7f1c29fb0c

View File

@@ -50,6 +50,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
{ {
public: public:
typedef typename Expression::Index Index; typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
{ {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
@@ -57,7 +58,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
if (needs_assign) if (needs_assign)
{ {
const Index size = array_prod(evaluator.dimensions()); const Index size = array_prod(evaluator.dimensions());
static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
const Index VectorizedSize = (size / PacketSize) * PacketSize; const Index VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = 0; i < VectorizedSize; i += PacketSize) { for (Index i = 0; i < VectorizedSize; i += PacketSize) {