This commit is contained in:
Gael Guennebaud 2016-09-02 15:24:14 +02:00
commit 49c0390ce0
3 changed files with 98 additions and 29 deletions

View File

@ -25,7 +25,8 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{ {
// Type promotion to handle the case where the types of the lhs and the rhs are different. // Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename gebp_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ResScalar Scalar; typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind; typename traits<RhsXprType>::StorageKind>::ret StorageKind;

View File

@ -489,6 +489,27 @@ static void test_tensor_product()
} }
template<int DataLayout>
static void test_const_inputs()
{
Tensor<float, 2, DataLayout> in1(2, 3);
Tensor<float, 2, DataLayout> in2(3, 2);
in1.setRandom();
in2.setRandom();
TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);
TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);
Tensor<float, 2, DataLayout> mat3(2,2);
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
mat3 = mat1.contract(mat2, dims);
VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
}
void test_cxx11_tensor_contraction() void test_cxx11_tensor_contraction()
{ {
CALL_SUBTEST(test_evals<ColMajor>()); CALL_SUBTEST(test_evals<ColMajor>());
@ -519,4 +540,6 @@ void test_cxx11_tensor_contraction()
CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
CALL_SUBTEST(test_tensor_product<ColMajor>()); CALL_SUBTEST(test_tensor_product<ColMajor>());
CALL_SUBTEST(test_tensor_product<RowMajor>()); CALL_SUBTEST(test_tensor_product<RowMajor>());
CALL_SUBTEST(test_const_inputs<ColMajor>());
CALL_SUBTEST(test_const_inputs<RowMajor>());
} }

View File

@ -10,7 +10,6 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include <cuda_fp16.h> #include <cuda_fp16.h>
@ -19,10 +18,55 @@
using Eigen::Tensor; using Eigen::Tensor;
void test_cuda_nullary() {
Tensor<float, 1, 0, int> in1(2);
Tensor<float, 1, 0, int> in2(2);
in1.setRandom();
in2.setRandom();
std::size_t tensor_bytes = in1.size() * sizeof(float);
float* d_in1;
float* d_in2;
cudaMalloc((void**)(&d_in1), tensor_bytes);
cudaMalloc((void**)(&d_in2), tensor_bytes);
cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
d_in1, 2);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
d_in2, 2);
gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
gpu_in2.device(gpu_device) = gpu_in2.random();
Tensor<float, 1, 0, int> new1(2);
Tensor<float, 1, 0, int> new2(2);
assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(new1(i), 3.14f);
VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
}
cudaFree(d_in1);
cudaFree(d_in2);
}
void test_cuda_elementwise_small() { void test_cuda_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<int, 1>(2)); Tensor<float, 1> in1(Eigen::array<int64_t, 1>(2));
Tensor<float, 1> in2(Eigen::array<int, 1>(2)); Tensor<float, 1> in2(Eigen::array<int64_t, 1>(2));
Tensor<float, 1> out(Eigen::array<int, 1>(2)); Tensor<float, 1> out(Eigen::array<int64_t, 1>(2));
in1.setRandom(); in1.setRandom();
in2.setRandom(); in2.setRandom();
@ -44,11 +88,11 @@ void test_cuda_elementwise_small() {
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
d_in1, Eigen::array<int, 1>(2)); d_in1, Eigen::array<int64_t, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
d_in2, Eigen::array<int, 1>(2)); d_in2, Eigen::array<int64_t, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
d_out, Eigen::array<int, 1>(2)); d_out, Eigen::array<int64_t, 1>(2));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
@ -58,8 +102,8 @@ void test_cuda_elementwise_small() {
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX( VERIFY_IS_APPROX(
out(Eigen::array<int, 1>(i)), out(Eigen::array<int64_t, 1>(i)),
in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i))); in1(Eigen::array<int64_t, 1>(i)) + in2(Eigen::array<int64_t, 1>(i)));
} }
cudaFree(d_in1); cudaFree(d_in1);
@ -69,10 +113,10 @@ void test_cuda_elementwise_small() {
void test_cuda_elementwise() void test_cuda_elementwise()
{ {
Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97)); Tensor<float, 3> in1(Eigen::array<int64_t, 3>(72,53,97));
Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97)); Tensor<float, 3> in2(Eigen::array<int64_t, 3>(72,53,97));
Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97)); Tensor<float, 3> in3(Eigen::array<int64_t, 3>(72,53,97));
Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97)); Tensor<float, 3> out(Eigen::array<int64_t, 3>(72,53,97));
in1.setRandom(); in1.setRandom();
in2.setRandom(); in2.setRandom();
in3.setRandom(); in3.setRandom();
@ -98,10 +142,10 @@ void test_cuda_elementwise()
Eigen::CudaStreamDevice stream; Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int64_t, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int64_t, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int64_t, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int64_t, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
@ -111,7 +155,7 @@ void test_cuda_elementwise()
for (int i = 0; i < 72; ++i) { for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) { for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) { for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k))); VERIFY_IS_APPROX(out(Eigen::array<int64_t, 3>(i,j,k)), in1(Eigen::array<int64_t, 3>(i,j,k)) + in2(Eigen::array<int64_t, 3>(i,j,k)) * in3(Eigen::array<int64_t, 3>(i,j,k)));
} }
} }
} }
@ -181,7 +225,7 @@ void test_cuda_reduction()
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
array<int, 2> reduction_axis; array<int64_t, 2> reduction_axis;
reduction_axis[0] = 1; reduction_axis[0] = 1;
reduction_axis[1] = 3; reduction_axis[1] = 3;
@ -214,8 +258,8 @@ void test_cuda_contraction()
// more than 30 * 1024, which is the number of threads in blocks on // more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU // a 15 SM GK110 GPU
Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1)); Tensor<float, 5, DataLayout> t_right(Eigen::array<int64_t, 5>(3, 31, 7, 20, 1));
Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1)); Tensor<float, 5, DataLayout> t_result(Eigen::array<int64_t, 5>(6, 50, 7, 20, 1));
t_left.setRandom(); t_left.setRandom();
t_right.setRandom(); t_right.setRandom();
@ -299,7 +343,7 @@ void test_cuda_convolution_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
Eigen::array<int, 1> dims(1); Eigen::array<int64_t, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@ -352,7 +396,7 @@ void test_cuda_convolution_inner_dim_col_major_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
Eigen::array<int, 1> dims(0); Eigen::array<int64_t, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@ -405,7 +449,7 @@ void test_cuda_convolution_inner_dim_row_major_1d()
Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
Eigen::array<int, 1> dims(3); Eigen::array<int64_t, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@ -459,7 +503,7 @@ void test_cuda_convolution_2d()
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
Eigen::array<int, 2> dims(1,2); Eigen::array<int64_t, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@ -496,9 +540,9 @@ void test_cuda_convolution_2d()
template<int DataLayout> template<int DataLayout>
void test_cuda_convolution_3d() void test_cuda_convolution_3d()
{ {
Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17)); Tensor<float, 5, DataLayout> input(Eigen::array<int64_t, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2); Tensor<float, 3, DataLayout> kernel(3,4,2);
Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17)); Tensor<float, 5, DataLayout> out(Eigen::array<int64_t, 5>(74,35,8,136,17));
input = input.constant(10.0f) + input.random(); input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random(); kernel = kernel.constant(7.0f) + kernel.random();
@ -523,7 +567,7 @@ void test_cuda_convolution_3d()
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
Eigen::array<int, 3> dims(1,2,3); Eigen::array<int64_t, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@ -1168,6 +1212,7 @@ void test_cuda_betainc()
void test_cxx11_tensor_cuda() void test_cxx11_tensor_cuda()
{ {
CALL_SUBTEST_1(test_cuda_nullary());
CALL_SUBTEST_1(test_cuda_elementwise_small()); CALL_SUBTEST_1(test_cuda_elementwise_small());
CALL_SUBTEST_1(test_cuda_elementwise()); CALL_SUBTEST_1(test_cuda_elementwise());
CALL_SUBTEST_1(test_cuda_props()); CALL_SUBTEST_1(test_cuda_props());