diff --git a/Eigen/Core b/Eigen/Core
index 3ffd6220b..bf2479585 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -359,6 +359,7 @@ using std::ptrdiff_t;
   #include "src/Core/arch/ZVector/Complex.h"
 #endif
 
+#include "src/Core/arch/CUDA/Complex.h"
 // Half float support
 #include "src/Core/arch/CUDA/Half.h"
 #include "src/Core/arch/CUDA/PacketMathHalf.h"
diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h
index e3f20894d..25c3ef3d7 100644
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
   *
   * The function generates 'size' equally spaced values in the closed interval [low,high].
   * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
+  * assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
   * and yields faster code than the random access version.
   *
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
 /**
   * \brief Sets a linearly spaced vector.
   *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
+  * The function fills *this with equally spaced values in the closed interval [low,high].
   * When size is set to 1, a vector of length 1 containing 'high' is returned.
   *
   * \only_for_vectors
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index bff322b3c..a8c83f168 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
   enum {
     ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
     PacketSize      = internal::packet_traits<Scalar>::size
   };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
+  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
+  #else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
+            ? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
     ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
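[Annotation] The #else branch above relies on a classic trick: over-allocate by
one alignment unit and return the first aligned address inside the buffer. A
minimal standalone C++ sketch of the same computation (illustrative names, not
part of the patch):

    #include <cstdint>
    #include <cstddef>

    constexpr std::size_t kAlign = 16;   // stand-in for EIGEN_MAX_ALIGN_BYTES

    struct AlignedScratch {
      char raw[64 + kAlign];             // over-allocate by one alignment unit
      char* data() {
        // Clear the low bits of the address, then advance one full step:
        // the result is the first kAlign-aligned address inside `raw`.
        return reinterpret_cast<char*>(
            (reinterpret_cast<std::uintptr_t>(raw) & ~(std::uintptr_t(kAlign) - 1)) + kAlign);
      }
    };

For example, with kAlign == 16 and raw at 0x1003, (0x1003 & ~0xF) + 16 == 0x1010.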
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 0c77ee003..3c9ef22fa 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
   // this range is +/-1.0f in single-precision.
   const T plus_9 = pset1<T>(9.f);
   const T minus_9 = pset1<T>(-9.f);
-  const T x = pmax(minus_9, pmin(plus_9, a_x));
-
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  // step such that if a_x is nan, x will be either 9 or -9,
+  // and tanh will return 1 or -1 instead of nan.
+  // This is supposed to be fixed in gcc6.3,
+  // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 19a7483ba..d56df8249 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -330,15 +330,11 @@ template<typename Derived> class MatrixBase
 
 /////////// LU module ///////////
 
-    EIGEN_DEVICE_FUNC
     inline const FullPivLU<PlainObject> fullPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> partialPivLu() const;
-    EIGEN_DEVICE_FUNC
     inline const PartialPivLU<PlainObject> lu() const;
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
new file mode 100644
index 000000000..f133b2db9
--- /dev/null
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CUDA_H
+#define EIGEN_COMPLEX_CUDA_H
+
+// clang-format off
+
+namespace Eigen {
+
+namespace internal {
+
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, clang does not treat them as device
+// functions and thus Eigen functors making use of these operators fail to
+// compile. Here, we manually specialize these functors for complex types when
+// building for CUDA to avoid non-constexpr methods.
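+//
+// [Annotation, not in the original file] Concretely: without these
+// specializations, a device-side complex expression such as
+//   out.device(gpu_device) = in1 + in2;   // Tensor of std::complex<T>
+// would instantiate the generic functor, whose operator() calls the host-only
+// operator+ of std::complex<T>, and clang rejects that call when compiling
+// the kernel. The versions below stick to numext::real/numext::imag and plain
+// real arithmetic, which are usable on the device.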
+
+template<typename T> struct scalar_sum_op<const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) + numext::real(b),
+                           numext::imag(a) + numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_difference_op<const std::complex<T> > {
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    return std::complex<T>(numext::real(a) - numext::real(b),
+                           numext::imag(a) - numext::imag(b));
+  }
+};
+
+template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasMul
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    return std::complex<T>(a_real * b_real - a_imag * b_imag,
+                           a_real * b_imag + a_imag * b_real);
+  }
+};
+
+template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > {
+  enum {
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv
+  };
+  typedef typename std::complex<T> result_type;
+
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
+    const T a_real = numext::real(a);
+    const T a_imag = numext::imag(a);
+    const T b_real = numext::real(b);
+    const T b_imag = numext::imag(b);
+    const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
+    return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
+                           (a_imag * b_real - a_real * b_imag) * norm);
+  }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_CUDA_H
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index d8d30267e..d97f8caa7 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index c11a983c7..4b292e74d 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
 
     typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index fa60008ef..088a65240 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -671,6 +671,14 @@ struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
 
+template<typename T,bool Vectorized>
+struct scalar_div_cost<std::complex<T>, Vectorized> {
+  enum { value = 2*scalar_div_cost<T>::value
+               + 6*NumTraits<T>::MulCost
+               + 3*NumTraits<T>::AddCost
+  };
+};
+
 
 template<bool Vectorized>
 struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
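[Annotation] The complex scalar_div_cost above matches the textbook division
used by the CUDA scalar_quotient_op earlier in this patch, counted in real
operations when the quotient is written with two real divisions:

    (a_r + i*a_i)/(b_r + i*b_i)
      = (a_r*b_r + a_i*b_i)/(b_r^2 + b_i^2) + i*(a_i*b_r - a_r*b_i)/(b_r^2 + b_i^2)

That is 6 real multiplies (4 for the two numerators, 2 for b_r^2 and b_i^2),
3 real additions/subtractions, and 2 real divisions, exactly the
2*div + 6*Mul + 3*Add tallied by the specialization.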
diff --git a/Eigen/src/Geometry/EulerAngles.h b/Eigen/src/Geometry/EulerAngles.h
index b875b7a13..4865e58aa 100644
--- a/Eigen/src/Geometry/EulerAngles.h
+++ b/Eigen/src/Geometry/EulerAngles.h
@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+      if(res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      }
+      else {
+        res[0] += Scalar(EIGEN_PI);
+      }
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else
diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h
index 4c1f499a1..80de2c305 100644
--- a/Eigen/src/Householder/Householder.h
+++ b/Eigen/src/Householder/Householder.h
@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
     Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     *this *= Scalar(1)-tau;
   }
-  else
+  else if(tau!=Scalar(0))
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
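[Annotation] For context on the Householder change: with Eigen's convention

    H = I - tau * [1; essential] * [1; essential]^*

tau == 0 makes H exactly the identity, so the new `else if(tau!=Scalar(0))`
guard skips the rank-one update (and the workspace traffic) when there is
provably nothing to do, instead of performing a no-op update.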
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index ebcd5c208..03b6af706 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -879,7 +879,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
   *
   * \sa class FullPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 147f9496c..3134632e1 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -327,7 +327,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
   *
   * \sa computeInverseAndDetWithCheck()
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 13394fffa..d43961887 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -584,7 +584,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
   *
   * \sa class PartialPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
@@ -599,7 +599,7 @@ MatrixBase<Derived>::partialPivLu() const
   *
   * \sa class PartialPivLU
   */
-template<typename Derived> EIGEN_DEVICE_FUNC
+template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
 {
diff --git a/bench/btl/libs/blaze/CMakeLists.txt b/bench/btl/libs/blaze/CMakeLists.txt
index f8b1b2ec3..e99a0855c 100644
--- a/bench/btl/libs/blaze/CMakeLists.txt
+++ b/bench/btl/libs/blaze/CMakeLists.txt
@@ -1,10 +1,13 @@
 find_package(BLAZE)
-find_package(Boost)
+find_package(Boost COMPONENTS system)
 if (BLAZE_FOUND AND Boost_FOUND)
   include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
   btl_add_bench(btl_blaze main.cpp)
+  # Note: The newest blaze version requires C++14.
+  # Ideally, we should set this depending on the version of Blaze we found.
+  set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
   if(BUILD_btl_blaze)
-    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES})
   endif()
 endif ()
diff --git a/doc/CustomizingEigen_NullaryExpr.dox b/doc/CustomizingEigen_NullaryExpr.dox
index d70f81065..37c8dcd89 100644
--- a/doc/CustomizingEigen_NullaryExpr.dox
+++ b/doc/CustomizingEigen_NullaryExpr.dox
@@ -53,6 +53,33 @@ showing that the program works as expected:
 
 This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
 
+
+\section NullaryExpr_Indexing Example 2: indexing rows and columns
+
+The goal here is to mimic MATLAB's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
+
+\snippet nullary_indexing.out main1
+
+To this end, let us first write a nullary functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
+
+\snippet nullary_indexing.cpp functor
+
+Then, let's create an \c indexing(A,rows,cols) function that creates the nullary expression:
+
+\snippet nullary_indexing.cpp function
+
+Finally, here is an example of how this function can be used:
+
+\snippet nullary_indexing.cpp main1
+
+This straightforward implementation is already quite powerful, as the row or column index arrays can also be expressions performing offsetting, modulo, striding, reverse, etc., as shown below:
+
+\snippet nullary_indexing.cpp main2
+
+and the output is:
+
+\snippet nullary_indexing.out main2
+
 */
 
 }
diff --git a/doc/examples/CMakeLists.txt b/doc/examples/CMakeLists.txt
index 08cf8efd7..f7a19055f 100644
--- a/doc/examples/CMakeLists.txt
+++ b/doc/examples/CMakeLists.txt
@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
     )
   add_dependencies(all_examples ${example})
 endforeach(example_src)
+
+check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
+if(EIGEN_COMPILER_SUPPORT_CPP11)
+ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
+endif()
\ No newline at end of file
diff --git a/doc/examples/nullary_indexing.cpp b/doc/examples/nullary_indexing.cpp
new file mode 100644
index 000000000..e27c3585a
--- /dev/null
+++ b/doc/examples/nullary_indexing.cpp
@@ -0,0 +1,66 @@
+#include <Eigen/Core>
+#include <iostream>
+
+using namespace Eigen;
+
+// [functor]
+template<class ArgType, class RowIndexType, class ColIndexType>
+class indexing_functor {
+  const ArgType &m_arg;
+  const RowIndexType &m_rowIndices;
+  const ColIndexType &m_colIndices;
+public:
+  typedef Matrix<typename ArgType::Scalar,
+                 RowIndexType::SizeAtCompileTime,
+                 ColIndexType::SizeAtCompileTime,
+                 ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
+                 RowIndexType::MaxSizeAtCompileTime,
+                 ColIndexType::MaxSizeAtCompileTime> MatrixType;
+
+  indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+    : m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
+  {}
+
+  const typename ArgType::Scalar& operator() (Index row, Index col) const {
+    return m_arg(m_rowIndices[row], m_colIndices[col]);
+  }
+};
+// [functor]
+
+// [function]
+template<class ArgType, class RowIndexType, class ColIndexType>
+CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
+indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
+{
+  typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
+  typedef typename Func::MatrixType MatrixType;
+  return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
+}
+// [function]
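+
+// [Annotation, not in the original example] indexing() returns a lazy
+// CwiseNullaryOp expression: no coefficients are computed until the
+// expression is assigned or otherwise evaluated, e.g.
+//   MatrixXi B = indexing(A, ri, ci);  // functor runs here, once per coeff
+//   auto e    = indexing(A, ri, ci);   // no work yet; evaluates on use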
+
+
+int main()
+{
+  std::cout << "[main1]\n";
+  Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
+  Array3i ri(1,2,1);
+  ArrayXi ci(6); ci << 3,2,1,0,0,2;
+  Eigen::MatrixXi B = indexing(A, ri, ci);
+  std::cout << "A =" << std::endl;
+  std::cout << A << std::endl << std::endl;
+  std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
+  std::cout << B << std::endl;
+  std::cout << "[main1]\n";
+
+  std::cout << "[main2]\n";
+  B = indexing(A, ri+1, ci);
+  std::cout << "A(ri+1,ci) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#if __cplusplus >= 201103L
+  B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
+  std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
+  std::cout << B << std::endl << std::endl;
+#endif
+  std::cout << "[main2]\n";
+}
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 9a1f3792c..8ad5ac639 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
     VERIFY(ldlt.info()==NumericalIssue);
   }
+#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
   {
     mat.resize(3,3);
     mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
     VERIFY(ldlt.info()==NumericalIssue);
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
   }
+#endif
   {
     mat.resize(3,3);
     mat << 1, 2, 3,
diff --git a/test/fastmath.cpp b/test/fastmath.cpp
index 438e6b2e5..cc5db0746 100644
--- a/test/fastmath.cpp
+++ b/test/fastmath.cpp
@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
     VERIFY( !m.allFinite() );
     VERIFY( m.hasNaN() );
   }
-  m(4) /= T(0.0);
+  T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
+  m(4) /= hidden_zero;
   if(dryrun)
   {
     std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << "  ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 77514d8a0..1394d9f2b 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
   }
   if (PacketTraits::HasTanh)
   {
+    // NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
     data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
     packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
     h.store(data2, internal::ptanh(h.load(data1)));
diff --git a/test/product_small.cpp b/test/product_small.cpp
index 3e8dab01e..0db50b949 100644
--- a/test/product_small.cpp
+++ b/test/product_small.cpp
@@ -213,7 +213,8 @@ void test_product_small()
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( product(Matrix()) );
-    CALL_SUBTEST_2( product(Matrix()) );
+    CALL_SUBTEST_2( product(Matrix()) );
+    CALL_SUBTEST_8( product(Matrix()) );
     CALL_SUBTEST_3( product(Matrix3d()) );
     CALL_SUBTEST_4( product(Matrix4d()) );
     CALL_SUBTEST_5( product(Matrix4f()) );
diff --git a/test/svd_fill.h b/test/svd_fill.h
index a1f752c66..3877c0c7e 100644
--- a/test/svd_fill.h
+++ b/test/svd_fill.h
@@ -14,6 +14,8 @@
 template<> Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
 template<> Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
+template<typename T>
+Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
 
 template<typename MatrixType>
 void svd_fill_random(MatrixType &m, int Option = 0)
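[Annotation] The TensorDeviceCuda.h change below drops EIGEN_DEVICE_FUNC
(which under NVCC expands to "__host__ __device__") from methods that only
ever touch the host-side stream. A minimal sketch of the effect, with
hypothetical names:

    __host__ __device__ void* before(std::size_t n);  // callable in kernels, but
                                                      // only fails at run time via
                                                      // eigen_assert(false && ...)
    __host__ void* after(std::size_t n);              // device-side misuse is now
                                                      // a compile-time error

so the #ifndef __CUDA_ARCH__ / eigen_assert fallback bodies become dead
weight and are deleted below.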
to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { @@ -242,21 +215,21 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE size_t numThreads() const { // FIXME return 32; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME return 48*1024; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and // there is no l3 cache on cuda devices. 
     return firstLevelCacheSize();
   }
 
@@ -276,56 +249,26 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
     return stream_->deviceProperties().multiProcessorCount;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
     return stream_->deviceProperties().maxThreadsPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
     return stream_->deviceProperties().sharedMemPerBlock;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
     return stream_->deviceProperties().minor;
-#else
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-    return 0;
-#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
+  EIGEN_STRONG_INLINE int maxBlocks() const {
     return max_blocks_;
   }
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 82243e643..98f9f647d 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -189,7 +189,12 @@ namespace Eigen
       res[0] = atan2(mat(J,K), mat(K,K));
       Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
       if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         res[1] = atan2(-mat(I,K), -c2);
       }
       else
@@ -212,7 +217,12 @@ namespace Eigen
       res[0] = atan2(mat(J,I), mat(K,I));
 
       if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
      {
-        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
         Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
         res[1] = -atan2(s2, mat(I,I));
       }
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 714046809..9eac6ec73 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
   ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
   ei_add_test(cxx11_tensor_reduction_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
   ei_add_test(cxx11_tensor_cast_float16_cuda)
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..2baf5eaad
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,97 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename T>
+void test_cuda_complex_cwise_ops() {
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
+      d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add = 0,
+    Sub,
+    Mul,
+    Div
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op <= Div; op++) {
+    std::complex<T> expected;
+    switch (static_cast<CwiseOp>(op)) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_complex_cwise_ops()
+{
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}