Gael Guennebaud 2016-09-22 22:32:55 +02:00
commit 8f2bdde373
27 changed files with 374 additions and 108 deletions

View File

@@ -359,6 +359,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/ZVector/Complex.h"
#endif
#include "src/Core/arch/CUDA/Complex.h"
// Half float support
#include "src/Core/arch/CUDA/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h"

View File

@@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
*
* The function generates 'size' equally spaced values in the closed interval [low,high].
* This particular version of LinSpaced() uses sequential access, i.e. vector access is
* assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
* assumed to be a(0), a(1), ..., a(size-1). This assumption allows for better vectorization
* and yields faster code than the random access version.
*
* When size is set to 1, a vector of length 1 containing 'high' is returned.
@@ -389,7 +389,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
/**
* \brief Sets a linearly spaced vector.
*
* The function fill *this with equally spaced values in the closed interval [low,high].
* The function fills *this with equally spaced values in the closed interval [low,high].
* When size is set to 1, a vector of length 1 containing 'high' is returned.
*
* \only_for_vectors
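For reference, here is the documented behaviour as a standalone snippet (ordinary Eigen usage, not part of this commit):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Five equally spaced values in the closed interval [0,1].
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);
  std::cout << v.transpose() << "\n";  // 0 0.25 0.5 0.75 1
  // With size set to 1, a vector containing only 'high' is returned.
  std::cout << Eigen::VectorXd::LinSpaced(1, 0.0, 1.0) << "\n";  // 1
}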

View File

@@ -159,20 +159,20 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
{
#if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
#else
// Some architectures cannot align on the stack,
// => let's manually enforce alignment by allocating more data and returning the address of the first aligned element.
enum {
ForceAlignment = internal::packet_traits<Scalar>::Vectorizable,
PacketSize = internal::packet_traits<Scalar>::size
};
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
#if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0,EIGEN_PLAIN_ENUM_MIN(AlignedMax,PacketSize)> m_data;
EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
#else
// Some architectures cannot align on the stack,
// => let's manually enforce alignment by allocating more data and returning the address of the first aligned element.
internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?EIGEN_MAX_ALIGN_BYTES:0),0> m_data;
EIGEN_STRONG_INLINE Scalar* data() {
return ForceAlignment
? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
? reinterpret_cast<Scalar*>((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
: m_data.array;
}
#endif
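As an aside, the round-up arithmetic in the #else branch, isolated as a sketch (helper name hypothetical; it assumes the buffer was over-allocated by ALIGN bytes, as m_data is above):

#include <cstddef>
#include <cstdint>

template <std::size_t ALIGN>
char* align_up(char* p)
{
  // Clear the low bits (round down), then add ALIGN: yields the first
  // ALIGN-aligned address strictly above the rounded-down pointer,
  // exactly as data() does above.
  const std::uintptr_t u = reinterpret_cast<std::uintptr_t>(p);
  return reinterpret_cast<char*>((u & ~static_cast<std::uintptr_t>(ALIGN - 1)) + ALIGN);
}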
@@ -207,7 +207,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);

View File

@@ -29,8 +29,12 @@ T generic_fast_tanh_float(const T& a_x)
// this range is +/-1.0f in single-precision.
const T plus_9 = pset1<T>(9.f);
const T minus_9 = pset1<T>(-9.f);
const T x = pmax(minus_9, pmin(plus_9, a_x));
// NOTE GCC prior to 6.3 might improperly optimize this max/min
// step such that if a_x is nan, x will be either 9 or -9,
// and tanh will return 1 or -1 instead of nan.
// This is supposed to be fixed in GCC 6.3,
// see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
const T x = pmax(minus_9,pmin(plus_9,a_x));
// The monomial coefficients of the numerator polynomial (odd).
const T alpha_1 = pset1<T>(4.89352455891786e-03f);
const T alpha_3 = pset1<T>(6.37261928875436e-04f);
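The NaN caveat above stems from IEEE-754 semantics: every comparison involving a NaN is false, so a min/max returns whichever operand sits in the comparison-failed slot, and a compiler that commutes the operands silently changes the result. A scalar illustration (not part of the diff):

#include <algorithm>
#include <cmath>
#include <iostream>

int main()
{
  const float qnan = std::nanf("");
  // std::min(a,b) returns (b < a) ? b : a; any comparison with NaN is
  // false, so the first operand wins and operand order decides propagation.
  std::cout << std::min(9.0f, qnan) << "\n";  // 9   -- the NaN is dropped
  std::cout << std::min(qnan, 9.0f) << "\n";  // nan -- the NaN propagates
}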

View File

@@ -330,15 +330,11 @@ template<typename Derived> class MatrixBase
/////////// LU module ///////////
EIGEN_DEVICE_FUNC
inline const FullPivLU<PlainObject> fullPivLu() const;
EIGEN_DEVICE_FUNC
inline const PartialPivLU<PlainObject> partialPivLu() const;
EIGEN_DEVICE_FUNC
inline const PartialPivLU<PlainObject> lu() const;
EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const;
template<typename ResultType>

View File

@@ -0,0 +1,88 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_CUDA_H
#define EIGEN_COMPLEX_CUDA_H
// clang-format off
namespace Eigen {
namespace internal {
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
// functions and thus Eigen functors making use of these operators fail to
// compile. Here, we manually specialize these functors for complex types when
// building for CUDA to avoid non-constexpr methods.
template<typename T> struct scalar_sum_op<std::complex<T>> {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) + numext::real(b),
numext::imag(a) + numext::imag(b));
}
};
template<typename T> struct scalar_difference_op<std::complex<T>> {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) - numext::real(b),
numext::imag(a) - numext::imag(b));
}
};
template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T>> {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasMul
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
return std::complex<T>(a_real * b_real - a_imag * b_imag,
a_real * b_imag + a_imag * b_real);
}
};
template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T>> {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasDiv
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
(a_imag * b_real - a_real * b_imag) * norm);
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_CUDA_H
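The quotient functor above is the textbook expansion a/b = a*conj(b)/|b|^2. A host-side sketch of the same arithmetic, checked against std::complex (helper name hypothetical):

#include <complex>
#include <iostream>

std::complex<float> naive_div(const std::complex<float>& a,
                              const std::complex<float>& b)
{
  // a/b = a*conj(b) / |b|^2, expanded into real operations.
  const float inv_norm = 1.0f / (b.real()*b.real() + b.imag()*b.imag());
  return std::complex<float>((a.real()*b.real() + a.imag()*b.imag()) * inv_norm,
                             (a.imag()*b.real() - a.real()*b.imag()) * inv_norm);
}

int main()
{
  const std::complex<float> a(3.14f, 2.7f), b(-10.6f, 1.4f);
  std::cout << naive_div(a, b) << " vs " << a / b << "\n";  // agree to rounding
}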

View File

@@ -179,7 +179,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
{
typedef typename Dest::Scalar ResScalar;
typedef typename Rhs::Scalar RhsScalar;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());

View File

@@ -216,7 +216,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
typedef internal::blas_traits<Rhs> RhsBlasTraits;
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);

View File

@@ -671,6 +671,14 @@ struct scalar_div_cost {
enum { value = 8*NumTraits<T>::MulCost };
};
template<typename T,bool Vectorized>
struct scalar_div_cost<std::complex<T>, Vectorized> {
enum { value = 2*scalar_div_cost<T>::value
+ 6*NumTraits<T>::MulCost
+ 3*NumTraits<T>::AddCost
};
};
template<bool Vectorized>
struct scalar_div_cost<signed long,Vectorized,typename conditional<sizeof(long)==8,void,false_type>::type> { enum { value = 24 }; };
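These constants appear to count the textbook complex division directly: the denominator br*br + bi*bi costs 2 multiplies and 1 add, each of the two numerators costs 2 multiplies and 1 add, and dividing both numerators by the denominator accounts for the 2*scalar_div_cost<T> term: 2 divisions, 6 multiplies, 3 additions in total.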

View File

@@ -55,7 +55,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
res[0] = atan2(coeff(j,i), coeff(k,i));
if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
{
res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
res[1] = -atan2(s2, coeff(i,i));
}
@@ -84,7 +89,12 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
res[0] = atan2(coeff(j,k), coeff(k,k));
Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
res[1] = atan2(-coeff(i,k), -c2);
}
else
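The rewritten branch simply moves res[0] to the equivalent Euler solution half a turn away while staying inside atan2's (-pi, pi] range; a scalar sketch of the move:

#include <iostream>

// Map an angle in (-pi, pi] to the equivalent angle half a turn away,
// staying inside (-pi, pi]; the same move as the if/else above.
double flip_half_turn(double a)
{
  const double pi = 3.141592653589793;
  return a > 0.0 ? a - pi : a + pi;
}

int main()
{
  std::cout << flip_half_turn(2.0) << "\n";   // approx. -1.1416
  std::cout << flip_half_turn(-2.0) << "\n";  // approx.  1.1416
}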

View File

@@ -119,7 +119,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
{
*this *= Scalar(1)-tau;
}
else
else if(tau!=Scalar(0))
{
Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
@@ -156,7 +156,7 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
{
*this *= Scalar(1)-tau;
}
else
else if(tau!=Scalar(0))
{
Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
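The new tau != 0 guard pays off because tau == 0 makes the Householder matrix H = I - tau*v*v^T collapse to the identity, so the rank-1 update can be skipped outright. A minimal check:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  const Eigen::Vector3d v(1.0, 0.5, -2.0);
  const double tau = 0.0;
  // With tau == 0 the Householder matrix is exactly the identity.
  const Eigen::Matrix3d H =
      Eigen::Matrix3d::Identity() - tau * v * v.transpose();
  std::cout << H << "\n";  // prints the 3x3 identity
}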

View File

@@ -879,7 +879,7 @@ struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_
*
* \sa class FullPivLU
*/
template<typename Derived> EIGEN_DEVICE_FUNC
template<typename Derived>
inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
MatrixBase<Derived>::fullPivLu() const
{

View File

@@ -327,7 +327,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
*
* \sa computeInverseAndDetWithCheck()
*/
template<typename Derived> EIGEN_DEVICE_FUNC
template<typename Derived>
inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
{
EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)

View File

@@ -584,7 +584,7 @@ struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assi
*
* \sa class PartialPivLU
*/
template<typename Derived> EIGEN_DEVICE_FUNC
template<typename Derived>
inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
MatrixBase<Derived>::partialPivLu() const
{
@@ -599,7 +599,7 @@ MatrixBase<Derived>::partialPivLu() const
*
* \sa class PartialPivLU
*/
template<typename Derived> EIGEN_DEVICE_FUNC
template<typename Derived>
inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
MatrixBase<Derived>::lu() const
{

View File

@@ -1,10 +1,13 @@
find_package(BLAZE)
find_package(Boost)
find_package(Boost COMPONENTS system)
if (BLAZE_FOUND AND Boost_FOUND)
include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
btl_add_bench(btl_blaze main.cpp)
# Note: The newest blaze version requires C++14.
# Ideally, we should set this depending on the version of Blaze we found
set_property(TARGET btl_blaze PROPERTY CXX_STANDARD 14)
if(BUILD_btl_blaze)
target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
target_link_libraries(btl_blaze ${Boost_LIBRARIES})
endif()
endif ()

View File

@@ -53,6 +53,33 @@ showing that the program works as expected:
This implementation of \c makeCirculant is much simpler than \ref TopicNewExpressionType "defining a new expression" from scratch.
\section NullaryExpr_Indexing Example 2: indexing rows and columns
The goal here is to mimic MATLAB's ability to index a matrix through two vectors of indices referencing the rows and columns to be picked respectively, like this:
\snippet nullary_indexing.out main1
To this end, let us first write a nullary-functor storing references to the input matrix and to the two arrays of indices, and implementing the required \c operator()(i,j):
\snippet nullary_indexing.cpp functor
Then, let's create an \c indexing(A,rows,cols) function creating the nullary expression:
\snippet nullary_indexing.cpp function
Finally, here is an example of how this function can be used:
\snippet nullary_indexing.cpp main1
This straightforward implementation is already quite powerful as the row or column index arrays can also be expressions to perform offsetting, modulo, striding, reverse, etc.
\snippet nullary_indexing.cpp main2
and the output is:
\snippet nullary_indexing.out main2
*/
}

View File

@@ -14,3 +14,8 @@ foreach(example_src ${examples_SRCS})
)
add_dependencies(all_examples ${example})
endforeach(example_src)
check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11)
if(EIGEN_COMPILER_SUPPORT_CPP11)
ei_add_target_property(nullary_indexing COMPILE_FLAGS "-std=c++11")
endif()

View File

@@ -0,0 +1,66 @@
#include <Eigen/Core>
#include <iostream>
using namespace Eigen;
// [functor]
template<class ArgType, class RowIndexType, class ColIndexType>
class indexing_functor {
const ArgType &m_arg;
const RowIndexType &m_rowIndices;
const ColIndexType &m_colIndices;
public:
typedef Matrix<typename ArgType::Scalar,
RowIndexType::SizeAtCompileTime,
ColIndexType::SizeAtCompileTime,
ArgType::Flags&RowMajorBit?RowMajor:ColMajor,
RowIndexType::MaxSizeAtCompileTime,
ColIndexType::MaxSizeAtCompileTime> MatrixType;
indexing_functor(const ArgType& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
: m_arg(arg), m_rowIndices(row_indices), m_colIndices(col_indices)
{}
const typename ArgType::Scalar& operator() (Index row, Index col) const {
return m_arg(m_rowIndices[row], m_colIndices[col]);
}
};
// [functor]
// [function]
template <class ArgType, class RowIndexType, class ColIndexType>
CwiseNullaryOp<indexing_functor<ArgType,RowIndexType,ColIndexType>, typename indexing_functor<ArgType,RowIndexType,ColIndexType>::MatrixType>
indexing(const Eigen::MatrixBase<ArgType>& arg, const RowIndexType& row_indices, const ColIndexType& col_indices)
{
typedef indexing_functor<ArgType,RowIndexType,ColIndexType> Func;
typedef typename Func::MatrixType MatrixType;
return MatrixType::NullaryExpr(row_indices.size(), col_indices.size(), Func(arg.derived(), row_indices, col_indices));
}
// [function]
int main()
{
std::cout << "[main1]\n";
Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4);
Array3i ri(1,2,1);
ArrayXi ci(6); ci << 3,2,1,0,0,2;
Eigen::MatrixXi B = indexing(A, ri, ci);
std::cout << "A =" << std::endl;
std::cout << A << std::endl << std::endl;
std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl;
std::cout << B << std::endl;
std::cout << "[main1]\n";
std::cout << "[main2]\n";
B = indexing(A, ri+1, ci);
std::cout << "A(ri+1,ci) =" << std::endl;
std::cout << B << std::endl << std::endl;
#if __cplusplus >= 201103L
B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3));
std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl;
std::cout << B << std::endl << std::endl;
#endif
std::cout << "[main2]\n";
}

View File

@@ -417,6 +417,7 @@ void cholesky_faillure_cases()
VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
VERIFY(ldlt.info()==NumericalIssue);
}
#if (!EIGEN_ARCH_i386) || defined(EIGEN_VECTORIZE_SSE2)
{
mat.resize(3,3);
mat << -1, -3, 3,
@@ -426,6 +427,7 @@ void cholesky_faillure_cases()
VERIFY(ldlt.info()==NumericalIssue);
VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
}
#endif
{
mat.resize(3,3);
mat << 1, 2, 3,
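The new guard presumably accounts for the x87 FPU on i386 builds without SSE2: its 80-bit excess precision changes the rounding of the pivot computations enough that this particular matrix no longer exercises the failure path.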

View File

@@ -49,7 +49,8 @@ void check_inf_nan(bool dryrun) {
VERIFY( !m.allFinite() );
VERIFY( m.hasNaN() );
}
m(4) /= T(0.0);
T hidden_zero = (std::numeric_limits<T>::min)()*(std::numeric_limits<T>::min)();
m(4) /= hidden_zero;
if(dryrun)
{
std::cout << "std::isfinite(" << m(4) << ") = "; check((std::isfinite)(m(4)),false); std::cout << " ; numext::isfinite = "; check((numext::isfinite)(m(4)), false); std::cout << "\n";

View File

@@ -365,6 +365,7 @@ template<typename Scalar> void packetmath_real()
}
if (PacketTraits::HasTanh) {
// NOTE this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
packet_helper<internal::packet_traits<Scalar>::HasTanh,Packet> h;
h.store(data2, internal::ptanh(h.load(data1)));

View File

@@ -213,7 +213,8 @@ void test_product_small()
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( product(Matrix<float, 3, 2>()) );
CALL_SUBTEST_2( product(Matrix<int, 3, 5>()) );
CALL_SUBTEST_2( product(Matrix<int, 3, 17>()) );
CALL_SUBTEST_8( product(Matrix<double, 3, 17>()) );
CALL_SUBTEST_3( product(Matrix3d()) );
CALL_SUBTEST_4( product(Matrix4d()) );
CALL_SUBTEST_5( product(Matrix4f()) );

View File

@@ -14,6 +14,8 @@ template<>
Array4f four_denorms() { return Array4f(5.60844e-39f, -5.60844e-39f, 4.94e-44f, -4.94e-44f); }
template<>
Array4d four_denorms() { return Array4d(5.60844e-313, -5.60844e-313, 4.94e-324, -4.94e-324); }
template<typename T>
Array<T,4,1> four_denorms() { return four_denorms<double>().cast<T>(); }
template<typename MatrixType>
void svd_fill_random(MatrixType &m, int Option = 0)
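The added generic four_denorms<T>() overload presumably lets other scalar types (half, for instance) reuse the double values via cast<T>(), while the exact float and double specializations above keep their hand-picked bit patterns.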

View File

@@ -168,39 +168,20 @@ struct GpuDevice {
return stream_->stream();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE void* scratchpad() const {
return stream_->scratchpad();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
return stream_->semaphore();
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
@@ -210,30 +191,22 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
@@ -242,21 +215,21 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices.
return firstLevelCacheSize();
@@ -276,56 +249,26 @@ struct GpuDevice {
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
#ifndef __CUDA_ARCH__
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
return 0;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
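The net effect of this hunk: the host-only GpuDevice methods drop their EIGEN_DEVICE_FUNC annotation together with the per-method #ifndef __CUDA_ARCH__ guards, so an accidental call from device code now fails at compile time instead of tripping a run-time eigen_assert.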

View File

@@ -189,7 +189,12 @@ namespace Eigen
res[0] = atan2(mat(J,K), mat(K,K));
Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
res[1] = atan2(-mat(I,K), -c2);
}
else
@@ -212,7 +217,12 @@ namespace Eigen
res[0] = atan2(mat(J,I), mat(K,I));
if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
{
res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
if(res[0] > Scalar(0)) {
res[0] -= Scalar(EIGEN_PI);
}
else {
res[0] += Scalar(EIGEN_PI);
}
Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
res[1] = -atan2(s2, mat(I,I));
}

View File

@@ -226,6 +226,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cxx11_tensor_complex_cuda)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
ei_add_test(cxx11_tensor_reduction_cuda)
ei_add_test(cxx11_tensor_argmax_cuda)
ei_add_test(cxx11_tensor_cast_float16_cuda)

View File

@@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
#define EIGEN_USE_GPU
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename T>
void test_cuda_complex_cwise_ops() {
const int kNumItems = 2;
std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
std::complex<T>* d_in1;
std::complex<T>* d_in2;
std::complex<T>* d_out;
cudaMalloc((void**)(&d_in1), complex_bytes);
cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes);
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
d_in1, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(
d_in2, kNumItems);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(
d_out, kNumItems);
const std::complex<T> a(3.14f, 2.7f);
const std::complex<T> b(-10.6f, 1.4f);
gpu_in1.device(gpu_device) = gpu_in1.constant(a);
gpu_in2.device(gpu_device) = gpu_in2.constant(b);
enum CwiseOp {
Add = 0,
Sub,
Mul,
Div
};
Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
for (int op = Add; op <= Div; op++) {
std::complex<T> expected;
switch (static_cast<CwiseOp>(op)) {
case Add:
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
expected = a + b;
break;
case Sub:
gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
expected = a - b;
break;
case Mul:
gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
expected = a * b;
break;
case Div:
gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
expected = a / b;
break;
}
assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < kNumItems; ++i) {
VERIFY_IS_APPROX(actual(i), expected);
}
}
cudaFree(d_in1);
cudaFree(d_in2);
cudaFree(d_out);
}
void test_cxx11_tensor_complex_cwise_ops()
{
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
}