From c0f2cb016e60b7dbde1d5946f42234a709a711f9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 28 Apr 2014 10:32:27 -0700 Subject: [PATCH 001/214] Extended support for Tensors: * Added ability to map a region of the memory to a tensor * Added basic support for unary and binary coefficient wise expressions, such as addition or square root * Provided an emulation layer to make it possible to compile the code with compilers (such as nvcc) that don't support cxx11. --- Eigen/src/Core/util/Macros.h | 5 + unsupported/Eigen/CXX11/Core | 14 +- unsupported/Eigen/CXX11/Tensor | 27 ++- .../Eigen/CXX11/src/Core/util/CXX11Meta.h | 24 +-- .../CXX11/src/Core/util/CXX11Workarounds.h | 16 +- .../CXX11/src/Core/util/EmulateCXX11Meta.h | 184 ++++++++++++++++++ unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 156 ++++++++------- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 52 +++++ .../Eigen/CXX11/src/Tensor/TensorBase.h | 82 ++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 127 ++++++++++++ .../Eigen/CXX11/src/Tensor/TensorExpr.h | 161 +++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 27 +++ .../Eigen/CXX11/src/Tensor/TensorMap.h | 101 ++++++++++ .../Eigen/CXX11/src/Tensor/TensorStorage.h | 52 ++--- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 122 ++++++++++++ unsupported/test/CMakeLists.txt | 5 +- unsupported/test/cxx11_tensor_simple.cpp | 2 +- 17 files changed, 1028 insertions(+), 129 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bfd6ba7de..3a928001e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -121,6 +121,11 @@ #define EIGEN_HAVE_RVALUE_REFERENCES #endif +// Does the compiler support variadic templates? +#if __cplusplus > 199711L +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. * They currently include: diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index 4dc4ab224..bba3d578d 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014 Benoit Steiner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -21,20 +22,23 @@ * module. Note that at this stage, you should not need to include * this module directly. * + * It also provides a limited fallback for compilers that don't support + * CXX11 yet, such as nvcc. + * * \code * #include * \endcode */ -#include - +// Emulate the cxx11 functionality that we need if the compiler doesn't support it. 
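To make the features listed in the commit message concrete, here is a minimal usage sketch written against the signatures this patch adds (the buffer name and sizes are illustrative). Note that, per the FIXME in `Tensor::operator=`, assignment does not resize the destination yet, so the result tensors are constructed with the right dimensions up front.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Map a region of caller-owned memory as a rank-2 tensor; no copy is made.
  float storage[6] = {1.f, 4.f, 9.f, 16.f, 25.f, 36.f};
  Eigen::TensorMap<Eigen::Tensor<float, 2> > input(storage, 2, 3);

  // Coefficient-wise expressions are built lazily and evaluated on
  // assignment via internal::TensorAssign.
  Eigen::Tensor<float, 2> sum(2, 3);
  sum = input + input;        // binary coefficient-wise op (addition)

  Eigen::Tensor<float, 2> roots(2, 3);
  roots = input.cwiseSqrt();  // unary coefficient-wise op (square root)
  return 0;
}
```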
+#if __cplusplus <= 199711L +#include "src/Core/util/EmulateCXX11Meta.h" +#else #include "src/Core/util/CXX11Workarounds.h" #include "src/Core/util/CXX11Meta.h" +#endif #include #endif // EIGEN_CXX11_CORE_MODULE -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f2c5129b3..f554c204a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -10,9 +10,10 @@ #ifndef EIGEN_CXX11_TENSOR_MODULE #define EIGEN_CXX11_TENSOR_MODULE -#include +#include "Eigen/src/Core/util/StaticAssert.h" +#include "unsupported/Eigen/CXX11/Core" -#include +#include "Eigen/src/Core/util/DisableStupidWarnings.h" /** \defgroup CXX11_Tensor_Module Tensor Module * @@ -27,13 +28,21 @@ #include #include -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" +#include "Eigen/Core" -#include +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_TENSOR_MODULE - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 618e2eb7b..47f06b1b5 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -317,7 +317,7 @@ constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts template constexpr inline Array h_array_reverse(Array arr, numeric_list) { - return {{std_array_get(arr)...}}; + return {{array_get(arr)...}}; } template @@ -335,9 +335,9 @@ constexpr inline std::array array_reverse(std::array arr) // an infinite loop) template struct h_array_reduce { - constexpr static inline auto run(std::array arr) -> decltype(Reducer::run(h_array_reduce::run(arr), std_array_get(arr))) + constexpr static inline auto run(std::array arr) -> decltype(Reducer::run(h_array_reduce::run(arr), array_get(arr))) { - return Reducer::run(h_array_reduce::run(arr), std_array_get(arr)); + return Reducer::run(h_array_reduce::run(arr), array_get(arr)); } }; @@ -346,7 +346,7 @@ struct h_array_reduce { constexpr static inline T run(std::array arr) { - return std_array_get<0>(arr); + return array_get<0>(arr); } }; @@ -375,7 +375,7 @@ constexpr inline auto array_prod(std::array arr) -> decltype(array_reduce< template constexpr inline std::array h_array_zip(std::array a, std::array b, numeric_list) { - return std::array{{ Op::run(std_array_get(a), std_array_get(b))... }}; + return std::array{{ Op::run(array_get(a), array_get(b))... 
}}; } template @@ -387,9 +387,9 @@ constexpr inline std::array array_zip(std::array< /* zip an array and reduce the result */ template -constexpr inline auto h_array_zip_and_reduce(std::array a, std::array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(std_array_get(a), std_array_get(b))...)) +constexpr inline auto h_array_zip_and_reduce(std::array a, std::array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) { - return reduce::type...>::run(Op::run(std_array_get(a), std_array_get(b))...); + return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); } template @@ -403,7 +403,7 @@ constexpr inline auto array_zip_and_reduce(std::array a, std::array template constexpr inline std::array h_array_apply(std::array a, numeric_list) { - return std::array{{ Op::run(std_array_get(a))... }}; + return std::array{{ Op::run(array_get(a))... }}; } template @@ -415,9 +415,9 @@ constexpr inline std::array array_apply(std::array -constexpr inline auto h_array_apply_and_reduce(std::array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(std_array_get(arr))...)) +constexpr inline auto h_array_apply_and_reduce(std::array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) { - return reduce::type...>::run(Op::run(std_array_get(arr))...); + return reduce::type...>::run(Op::run(array_get(arr))...); } template @@ -497,7 +497,3 @@ InstType instantiate_by_c_array(ArrType* arr) } // end namespace Eigen #endif // EIGEN_CXX11META_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 356ae10cf..77207f453 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -40,8 +40,18 @@ #error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) #endif +using std::array; + namespace Eigen { +// Use std::array as Eigen array +/*template +struct array : public std::array { + array() = default; + array(const std::initializer_list& a);// : std::array(a) {}; + array(const std::array& a); +};*/ + namespace internal { /* std::get is only constexpr in C++14, not yet in C++11 @@ -60,9 +70,9 @@ namespace internal { #define STD_GET_ARR_HACK std::template get(a) #endif -template constexpr inline T& std_array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } -template constexpr inline T&& std_array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } -template constexpr inline T const& std_array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } #undef STD_GET_ARR_HACK diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h new file mode 100644 index 000000000..76fcba5b4 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -0,0 +1,184 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_CXX11_META_H +#define EIGEN_EMULATE_CXX11_META_H + + +namespace Eigen { + +// The array class is only available starting with cxx11. Emulate our own here +// if needed +template class array { + public: + T& operator[] (size_t index) { return values[index]; } + const T& operator[] (size_t index) const { return values[index]; } + + T values[n]; +}; + + +namespace internal { + +/** \internal + * \file CXX11/Core/util/EmulateCXX11Meta.h + * This file emulates a subset of the functionality provided by CXXMeta.h for + * compilers that don't yet support cxx11 such as nvcc. + */ + +struct empty_list { static const std::size_t count = 0; }; + +template struct type_list { + T head; + Tail tail; + static const std::size_t count = 1 + Tail::count; +}; + +struct null_type { }; + +template +struct make_type_list { + typedef typename make_type_list::type tailresult; + + typedef type_list type; +}; + +template<> struct make_type_list<> { + typedef empty_list type; +}; + + + +template +struct type2val { + static const T value = n; +}; + + +template struct gen_numeric_list_repeated; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; +}; + + + +template +array repeat(t v) { + array array; + array.fill(v); + return array; +} + +template +t array_prod(const array& a) { + t prod = 1; + for (size_t i = 0; i < n; ++i) { prod *= a[i]; } + return prod; +} +template +t array_prod(const array& /*a*/) { + return 0; +} + +template inline T& array_get(array& a) { + return a[I]; +} +template inline const T& array_get(const array& a) { + return a[I]; +} + +struct sum_op { + template static inline bool run(A a, B b) { return a + b; } +}; +struct product_op { + template static inline bool run(A a, B b) { return a * b; } +}; + +struct logical_and_op { + template static inline bool run(A a, B b) { return a && b; } +}; +struct logical_or_op { + template static inline bool run(A a, B b) { return a || b; } +}; + +struct equal_op { + template static inline bool run(A a, B b) { return a == b; } +}; +struct not_equal_op { + template static inline bool run(A a, B b) { return a != b; } +}; +struct lesser_op { + template static inline bool run(A a, B b) { return a < b; } +}; +struct lesser_equal_op { + template static inline bool run(A a, B b) { return a <= b; } +}; + +struct greater_op { + template static inline bool run(A a, B b) { return a > b; } +}; +struct greater_equal_op { + template static inline bool run(A a, B b) { return a >= b; } +}; + +struct not_op { + template static inline bool run(A a) { return !a; } +}; +struct negation_op { + template static inline bool run(A a) { return -a; } +}; +struct greater_equal_zero_op { + template static inline bool run(A a) { return a >= 0; } +}; + + +template +inline bool 
array_apply_and_reduce(const array& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; +} + +template +inline bool array_zip_and_reduce(const array& a, const array& b) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) + bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i], b[i])); + } + return result; +} + +} // end namespace internal + +} // end namespace Eigen + + + +#endif // EIGEN_EMULATE_CXX11_META_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index c6216e14c..7b8f14c6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -57,28 +57,16 @@ namespace Eigen { * * \ref TopicStorageOrders */ -template -class Tensor; namespace internal { -template -struct traits> -{ - typedef Scalar_ Scalar; - typedef Dense StorageKind; - typedef DenseIndex Index; - enum { - Options = Options_ - }; -}; template struct tensor_index_linearization_helper { - constexpr static inline Index run(std::array const& indices, std::array const& dimensions) + static inline Index run(array const& indices, array const& dimensions) { - return std_array_get(indices) + - std_array_get(dimensions) * + return array_get(indices) + + array_get(dimensions) * tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -86,39 +74,40 @@ struct tensor_index_linearization_helper template struct tensor_index_linearization_helper { - constexpr static inline Index run(std::array const& indices, std::array const&) + static inline Index run(array const& indices, array const&) { - return std_array_get(indices); + return array_get(indices); } }; /* Forward-declaration required for the symmetry support. 
*/ template class tensor_symmetry_value_setter; + } // end namespace internal template -class Tensor +class Tensor : public TensorBase > { - static_assert(NumIndices_ >= 1, "A tensor must have at least one index."); - public: typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; - typedef typename internal::traits::Scalar Scalar; + typedef Scalar_ Scalar; typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; - typedef Self DenseType; + typedef typename Base::CoeffReturnType CoeffReturnType; - constexpr static int Options = Options_; - constexpr static std::size_t NumIndices = NumIndices_; + static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; protected: TensorStorage m_storage; public: EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE std::array dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } @@ -129,29 +118,17 @@ class Tensor inline Self& base() { return *this; } inline const Self& base() const { return *this; } - void setZero() - { - // FIXME: until we have implemented packet access and the - // expression engine w.r.t. nullary ops, use this - // as a kludge. Only works with POD types, but for - // any standard usage, this shouldn't be a problem - memset((void *)data(), 0, size() * sizeof(Scalar)); - } - - inline Self& operator=(Self const& other) - { - m_storage = other.m_storage; - return *this; - } - +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeff(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& coeff(const std::array& indices) const + inline const Scalar& coeff(const array& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; @@ -163,14 +140,17 @@ class Tensor return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return coeffRef(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
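As a usage sketch of the two access paths (assuming the `array` alias this series sets up for both C++11 and emulation builds): the variadic overloads are compiled only when `EIGEN_HAS_VARIADIC_TEMPLATES` is defined, while the `array`-based overloads also work under nvcc.

```cpp
Eigen::Tensor<float, 3> t(2, 3, 7);

// C++11 path: the variadic overload statically checks that exactly
// NumIndices indices are supplied.
t(1, 2, 3) = 42.0f;

// Portable path: indices packed into an array<Index, NumIndices>.
Eigen::array<Eigen::DenseIndex, 3> idx;
idx[0] = 1; idx[1] = 2; idx[2] = 3;
const float v = t(idx);  // reads the same coefficient as t(1, 2, 3)
```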
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& coeffRef(const std::array& indices) + inline Scalar& coeffRef(const array& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; @@ -182,14 +162,17 @@ class Tensor return m_storage.data()[index]; } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return this->operator()(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline const Scalar& operator()(const std::array& indices) const + inline const Scalar& operator()(const array& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); @@ -203,18 +186,22 @@ class Tensor inline const Scalar& operator[](Index index) const { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); return coeff(index); } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - return operator()(std::array{{firstIndex, secondIndex, otherIndices...}}); + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#endif - inline Scalar& operator()(const std::array& indices) + inline Scalar& operator()(const array& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); @@ -228,47 +215,70 @@ class Tensor inline Scalar& operator[](Index index) { - static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead."); + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) return coeffRef(index); } - inline Tensor() + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() : m_storage() { } - inline Tensor(const Self& other) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) : m_storage(other.m_storage) { } - inline Tensor(Self&& other) - : m_storage(other.m_storage) - { - } +#ifdef EIGEN_HAVE_RVALUE_REFERENCES +// inline Tensor(Self&& other) +// : m_storage(other.m_storage) +// { +// } +#endif +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... 
otherDimensions) : m_storage() { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor."); - resize(std::array{{firstDimension, otherDimensions...}}); + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); } +#endif - inline Tensor(std::array dimensions) + inline Tensor(const array& dimensions) : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) + { + // FIXME: we need to resize the tensor to fix the dimensions of the other. + // Unfortunately this isn't possible yet when the rhs is an expression. + // resize(other.dimensions()); + internal::TensorAssign::run(*this, other); + return *this; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template void resize(Index firstDimension, IndexTypes... otherDimensions) { - static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor."); - resize(std::array{{firstDimension, otherDimensions...}}); + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); } +#endif - void resize(const std::array& dimensions) + void resize(const array& dimensions) { std::size_t i; Index size = Index(1); @@ -285,20 +295,22 @@ class Tensor #endif } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES template internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, Index firstIndex, IndexTypes... otherIndices) { - return symCoeff(symmetry, std::array{{firstIndex, otherIndices...}}); + return symCoeff(symmetry, array{{firstIndex, otherIndices...}}); } template - internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, std::array const& indices) + internal::tensor_symmetry_value_setter symCoeff(const Symmetry_& symmetry, array const& indices) { return internal::tensor_symmetry_value_setter(*this, symmetry, indices); } +#endif protected: - bool checkIndexRange(const std::array& indices) const + bool checkIndexRange(const array& indices) const { using internal::array_apply_and_reduce; using internal::array_zip_and_reduce; @@ -313,7 +325,7 @@ class Tensor array_zip_and_reduce(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const std::array& indices) const + inline Index linearizedIndex(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, m_storage.dimensions()); } @@ -322,7 +334,3 @@ class Tensor } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_H - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 000000000..f1df827f9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,52 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class is responsible for triggering the evaluation of the expressions + * used on the lhs and rhs of an assignment operator and copy the result of + * the evaluation of the rhs expression at the address computed during the + * evaluation lhs expression. + * + * TODO: vectorization. For now the code only uses scalars + * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + */ +namespace internal { + +template +struct TensorAssign +{ + typedef typename Derived1::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(Derived1& dst, const Derived2& src) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + for(Index i = 0; i < size; ++i) { + evalDst.coeffRef(i) = evalSrc.coeff(i); + } + } +}; + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 000000000..0b9f32f7f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangably in expressions. + */ + +template +class TensorBase +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + + Derived& setZero() { + return setConstant(Scalar(0)); + } + + Derived& setConstant(const Scalar& val) { + Scalar* data = derived().data(); + for (int i = 0; i < derived().size(); ++i) { + data[i] = val; + } + return derived(); + } + + Derived& setRandom() { + Scalar* data = derived().data(); + for (int i = 0; i < derived().size(); ++i) { + data[i] = internal::random_default_impl::run(); + } + return derived(); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwiseSqrt() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwiseAbs() const { return derived(); } + + // Coefficient-wise binary operators. 
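None of the operators above (nor the `operator+` that follows) computes anything by itself; each returns a lightweight expression node that is evaluated on assignment. A sketch of the intended behavior (using C++11 `auto` for brevity):

```cpp
Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
a.setRandom();
b.setConstant(1.0f);

// 'expr' has type TensorCwiseBinaryOp<internal::scalar_sum_op<float>,
// const Tensor..., const Tensor...> and merely stores references to 'a'
// and 'b'; no loop has run at this point.
auto expr = a + b;

// Assignment triggers internal::TensorAssign, which wraps each side in a
// TensorEvaluator and copies the result coefficient by coefficient.
Eigen::Tensor<float, 2> c(2, 3);
c = expr;
```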
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 000000000..f4f10eff5 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,127 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + * TODO: add support for vectorization + */ + + +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar& CoeffReturnType; + //typedef typename Derived::PacketScalar PacketScalar; + typedef TensorEvaluator nestedType; + + TensorEvaluator(Derived& m) + : m_data(const_cast(m.data())) + { } + + CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + Scalar& coeffRef(Index index) { + return m_data[index]; + } + + // to do: vectorized evaluation. 
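For reference, the scalar path mentioned in the TODO above is driven by `internal::TensorAssign` from earlier in this patch; a condensed sketch of that loop with the template arguments spelled out (`assign_sketch` is an illustrative name, not part of the patch):

```cpp
template <typename Derived1, typename Derived2>
void assign_sketch(Derived1& dst, const Derived2& src) {
  Eigen::TensorEvaluator<Derived1> evalDst(dst);
  Eigen::TensorEvaluator<Derived2> evalSrc(src);
  // One coefficient at a time until packet (SIMD) access is implemented.
  for (typename Derived1::Index i = 0; i < dst.size(); ++i) {
    evalDst.coeffRef(i) = evalSrc.coeff(i);
  }
}
```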
+ /* template + PacketReturnType packet(Index index) const + { + return ploadt(m_data + index); + } + + template + void writePacket(Index index, const PacketScalar& x) + { + return pstoret(const_cast(m_data) + index, x); + }*/ + + protected: + Scalar* m_data; +}; + + + + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorCwiseUnaryOp XprType; + typedef TensorEvaluator nestedType; + + TensorEvaluator(const XprType& op) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + private: + const UnaryOp m_functor; + typename TensorEvaluator::nestedType m_argImpl; +}; + + +// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator > +{ + typedef TensorCwiseBinaryOp XprType; + typedef TensorEvaluator leftType; + typedef TensorEvaluator rightType; + + TensorEvaluator(const XprType& op) + : m_functor(op.functor()), + m_leftImpl(op.lhsExpression()), + m_rightImpl(op.rhsExpression()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + + private: + const BinaryOp m_functor; + typename TensorEvaluator::nestedType m_leftImpl; + typename TensorEvaluator::nestedType m_rightImpl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 000000000..5a45cec31 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,161 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary operator + * (e.g. addition) is applied to a lhs and a rhs expression. 
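As an illustration of what such a node contains (the typedef names here are illustrative, not part of the patch): `a.cwiseSqrt()` simply bundles a functor with a reference to its operand, which is why building expressions is essentially free.

```cpp
typedef Eigen::Tensor<float, 2> Tensor2f;
typedef Eigen::TensorCwiseUnaryOp<Eigen::internal::scalar_sqrt_op<float>,
                                  const Tensor2f> SqrtExpr;

Tensor2f a(2, 3);
a.setConstant(4.0f);

// Equivalent to 'a.cwiseSqrt()': stores the functor and a reference to
// 'a'; no square roots are computed until the expression is assigned.
SqrtExpr expr(a);
```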
+ * + */ + +namespace internal { +template +struct traits > + : traits +{ + typedef typename result_of< + UnaryOp(typename XprType::Scalar) + >::type Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseUnaryOp +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + inline TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + nestedExpression() { return m_xpr.const_cast_derived(); } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename result_of< + BinaryOp( + typename LhsXprType::Scalar, + typename RhsXprType::Scalar + ) + >::type Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseBinaryOp +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + inline TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 000000000..dc97764f0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,27 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +template class Tensor; +template class TensorMap; +template class TensorBase; + +template class TensorCwiseUnaryOp; +template class TensorCwiseBinaryOp; + +// Move to internal? +template struct TensorEvaluator; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 000000000..7dec1e08d --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,101 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +template class Stride; + + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ + +template class TensorMap : public TensorBase > +{ + public: + typedef TensorMap Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + /* typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType;*/ + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions({{firstDimension}}) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions({{firstDimension, otherDimensions...}}) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
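A usage sketch for this class (assuming a C++11 compiler for the variadic constructor): a TensorMap never allocates or frees memory, it only aliases the buffer it is handed, so writes go straight through to the caller's storage.

```cpp
float buffer[12];  // caller-owned storage, e.g. produced by another library

// View the buffer as a 3x4 tensor; only the pointer and the dimensions
// are stored.
Eigen::TensorMap<Eigen::Tensor<float, 2> > view(buffer, 3, 4);
view.setConstant(1.0f);  // writes directly into 'buffer'

// A TensorMap participates in tensor expressions like any other tensor.
Eigen::Tensor<float, 2> doubled(3, 4);
doubled = view + view;
```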
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_dimensions); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const Index index = internal::tensor_index_linearization_helper::run(array{{firstIndex, otherIndices...}}, m_dimensions); + return m_data[index]; + } +#endif + + template + EIGEN_DEVICE_FUNC + Self& operator=(const OtherDerived& other) + { + internal::TensorAssign::run(*this, other); + return *this; + } + + private: + typename PlainObjectType::Scalar* m_data; + array m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index a34600ee6..503d7cfd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -37,14 +37,19 @@ template class TensorStorage : public TensorStorage::type> { - typedef TensorStorage::type> Base_; + typedef TensorStorage::type> Base_; + public: - TensorStorage() = default; - TensorStorage(const TensorStorage&) = default; - TensorStorage(TensorStorage&&) = default; + TensorStorage() { } + TensorStorage(const TensorStorage& other) : Base_(other) { } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES +// TensorStorage(TensorStorage&&) = default; +#endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} - TensorStorage(DenseIndex size, const std::array& dimensions) : Base_(size, dimensions) {} - TensorStorage& operator=(const TensorStorage&) = default; + TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} + + // TensorStorage& operator=(const TensorStorage&) = default; }; // pure dynamic @@ -52,17 +57,17 @@ template class TensorStorage::type> { T *m_data; - std::array m_dimensions; + array m_dimensions; typedef TensorStorage::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions(internal::template repeat(0)) {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat(0)) {} - TensorStorage(DenseIndex size, const std::array& dimensions) - : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) - { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } - TensorStorage(const Self_& other) + TensorStorage(DenseIndex size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + TensorStorage(const Self_& other) : 
m_data(internal::conditional_aligned_new_auto(internal::array_prod(other.m_dimensions))) , m_dimensions(other.m_dimensions) { @@ -76,28 +81,34 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - std::array dimensions(void) const {return m_dimensions;} - void conservativeResize(DenseIndex size, const std::array& nbDimensions) + const array& dimensions() const {return m_dimensions;} + + void conservativeResize(DenseIndex size, const array& nbDimensions) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, internal::array_prod(m_dimensions)); m_dimensions = nbDimensions; } - void resize(DenseIndex size, const std::array& nbDimensions) + void resize(DenseIndex size, const array& nbDimensions) { if(size != internal::array_prod(m_dimensions)) { @@ -110,8 +121,9 @@ class TensorStorage +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + aligned_bit = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_ALIGN_STATICALLY + (!is_dynamic_size_storage) +#else + 0 +#endif + || +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + ) + ) ? AlignedBit : 0, + packet_access_bit = packet_traits::Vectorizable && aligned_bit ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit | aligned_bit}; +}; + + +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret, + }; +}; + + +template +struct traits > + : public traits +{ + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; +}; + + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options_>& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options_>& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorMap& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorMap& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const Tensor& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const Tensor& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorMap& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorMap& type; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 0a6c56c19..31583d3ca 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -93,7 +93,7 @@ ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(bdcsvd) -option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." OFF) +option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." 
ON) if(EIGEN_TEST_CXX11) # FIXME: add C++11 compiler switch in some portable way # (MSVC doesn't need any for example, so this will @@ -101,4 +101,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") + ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_expr "-std=c++0x") + ei_add_test(cxx11_tensor_map "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index ea512c9cc..1f76033ea 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -163,7 +163,7 @@ static void test_3d() VERIFY_IS_EQUAL((epsilon(0,2,1)), -1); VERIFY_IS_EQUAL((epsilon(1,0,2)), -1); - std::array dims{{2,3,4}}; + array dims{{2,3,4}}; Tensor t1(dims); Tensor t2(dims); From 0320f7e3a71406b9a03d1bab0d168fd76e63d457 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 May 2014 11:18:37 -0700 Subject: [PATCH 002/214] Added support for fixed sized tensors. Improved support for tensor expressions. --- unsupported/Eigen/CXX11/Tensor | 2 + .../Eigen/CXX11/src/Core/util/CXX11Meta.h | 2 +- .../CXX11/src/Core/util/CXX11Workarounds.h | 12 +- .../CXX11/src/Core/util/EmulateCXX11Meta.h | 95 ++++++- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 39 +-- .../Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 212 ++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 232 ++++++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorMap.h | 31 ++- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 46 +++- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 45 +++- unsupported/test/cxx11_tensor_assign.cpp | 195 +++++++++++++++ unsupported/test/cxx11_tensor_expr.cpp | 145 +++++++++++ unsupported/test/cxx11_tensor_fixed_size.cpp | 167 +++++++++++++ unsupported/test/cxx11_tensor_map.cpp | 142 +++++++++++ 18 files changed, 1319 insertions(+), 82 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h create mode 100644 unsupported/test/cxx11_tensor_assign.cpp create mode 100644 unsupported/test/cxx11_tensor_expr.cpp create mode 100644 unsupported/test/cxx11_tensor_fixed_size.cpp create mode 100644 unsupported/test/cxx11_tensor_map.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f554c204a..f2b18ef31 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -31,6 +31,7 @@ #include "Eigen/Core" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" @@ -41,6 +42,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 47f06b1b5..accaa94e7 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ 
b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -112,7 +112,7 @@ template struct get<0, type_lis template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static int value = a; }; +template struct get<0, numeric_list> { constexpr static T value = a; }; template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 77207f453..f102872ae 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -17,9 +17,6 @@ #error Intel Compiler only supports required C++ features since version 13.1. // note that most stuff in principle works with 13.0 but when combining // some features, at some point 13.0 will just fail with an internal assertion -#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1)) -// note that it _should_ work with 3.1 but it was only tested with 3.2 -#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1. #elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) // G++ < 4.6 by default will continue processing the source files - even if we use #error to make // it error out. For this reason, we use the pragma to make sure G++ aborts at the first error @@ -40,17 +37,10 @@ #error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) #endif -using std::array; - namespace Eigen { // Use std::array as Eigen array -/*template -struct array : public std::array { - array() = default; - array(const std::initializer_list& a);// : std::array(a) {}; - array(const std::array& a); -};*/ +template using array = std::array; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 76fcba5b4..ab869177c 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -11,16 +11,63 @@ #define EIGEN_EMULATE_CXX11_META_H + namespace Eigen { // The array class is only available starting with cxx11. 
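This emulated class is paired with the template alias added to CXX11Workarounds.h in this patch (`template <typename T, std::size_t n> using array = std::array<T, n>;`), so client code can be written once against `Eigen::array` and get either implementation; a sketch:

```cpp
// Resolves to std::array<DenseIndex, 3> on a C++11 compiler, and to the
// emulated array class below under nvcc.
Eigen::array<Eigen::DenseIndex, 3> dims;
dims[0] = 2; dims[1] = 3; dims[2] = 7;

// Either flavor can be passed to the array-based Tensor constructor.
Eigen::Tensor<float, 3> t(dims);
```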
Emulate our own here // if needed template class array { public: - T& operator[] (size_t index) { return values[index]; } - const T& operator[] (size_t index) const { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + array(std::initializer_list l) { + std::copy(l.begin(), l.end(), values); + } +#endif }; @@ -35,8 +82,10 @@ namespace internal { struct empty_list { static const std::size_t count = 0; }; template struct type_list { - T head; - Tail tail; + typedef T HeadType; + typedef Tail TailType; + static const T head; + static const Tail tail; static const std::size_t count = 1 + Tail::count; }; @@ -54,9 +103,25 @@ template<> struct make_type_list<> { }; +template struct get_type; +template +struct get_type<0, type_list > +{ + typedef Head type; +}; + +template +struct get_type > +{ + typedef typename get_type::type type; +}; + + +/* numeric list */ template struct type2val { + typedef T type; static const T value = n; }; @@ -84,6 +149,28 @@ template struct gen_numeric_list_repeated { }; +template struct get; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + +template +struct get > +{ + typedef typename get::type type; + static const type value = get::value; +}; + +template struct arg_prod { + static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; +}; +template <> struct arg_prod { + static const int value = 1; +}; template array repeat(t v) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 7b8f14c6d..f5c027d1c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -60,26 +60,6 @@ namespace Eigen { namespace internal { -template -struct tensor_index_linearization_helper -{ - static inline Index run(array const& indices, array const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_index_linearization_helper -{ - static inline Index run(array const& indices, array const&) - { - return array_get(indices); - } -}; - /* Forward-declaration required for 
the symmetry support. */ template class tensor_symmetry_value_setter; @@ -102,13 +82,15 @@ class Tensor : public TensorBase > static const int Options = Options_; static const std::size_t NumIndices = NumIndices_; + typedef DSizes Dimensions; + protected: TensorStorage m_storage; public: EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); } + EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } @@ -232,13 +214,6 @@ class Tensor : public TensorBase > { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES -// inline Tensor(Self&& other) -// : m_storage(other.m_storage) -// { -// } -#endif - #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... otherDimensions) @@ -327,7 +302,11 @@ class Tensor : public TensorBase > inline Index linearizedIndex(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, m_storage.dimensions()); + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0b9f32f7f..9c7783aaf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -62,6 +62,20 @@ class TensorBase EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> cwiseAbs() const { return derived(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cwisePow(Scalar exponent) const { + return TensorCwiseUnaryOp, const Derived> + (derived(), internal::scalar_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator * (Scalar scale) const { + return TensorCwiseUnaryOp, const Derived> + (derived(), internal::scalar_multiple_op(scale)); + } + // Coefficient-wise binary operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 000000000..bd3bd5aca --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. 
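 * For example (an editorial sketch; the exact spellings below are
 * illustrative, not part of this patch):
 * \code
 * Sizes<2, 3> fixed;                                        // rank and extents in the type
 * DSizes<DenseIndex, 2> dyn(array<DenseIndex, 2>{{2, 3}});  // extents chosen at runtime
 * array<DenseIndex, 2> idx{{0, 2}};
 * dyn.IndexOfColMajor(idx);                                 // 0 + 2*2 == 4
 * dyn.IndexOfRowMajor(idx);                                 // 2 + 0*3 == 2
 * \endcode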
It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + + + +// Boiler plate code +namespace internal { + +template struct dget { + static const std::size_t value = internal::get::value; + }; + + +template +struct fixed_size_tensor_index_linearization_helper +{ + template + static inline Index run(array const& indices, + const Dimensions& dimensions) + { + return array_get(indices) + + dget::value * + fixed_size_tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper +{ + template + static inline Index run(array const& indices, + const Dimensions&) + { + return array_get(indices); + } +}; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct Sizes : internal::numeric_list { + typedef internal::numeric_list Base; + static const std::size_t total_size = internal::arg_prod(Indices...); + + static std::size_t TotalSize() { + return internal::arg_prod(Indices...); + } + + Sizes() { } + template + explicit Sizes(const array&/* indices*/) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + explicit Sizes(std::initializer_list/* l*/) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T&/* other*/) { + // add assertion failure if the size of other is different + return *this; + } + + template + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + template + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + +#else + +template +struct non_zero_size { + typedef internal::type2val type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template struct Sizes { + typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod::value; + + static const size_t TotalSize() { + return internal::arg_prod::value; + } + + Sizes() { } + template + explicit Sizes(const array& indices) { + // todo: add assertion + } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + explicit Sizes(std::initializer_list l) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T& other) { + // to do: check the size of other + return *this; + } + + template + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + } + template + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + } +}; + +#endif + +// Boiler plate +namespace internal { +template +struct tensor_index_linearization_helper +{ + static inline Index run(array const& indices, array const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper +{ + static inline Index run(array const& indices, array const&) + { + return array_get(indices); + 
} +}; +} // end namespace internal + + + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + + size_t TotalSize() const { + return internal::array_prod(*static_cast(this)); + } + + DSizes() { } +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + // explicit DSizes(std::initializer_list l) : Base(l) { } +#endif + explicit DSizes(const array& a) : Base(a) { } + + DSizes& operator = (const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + size_t IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + size_t IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f4f10eff5..b0dbca041 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -24,15 +24,12 @@ namespace Eigen { * TODO: add support for vectorization */ - template struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; typedef typename Derived::Scalar& CoeffReturnType; - //typedef typename Derived::PacketScalar PacketScalar; - typedef TensorEvaluator nestedType; TensorEvaluator(Derived& m) : m_data(const_cast(m.data())) @@ -72,7 +69,6 @@ template struct TensorEvaluator > { typedef TensorCwiseUnaryOp XprType; - typedef TensorEvaluator nestedType; TensorEvaluator(const XprType& op) : m_functor(op.functor()), @@ -89,7 +85,7 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - typename TensorEvaluator::nestedType m_argImpl; + TensorEvaluator m_argImpl; }; @@ -99,8 +95,6 @@ template struct TensorEvaluator > { typedef TensorCwiseBinaryOp XprType; - typedef TensorEvaluator leftType; - typedef TensorEvaluator rightType; TensorEvaluator(const XprType& op) : m_functor(op.functor()), @@ -118,8 +112,8 @@ struct TensorEvaluator::nestedType m_leftImpl; - typename TensorEvaluator::nestedType m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 5a45cec31..aa875dc31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -54,7 +54,7 @@ struct nested, 1, typename eval -class TensorCwiseUnaryOp +class TensorCwiseUnaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -75,11 +75,6 @@ class TensorCwiseUnaryOp const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - nestedExpression() { return m_xpr.const_cast_derived(); } - protected: typename XprType::Nested m_xpr; const UnaryOp m_functor; @@ -124,7 +119,7 @@ struct nested, 1, typename template -class TensorCwiseBinaryOp +class TensorCwiseBinaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 000000000..953880123 --- /dev/null +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixes sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > +{ + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_STRONG_INLINE array dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#ifdef EIGEN_HAVE_RVALUE_REFERENCES + inline TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) + { + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
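    // (Editorial sketch of what the call below does:) TensorAssign::run builds a
    // TensorEvaluator for each side and copies coefficients over the flat index
    // space, roughly:
    //   for (Index i = 0; i < dst.size(); ++i) evalDst.coeffRef(i) = evalSrc.coeff(i);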
+ internal::TensorAssign::run(*this, other); + return *this; + } + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index dc97764f0..e8a2125c4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -13,6 +13,7 @@ namespace Eigen { template class Tensor; +template class TensorFixedSize; template class TensorMap; template class TensorBase; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 7dec1e08d..bb0b39c5a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -43,24 +43,38 @@ template class TensorMap : public TensorBase({{firstDimension}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions({{firstDimension, otherDimensions...}}) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif + inline TensorMap(PointerArgType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_dimensions); } + EIGEN_STRONG_INLINE const typename PlainObjectType::Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_data; } EIGEN_DEVICE_FUNC @@ -78,8 +92,13 @@ template class TensorMap : public TensorBase::run(array{{firstIndex, otherIndices...}}, m_dimensions); - return m_data[index]; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } } #endif @@ -93,7 +112,7 @@ template class TensorMap : public TensorBase m_dimensions; + typename PlainObjectType::Dimensions m_dimensions; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 503d7cfd6..efcb39559 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -32,6 +32,35 @@ namespace Eigen { */ template class TensorStorage; + +// Pure fixed-size storage +template +class TensorStorage +{ + private: + T m_data[Size]; + FixedDimensions m_dimensions; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } +}; + + + // pure-dynamic, but without specification of all dimensions explicitly template class TensorStorage @@ -44,7 +73,7 @@ class TensorStorage TensorStorage(const TensorStorage& other) : Base_(other) { } #ifdef EIGEN_HAVE_RVALUE_REFERENCES -// TensorStorage(TensorStorage&&) = default; + // TensorStorage(TensorStorage&&) = default; #endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} @@ -57,11 +86,11 @@ template class TensorStorage::type> { T *m_data; - array m_dimensions; + DSizes m_dimensions; typedef TensorStorage::type> Self_; public: - TensorStorage() : m_data(0), m_dimensions() {} + TensorStorage() : m_data(0), m_dimensions() {} TensorStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_dimensions(internal::template repeat(0)) {} TensorStorage(DenseIndex size, const array& dimensions) @@ -83,25 +112,25 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - const 
array& dimensions() const {return m_dimensions;} + const DSizes& dimensions() const {return m_dimensions;} void conservativeResize(DenseIndex size, const array& nbDimensions) { @@ -124,9 +153,10 @@ class TensorStorage > }; +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef DenseIndex Index; +}; + + template struct traits > : public traits @@ -68,16 +77,28 @@ struct traits > }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options_>& type; + typedef const Tensor<_Scalar, NumIndices_, Options>& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const Tensor<_Scalar, NumIndices_, Options_>& type; + typedef const Tensor<_Scalar, NumIndices_, Options>& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorFixedSize& type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorFixedSize& type; }; template @@ -104,6 +125,18 @@ struct nested, 1, typename eval& type; }; +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorFixedSize& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef const TensorFixedSize& type; +}; + template struct nested, 1, typename eval >::type> { diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp new file mode 100644 index 000000000..c88872950 --- /dev/null +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -0,0 +1,195 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1(6); + Tensor vec2(6); + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap> vec3(col_major, 6); + TensorMap> vec4(row_major, 6); + + vec3 = vec1; + vec4 = vec2; + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); + + vec1.setZero(); + vec2.setZero(); + vec1 = vec3; + vec2 = vec4; + + VERIFY_IS_EQUAL(vec1(0), 4); + VERIFY_IS_EQUAL(vec1(1), 8); + VERIFY_IS_EQUAL(vec1(2), 15); + VERIFY_IS_EQUAL(vec1(3), 16); + VERIFY_IS_EQUAL(vec1(4), 23); + VERIFY_IS_EQUAL(vec1(5), 42); + + VERIFY_IS_EQUAL(vec2(0), 0); + VERIFY_IS_EQUAL(vec2(1), 1); + VERIFY_IS_EQUAL(vec2(2), 2); + VERIFY_IS_EQUAL(vec2(3), 3); + VERIFY_IS_EQUAL(vec2(4), 4); + VERIFY_IS_EQUAL(vec2(5), 5); +} + +static void test_2d() +{ + Tensor mat1(2,3); + Tensor mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + int col_major[6]; + int row_major[6]; + memset(col_major, 0, 6*sizeof(int)); + memset(row_major, 0, 6*sizeof(int)); + TensorMap> mat3(row_major, 2, 3); + TensorMap> mat4(col_major, 2, 3); + + mat3 = mat1; + mat4 = mat2; + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + VERIFY_IS_EQUAL(mat1(0,0), 0); + VERIFY_IS_EQUAL(mat1(0,1), 1); + VERIFY_IS_EQUAL(mat1(0,2), 2); + VERIFY_IS_EQUAL(mat1(1,0), 3); + VERIFY_IS_EQUAL(mat1(1,1), 4); + VERIFY_IS_EQUAL(mat1(1,2), 5); + + VERIFY_IS_EQUAL(mat2(0,0), 0); + VERIFY_IS_EQUAL(mat2(0,1), 1); + VERIFY_IS_EQUAL(mat2(0,2), 2); + VERIFY_IS_EQUAL(mat2(1,0), 3); + VERIFY_IS_EQUAL(mat2(1,1), 4); + VERIFY_IS_EQUAL(mat2(1,2), 5); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + int col_major[2*3*7]; + int row_major[2*3*7]; + memset(col_major, 0, 2*3*7*sizeof(int)); + memset(row_major, 0, 2*3*7*sizeof(int)); + TensorMap> mat3(col_major, 2, 3, 7); + TensorMap> mat4(row_major, 2, 3, 7); + + mat3 = mat1; + mat4 = mat2; + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } + + mat1.setZero(); + mat2.setZero(); + mat1 = mat3; + mat2 = mat4; + + val = 0; + for (int i = 0; i < 2; ++i) { + for 
(int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat1(i,j,k), val); + VERIFY_IS_EQUAL(mat2(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_assign() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp new file mode 100644 index 000000000..e0124da8c --- /dev/null +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -0,0 +1,145 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1({6}); + Tensor vec2({6}); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap> vec3(data3, 6); + vec3 = vec1.cwiseSqrt(); + float data4[6]; + TensorMap> vec4(data4, 6); + vec4 = vec2.cwiseSqrt(); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap> mat1(data1, 2, 3); + float data2[6]; + TensorMap> mat2(data2, 2, 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + Tensor mat3(2,3); + Tensor mat4(2,3); + mat3 = mat1.cwiseAbs(); + mat4 = mat2.cwiseAbs(); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + Tensor mat3(2,3,7); + mat3 = mat1 + mat1; + Tensor mat4(2,3,7); + mat4 = mat2 * 3.14f; + Tensor mat5(2,3,7); + mat5 = mat1.cwiseSqrt().cwiseSqrt(); + Tensor mat6(2,3,7); + mat6 = mat2.cwiseSqrt() * 3.14f; + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int 
j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), val + val); + VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); + VERIFY_IS_APPROX(mat5(i,j,k), sqrtf(sqrtf(val))); + VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + val += 1.0; + } + } + } +} + + +void test_cxx11_tensor_expr() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp new file mode 100644 index 000000000..c1d74d881 --- /dev/null +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_1d() +{ + TensorFixedSize > vec1; + TensorFixedSize, RowMajor> vec2; + + VERIFY_IS_EQUAL((vec1.size()), 6); + // VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec1.dimension(0)), 6); + + vec1(0) = 4.0; vec2(0) = 0.0; + vec1(1) = 8.0; vec2(1) = 1.0; + vec1(2) = 15.0; vec2(2) = 2.0; + vec1(3) = 16.0; vec2(3) = 3.0; + vec1(4) = 23.0; vec2(4) = 4.0; + vec1(5) = 42.0; vec2(5) = 5.0; + + float data3[6]; + TensorMap > > vec3(data3, 6); + vec3 = vec1.cwiseSqrt(); + float data4[6]; + TensorMap, RowMajor> > vec4(data4, 6); + vec4 = vec2.cwiseSqrt(); + + VERIFY_IS_EQUAL((vec3.size()), 6); + // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); + // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); + + VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); + VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); + VERIFY_IS_APPROX(vec3(2), sqrtf(15.0)); + VERIFY_IS_APPROX(vec3(3), sqrtf(16.0)); + VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); + VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); + + VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); + VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); + VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); + VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); + VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); + VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + + vec3 = vec1 + vec2; + VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); + VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); + VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f); + VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f); + VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f); + VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f); +} + +static void test_2d() +{ + float data1[6]; + TensorMap >> mat1(data1,2,3); + float data2[6]; + TensorMap, RowMajor>> mat2(data2,2,3); + + VERIFY_IS_EQUAL((mat1.size()), 2*3); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + + mat1(0,0) = 0.0; + mat1(0,1) = 1.0; + mat1(0,2) = 2.0; + mat1(1,0) = 3.0; + mat1(1,1) = 4.0; + mat1(1,2) = 5.0; + + mat2(0,0) = -0.0; + mat2(0,1) = -1.0; + mat2(0,2) = -2.0; + mat2(1,0) = -3.0; + mat2(1,1) = -4.0; + mat2(1,2) = -5.0; + + TensorFixedSize> mat3; + TensorFixedSize, RowMajor> mat4; + mat3 = mat1.cwiseAbs(); + mat4 = mat2.cwiseAbs(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + + VERIFY_IS_APPROX(mat3(0,0), 0.0f); + VERIFY_IS_APPROX(mat3(0,1), 1.0f); + VERIFY_IS_APPROX(mat3(0,2), 2.0f); + VERIFY_IS_APPROX(mat3(1,0), 3.0f); + VERIFY_IS_APPROX(mat3(1,1), 4.0f); + VERIFY_IS_APPROX(mat3(1,2), 5.0f); + + 
VERIFY_IS_APPROX(mat4(0,0), 0.0f); + VERIFY_IS_APPROX(mat4(0,1), 1.0f); + VERIFY_IS_APPROX(mat4(0,2), 2.0f); + VERIFY_IS_APPROX(mat4(1,0), 3.0f); + VERIFY_IS_APPROX(mat4(1,1), 4.0f); + VERIFY_IS_APPROX(mat4(1,2), 5.0f); +} + +static void test_3d() +{ + TensorFixedSize > mat1; + TensorFixedSize, RowMajor> mat2; + + VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); + + float val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val += 1.0; + } + } + } + + TensorFixedSize > mat3; + mat3 = mat1.cwiseSqrt(); + TensorFixedSize, RowMajor> mat4; + mat4 = mat2.cwiseSqrt(); + + VERIFY_IS_EQUAL((mat3.size()), 2*3*7); + // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); + // VERIFY_IS_EQUAL((mat3.dimension(1)), 3); + // VERIFY_IS_EQUAL((mat3.dimension(2)), 7); + + + val = 0.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val)); + VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val)); + val += 1.0; + } + } + } +} + + +void test_cxx11_tensor_fixed_size() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp new file mode 100644 index 000000000..478c20306 --- /dev/null +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
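// (Editorial sketch of the pattern these tests exercise; `data` is a
// hypothetical user-owned buffer:)
//   float data[6] = {0, 1, 2, 3, 4, 5};
//   TensorMap<Tensor<float, 2>> m(data, 2, 3);  // a view, no copy is made
//   float x = m(1, 2);  // default col-major layout: reads data[1 + 2*2] == 5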
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_1d() +{ + Tensor vec1(6); + Tensor vec2(6); + + TensorMap> vec3(vec1.data(), 6); + TensorMap> vec4(vec2.data(), 6); + + vec1(0) = 4; vec2(0) = 0; + vec1(1) = 8; vec2(1) = 1; + vec1(2) = 15; vec2(2) = 2; + vec1(3) = 16; vec2(3) = 3; + vec1(4) = 23; vec2(4) = 4; + vec1(5) = 42; vec2(5) = 5; + + VERIFY_IS_EQUAL(vec1.size(), 6); + VERIFY_IS_EQUAL(vec1.dimension(0), 6); + + VERIFY_IS_EQUAL(vec3(0), 4); + VERIFY_IS_EQUAL(vec3(1), 8); + VERIFY_IS_EQUAL(vec3(2), 15); + VERIFY_IS_EQUAL(vec3(3), 16); + VERIFY_IS_EQUAL(vec3(4), 23); + VERIFY_IS_EQUAL(vec3(5), 42); + + VERIFY_IS_EQUAL(vec4(0), 0); + VERIFY_IS_EQUAL(vec4(1), 1); + VERIFY_IS_EQUAL(vec4(2), 2); + VERIFY_IS_EQUAL(vec4(3), 3); + VERIFY_IS_EQUAL(vec4(4), 4); + VERIFY_IS_EQUAL(vec4(5), 5); +} + +static void test_2d() +{ + Tensor mat1(2,3); + Tensor mat2(2,3); + + mat1(0,0) = 0; + mat1(0,1) = 1; + mat1(0,2) = 2; + mat1(1,0) = 3; + mat1(1,1) = 4; + mat1(1,2) = 5; + + mat2(0,0) = 0; + mat2(0,1) = 1; + mat2(0,2) = 2; + mat2(1,0) = 3; + mat2(1,1) = 4; + mat2(1,2) = 5; + + TensorMap> mat3(mat1.data(), 2, 3); + TensorMap> mat4(mat2.data(), 2, 3); + + VERIFY_IS_EQUAL(mat3.size(), 6); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + + VERIFY_IS_EQUAL(mat4.size(), 6); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + + VERIFY_IS_EQUAL(mat3(0,0), 0); + VERIFY_IS_EQUAL(mat3(0,1), 1); + VERIFY_IS_EQUAL(mat3(0,2), 2); + VERIFY_IS_EQUAL(mat3(1,0), 3); + VERIFY_IS_EQUAL(mat3(1,1), 4); + VERIFY_IS_EQUAL(mat3(1,2), 5); + + VERIFY_IS_EQUAL(mat4(0,0), 0); + VERIFY_IS_EQUAL(mat4(0,1), 1); + VERIFY_IS_EQUAL(mat4(0,2), 2); + VERIFY_IS_EQUAL(mat4(1,0), 3); + VERIFY_IS_EQUAL(mat4(1,1), 4); + VERIFY_IS_EQUAL(mat4(1,2), 5); +} + +static void test_3d() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + int val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + mat2(i,j,k) = val; + val++; + } + } + } + + TensorMap> mat3(mat1.data(), 2, 3, 7); + TensorMap> mat4(mat2.data(), 2, 3, 7); + + VERIFY_IS_EQUAL(mat3.size(), 2*3*7); + VERIFY_IS_EQUAL(mat3.dimension(0), 2); + VERIFY_IS_EQUAL(mat3.dimension(1), 3); + VERIFY_IS_EQUAL(mat3.dimension(2), 7); + + VERIFY_IS_EQUAL(mat4.size(), 2*3*7); + VERIFY_IS_EQUAL(mat4.dimension(0), 2); + VERIFY_IS_EQUAL(mat4.dimension(1), 3); + VERIFY_IS_EQUAL(mat4.dimension(2), 7); + + val = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(mat3(i,j,k), val); + VERIFY_IS_EQUAL(mat4(i,j,k), val); + val++; + } + } + } +} + + +void test_cxx11_tensor_map() +{ + CALL_SUBTEST(test_1d()); + CALL_SUBTEST(test_2d()); + CALL_SUBTEST(test_3d()); +} From 7402fea0a8e63e3ea248257047c584afee8f8bde Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 16 May 2014 15:08:05 -0700 Subject: [PATCH 003/214] Vectorized the evaluation of tensor expression (using SSE, AVX, NEON, ...) Added the ability to parallelize the evaluation of a tensor expression over multiple cpu cores. Added the ability to offload the evaluation of a tensor expression to a GPU. 
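Illustrative usage (an editorial sketch, not part of the original message;
the thread count is made up, and EIGEN_USE_THREADS must be defined before
including the Tensor module):

    Eigen::Tensor<float, 3> A(2, 3, 7), B(2, 3, 7), C(2, 3, 7);
    C = A + B;                                    // single core, vectorized
    Eigen::ThreadPoolDevice tp(/*num_cores=*/4);
    C.device(tp) = A + B;                         // partitioned across 4 threads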
--- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 145 +++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorBase.h | 12 ++ .../Eigen/CXX11/src/Tensor/TensorDevice.h | 83 +++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 56 +++++++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 54 ++++-- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 25 ++- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 10 +- .../src/Tensor/TensorForwardDeclarations.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMap.h | 158 ++++++++++++++++-- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 19 --- unsupported/test/CMakeLists.txt | 3 + unsupported/test/cxx11_tensor_device.cpp | 126 ++++++++++++++ unsupported/test/cxx11_tensor_fixed_size.cpp | 28 ++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 37 ++++ 17 files changed, 719 insertions(+), 65 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h create mode 100644 unsupported/test/cxx11_tensor_device.cpp create mode 100644 unsupported/test/cxx11_tensor_thread_pool.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index f2b18ef31..323d9edff 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -31,6 +31,7 @@ #include "Eigen/Core" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" @@ -39,6 +40,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index f5c027d1c..d8ff3f584 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -75,9 +75,15 @@ class Tensor : public TensorBase > typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef Scalar_ Scalar; - typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::PacketReturnType PacketReturnType; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = true, + }; static const int Options = Options_; static const std::size_t NumIndices = NumIndices_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index f1df827f9..e69ff6188 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -10,6 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#ifdef EIGEN_USE_THREADS +#include +#endif namespace Eigen { @@ -28,7 +31,8 @@ namespace Eigen { */ namespace internal { -template +// Default strategy: the expressions are evaluated with a single cpu thread. 
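// (Editorial note:) the trailing bool template parameter defaults to "both
// evaluators advertise PacketAccess", so a plain assignment conceptually
// resolves to
//   TensorAssign<Dst, Src, /*Vectorizable=*/...>::run(dst, src);
// selecting either the scalar loop here or the packet specialization below.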
+template::PacketAccess & TensorEvaluator::PacketAccess> struct TensorAssign { typedef typename Derived1::Index Index; @@ -38,13 +42,150 @@ struct TensorAssign TensorEvaluator evalDst(dst); TensorEvaluator evalSrc(src); const Index size = dst.size(); - for(Index i = 0; i < size; ++i) { + for (Index i = 0; i < size; ++i) { evalDst.coeffRef(i) = evalSrc.coeff(i); } } }; +template +struct TensorAssign +{ + typedef typename Derived1::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(Derived1& dst, const Derived2& src) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + static const int VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evalDst.template writePacket(i, evalSrc.template packet(i)); + } + for (Index i = VectorizedSize; i < size; ++i) { + evalDst.coeffRef(i) = evalSrc.coeff(i); + } + } +}; + + + +// Multicore strategy: the index space is partitioned and each core is assigned to a partition +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + dst.coeffRef(i) = src.coeff(i); + } + } +}; + +template +struct EvalRange { + static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits::size; + if (last - first > PacketSize) { + static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = RhsEval::IsAligned ? Aligned : Unaligned; + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + dst.template writePacket(i, src.template packet(i)); + } + } + + for (; i < last; ++i) { + dst.coeffRef(i) = src.coeff(i); + } + } +}; + +template +struct TensorAssignMultiThreaded +{ + typedef typename Derived1::Index Index; + static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + + static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; + static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + + int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); + } + + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } + + if (numblocks * blocksize < size) { + EvalRange, TensorEvaluator, Index>::run(evalDst, evalSrc, numblocks * blocksize, size); + } + } +}; +#endif + + +// GPU: the evaluation of the expressions is offloaded to a GPU. 
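// (Editorial sketch of the launch arithmetic used below:) for size = 1050,
//   block_size = min(1050, 32*32) = 1024, num_blocks = 1, remaining_items = 26,
// so EigenMetaKernelNoCheck covers the first 1024 coefficients and a single
// bounds-checked EigenMetaKernelPeel block of 32 threads handles the tail.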
+#ifdef EIGEN_USE_GPU +template +__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + evalDst.coeffRef(index) = evalSrc.coeff(index); +} +template +__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { + const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + evalDst.coeffRef(index) = evalSrc.coeff(index); + } +} + +template +struct TensorAssignGpu +{ + typedef typename Derived1::Index Index; + static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) + { + TensorEvaluator evalDst(dst); + TensorEvaluator evalSrc(src); + const Index size = dst.size(); + const int block_size = std::min(size, 32*32); + const int num_blocks = size / block_size; + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + + const int remaining_items = size % block_size; + if (remaining_items > 0) { + const int peel_start_offset = num_blocks * block_size; + const int peel_block_size = std::min(size, 32); + const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + } + } +}; +#endif + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 9c7783aaf..fa1bd3498 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -28,6 +28,7 @@ class TensorBase typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::Index Index; typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; Derived& setZero() { return setConstant(Scalar(0)); @@ -83,6 +84,17 @@ class TensorBase return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template + TensorDevice device(const DeviceType& device) { + return TensorDevice(device, derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 000000000..71890e187 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,83 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: thread pools. + * Todo: operator +=, -=, *= and so on. 
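 *
 * A more concrete sketch (editorial; assumes EIGEN_USE_GPU is defined and
 * A, B, C are compatible tensor expressions):
 * \code
 * Eigen::GpuDevice gpu;
 * C.device(gpu) = A + B;  // kernels are enqueued on gpu.stream()
 * \endcode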
+ */ + +template class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssign::run(m_expression, other); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + + +#ifdef EIGEN_USE_THREADS +template class TensorDevice { + public: + TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +#ifdef EIGEN_USE_GPU +template class TensorDevice +{ + public: + TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + internal::TensorAssignGpu::run(m_expression, other, m_device); + return *this; + } + + protected: + const GpuDevice& m_device; + ExpressionType& m_expression; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h new file mode 100644 index 000000000..ded6ca604 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { +}; + + +// Multiple cpu cores +// We should really use a thread pool here but first we need to find a portable thread pool library. +#ifdef EIGEN_USE_THREADS +struct ThreadPoolDevice { + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + size_t numThreads() const { return num_threads_; } + /*ThreadPool* threadPool() const { return pool_; }*/ + + private: + // todo: NUMA, ... 
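  // (Editorial note on how numThreads() is consumed:) TensorAssignMultiThreaded
  // partitions the flat index space into packet-aligned blocks; e.g. with
  // size = 1000, 4 threads and PacketSize = 4: blocksz = 253, blocksize = 252,
  // numblocks = 3, and the remaining 244 coefficients run on the calling thread.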
+ size_t num_threads_; + /*ThreadPool* pool_;*/ +}; +#endif + + +// GPU offloading +#ifdef EIGEN_USE_GPU +struct GpuDevice { + // todo: support for multiple gpu; + GpuDevice() { + cudaStreamCreate(&stream_); + } + ~GpuDevice() { + cudaStreamDestroy(stream_); + } + const cudaStream_t& stream() const { return stream_; } + + private: + cudaStream_t stream_; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index bd3bd5aca..43e9d6550 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -79,16 +79,16 @@ struct Sizes : internal::numeric_list { Sizes() { } template - explicit Sizes(const array&/* indices*/) { + explicit Sizes(const array& /*indices*/) { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - explicit Sizes(std::initializer_list/* l*/) { + explicit Sizes(std::initializer_list /*l*/) { // todo: add assertion } #endif - template Sizes& operator = (const T&/* other*/) { + template Sizes& operator = (const T& /*other*/) { // add assertion failure if the size of other is different return *this; } @@ -119,7 +119,7 @@ template ::value; - static const size_t TotalSize() { + static size_t TotalSize() { return internal::arg_prod::value; } @@ -181,14 +181,11 @@ template struct DSizes : array { typedef array Base; - size_t TotalSize() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { return internal::array_prod(*static_cast(this)); } DSizes() { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES - // explicit DSizes(std::initializer_list l) : Base(l) { } -#endif explicit DSizes(const array& a) : Base(a) { } DSizes& operator = (const array& other) { @@ -203,7 +200,6 @@ struct DSizes : array { size_t IndexOfRowMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b0dbca041..3ce924dc3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -29,32 +29,38 @@ struct TensorEvaluator { typedef typename Derived::Index Index; typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar& CoeffReturnType; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + }; TensorEvaluator(Derived& m) : m_data(const_cast(m.data())) { } - CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_data[index]; } - Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { return m_data[index]; } - // to do: vectorized evaluation. 
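// (Editorial note on the packet interface enabled below:) one packet op moves
// unpacket_traits<Packet>::size coefficients at a time (e.g. 4 floats per
// packet()/writePacket() call with 128-bit SSE registers), which is why
// TensorAssign rounds its main loop down to a multiple of PacketSize.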
- /* template + template PacketReturnType packet(Index index) const { - return ploadt(m_data + index); + return internal::ploadt(m_data + index); } - template - void writePacket(Index index, const PacketScalar& x) + template + void writePacket(Index index, const Packet& x) { - return pstoret(const_cast(m_data) + index, x); - }*/ + return internal::pstoret(m_data + index, x); + } protected: Scalar* m_data; @@ -70,6 +76,11 @@ struct TensorEvaluator > { typedef TensorCwiseUnaryOp XprType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + }; + TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) @@ -77,12 +88,19 @@ struct TensorEvaluator > typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; - CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + private: const UnaryOp m_functor; TensorEvaluator m_argImpl; @@ -96,6 +114,12 @@ struct TensorEvaluator XprType; + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + }; + TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_leftImpl(op.lhsExpression()), @@ -104,11 +128,17 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); + } private: const BinaryOp m_functor; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index aa875dc31..e32077f6e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -33,6 +33,9 @@ struct traits > typedef typename result_of< UnaryOp(typename XprType::Scalar) >::type Scalar; + typedef typename result_of< + UnaryOp(typename XprType::Packet) + >::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -57,14 +60,16 @@ template class TensorCwiseUnaryOp : public TensorBase > { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; - inline TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const 
UnaryOp& func = UnaryOp()) : m_xpr(xpr), m_functor(func) {} EIGEN_DEVICE_FUNC @@ -92,6 +97,7 @@ struct traits > typename RhsXprType::Scalar ) >::type Scalar; + typedef typename internal::packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -123,14 +129,17 @@ class TensorCwiseBinaryOp : public TensorBase::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - inline TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 953880123..dcc7ccd65 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -33,11 +33,17 @@ class TensorFixedSize : public TensorBase::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef Scalar_ Scalar; - typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; - static const int Options = Options_; + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_ALIGN), + PacketAccess = true, + }; + typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index e8a2125c4..09b0fe66d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -14,12 +14,14 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; -template class TensorMap; +template class TensorMap; template class TensorBase; template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; +template class TensorDevice; + // Move to internal? 
template struct TensorEvaluator; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index bb0b39c5a..3fc9c5335 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -22,16 +22,16 @@ template class Strid * */ -template class TensorMap : public TensorBase > +template class TensorMap : public TensorBase > { public: - typedef TensorMap Self; + typedef TensorMap Self; typedef typename PlainObjectType::Base Base; typedef typename Eigen::internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; - typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::packet_traits::type Packet; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; @@ -43,13 +43,12 @@ template class TensorMap : public TensorBase({{firstDimension}})) { @@ -65,7 +64,7 @@ template class TensorMap : public TensorBase& dimensions) + inline TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -80,12 +79,97 @@ template class TensorMap : public TensorBase& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + return m_data[index]; + } + } +#else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC @@ -100,8 +184,60 @@ template class TensorMap : public TensorBase= 0 && index < size()); + return m_data[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[0]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } #endif + template EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index efcb39559..64098343e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -72,9 +72,6 @@ class TensorStorage TensorStorage() { } TensorStorage(const TensorStorage& other) : Base_(other) { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES - // TensorStorage(TensorStorage&&) = default; -#endif TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {} TensorStorage(DenseIndex size, const array& dimensions) : Base_(size, dimensions) {} @@ -111,22 +108,6 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } diff --git a/unsupported/test/CMakeLists.txt 
b/unsupported/test/CMakeLists.txt
index 31583d3ca..abc3375e5 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -104,4 +104,7 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_assign "-std=c++0x")
   ei_add_test(cxx11_tensor_expr "-std=c++0x")
   ei_add_test(cxx11_tensor_map "-std=c++0x")
+  ei_add_test(cxx11_tensor_device "-std=c++0x")
+#  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
+  ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
 endif()
diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp
new file mode 100644
index 000000000..9eb1d0420
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_device.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on cpu
+struct CPUContext {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out) { }
+
+  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+  Eigen::TensorDevice<Eigen::Tensor<float, 3>, Eigen::DefaultDevice> out() { return TensorDevice<Eigen::Tensor<float, 3>, Eigen::DefaultDevice>(cpu_device_, out_); }
+
+ private:
+  const Eigen::Tensor<float, 3>& in1_;
+  const Eigen::Tensor<float, 3>& in2_;
+  Eigen::Tensor<float, 3>& out_;
+
+  Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on GPU
+struct GPUContext {
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out) { }
+
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+  Eigen::TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice> out() { return TensorDevice<Eigen::TensorMap<Eigen::Tensor<float, 3> >, Eigen::GpuDevice>(gpu_device_, out_); }
+
+ private:
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+  Eigen::GpuDevice gpu_device_;
+};
+
+
+// The actual expression to evaluate
+template <typename Context>
+static void test_contextual_eval(Context* context)
+{
+  context->out() = context->in1() + context->in2() * 3.14f;
+}
+
+static void test_cpu() {
+  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(2,3,7));
+
+  in1.setRandom();
+  in2.setRandom();
+  CPUContext context(in1, in2, out);
+  test_contextual_eval(&context);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
+
+static void test_gpu() {
+  Eigen::Tensor<float, 3> in1(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<int, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<int, 3>(2,3,7));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(2,3,7));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(2,3,7));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(2,3,7));
+
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+
+  cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
+
+
+
+void test_cxx11_tensor_device()
+{
+  CALL_SUBTEST(test_cpu());
+  CALL_SUBTEST(test_gpu());
+}
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index c1d74d881..214f6951d 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -159,9 +159,37 @@ static void test_3d()
 }
 
 
+static void test_array()
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  float val = 0.0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(array<ptrdiff_t, 3>(i,j,k)) = val;
+        val += 1.0;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.cwisePow(3.5f);
+
+  val = 0.0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(array<ptrdiff_t, 3>(i,j,k)), powf(val, 3.5f));
+        val += 1.0;
+      }
+    }
+  }
+}
+
 void test_cxx11_tensor_fixed_size()
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
   CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_array());
 }
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 000000000..c9de71da3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -0,0 +1,37 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cxx11_tensor_thread_pool()
+{
+  Eigen::Tensor<float, 3> in1(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> in2(Eigen::array<ptrdiff_t, 3>(2,3,7));
+  Eigen::Tensor<float, 3> out(Eigen::array<ptrdiff_t, 3>(2,3,7));
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPoolDevice thread_pool_device(3);
+  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<ptrdiff_t, 3>(i,j,k)), in1(Eigen::array<ptrdiff_t, 3>(i,j,k)) + in2(Eigen::array<ptrdiff_t, 3>(i,j,k)) * 3.14f);
+      }
+    }
+  }
+}
From 736267cf6b17832a571acf7e34ca07c7f55907ee Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 22 May 2014 16:22:35 -0700
Subject: [PATCH 004/214] Added support for additional tensor operations:

* comparison (<, <=, ==, !=, ...)
* selection
* nullary ops such as random or constant generation
* misc unary ops such as log(), exp(), or a user defined unaryExpr()

Cleaned up the code a little.
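For reference, the operators this patch introduces compose as ordinary expression templates. Below is a minimal usage sketch, not taken from the patch itself: the shapes, the `Doubler` functor, and the function name are illustrative, and it assumes the `Eigen::array` value constructors used by the tests above.

    #include <Eigen/CXX11/Tensor>

    // Illustrative functor for unaryExpr(); any copyable type with a
    // Scalar operator()(Scalar) const works, falling back on the generic
    // internal::functor_traits default (i.e. no packet access).
    struct Doubler {
      float operator()(float x) const { return 2.0f * x; }
    };

    static void tensor_ops_sketch()
    {
      Eigen::Tensor<float, 3> a(Eigen::array<Eigen::DenseIndex, 3>(2,3,7));
      Eigen::Tensor<float, 3> b(Eigen::array<Eigen::DenseIndex, 3>(2,3,7));
      Eigen::Tensor<float, 3> result(Eigen::array<Eigen::DenseIndex, 3>(2,3,7));

      a.setRandom();        // nullary op: random generation
      b.setConstant(0.5f);  // nullary op: constant generation

      // A comparison yields a coefficient-wise boolean expression;
      // select() then picks from the then/else expressions per coefficient.
      result = (a > b).select(a.log(), b.exp());

      // User-defined coefficient-wise operation.
      result = a.unaryExpr(Doubler());
    }

Note how the scalar overloads of cwiseMax/cwiseMin in the TensorBase diff below reuse constant() instead of a dedicated functor, so the single nullary evaluator covers all generated operands.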
--- .../Eigen/CXX11/src/Tensor/TensorBase.h | 139 ++++++++++++++++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 84 +++++++++++ .../Eigen/CXX11/src/Tensor/TensorExpr.h | 109 ++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMap.h | 36 +++-- 5 files changed, 339 insertions(+), 31 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fa1bd3498..8a88ba806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -33,21 +33,25 @@ class TensorBase Derived& setZero() { return setConstant(Scalar(0)); } - Derived& setConstant(const Scalar& val) { - Scalar* data = derived().data(); - for (int i = 0; i < derived().size(); ++i) { - data[i] = val; - } - return derived(); + return derived() = constant(val); + } + Derived& setRandom() { + return derived() = random(); } - Derived& setRandom() { - Scalar* data = derived().data(); - for (int i = 0; i < derived().size(); ++i) { - data[i] = internal::random_default_impl::run(); - } - return derived(); + // Nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return TensorCwiseNullaryOp, const Derived> + (internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return TensorCwiseNullaryOp, const Derived>(); } // Coefficient-wise unary operators @@ -57,15 +61,31 @@ class TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwiseSqrt() const { return derived(); } + sqrt() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { return derived(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { return derived(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwiseAbs() const { return derived(); } + abs() const { return derived(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cwisePow(Scalar exponent) const { + pow(Scalar exponent) const { return TensorCwiseUnaryOp, const Derived> (derived(), internal::scalar_pow_op(exponent)); } @@ -77,6 +97,30 @@ class TensorBase (derived(), internal::scalar_multiple_op(scale)); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cast() const { + return derived(); + } + // Coefficient-wise binary operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -90,6 +134,71 @@ class TensorBase return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + // Coefficient-wise ternary operators. + template + inline const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const{ + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Select the device on which to evaluate the expression. 
+  template <typename DeviceType> TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
+    return TensorDevice<Derived, DeviceType>(device, derived());
+  }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 3ce924dc3..e0c0863b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -68,6 +68,42 @@ struct TensorEvaluator
 };
 
 
+// -------------------- CwiseNullaryOp --------------------
+
+template<typename NullaryOp, typename PlainObjectType>
+struct TensorEvaluator<TensorCwiseNullaryOp<NullaryOp, PlainObjectType> >
+{
+  typedef TensorCwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+  };
+
+  TensorEvaluator(const XprType& op)
+      : m_functor(op.functor())
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(index);
+  }
+
+ private:
+  const NullaryOp m_functor;
+};
+
+
 // -------------------- CwiseUnaryOp --------------------
 
@@ -146,6 +182,54 @@ struct TensorEvaluator<TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> >
   TensorEvaluator<RightArgType> m_rightImpl;
 };
 
+
+// -------------------- SelectOp --------------------
+
+template<typename IfArgType, typename ThenArgType, typename ElseArgType>
+struct TensorEvaluator<TensorSelectOp<IfArgType, ThenArgType, ElseArgType> >
+{
+  typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<ThenArgType>::IsAligned & TensorEvaluator<ElseArgType>::IsAligned,
+    PacketAccess = TensorEvaluator<ThenArgType>::PacketAccess & TensorEvaluator<ElseArgType>::PacketAccess/* &
+                   TensorEvaluator<IfArgType>::PacketAccess*/,
+  };
+
+  TensorEvaluator(const XprType& op)
+    : m_condImpl(op.ifExpression()),
+      m_thenImpl(op.thenExpression()),
+      m_elseImpl(op.elseExpression())
+  { }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
+  }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    internal::Selector<PacketSize> select;
+    for (Index i = 0; i < PacketSize; ++i) {
+      select.select[i] = m_condImpl.coeff(index+i);
+    }
+    return internal::pblend(select,
+                            m_thenImpl.template packet<LoadMode>(index),
+                            m_elseImpl.template packet<LoadMode>(index));
+  }
+
+ private:
+  TensorEvaluator<IfArgType> m_condImpl;
+  TensorEvaluator<ThenArgType> m_thenImpl;
+  TensorEvaluator<ElseArgType> m_elseImpl;
+};
+
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index e32077f6e..94cfae05c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -17,6 +17,9 @@ namespace Eigen {
  *
  * \brief Tensor expression classes.
  *
+ * The TensorCwiseNullaryOp class applies a nullary operator to an expression.
+ * This is typically used to generate constants.
+ *
  * The TensorCwiseUnaryOp class represents an expression where a unary operator
  * (e.g. cwiseSqrt) is applied to an expression.
  *
  * The TensorCwiseBinaryOp class represents an expression where a binary operator
  * (e.g. addition) is applied to a lhs and a rhs expression.
* */ +namespace internal { +template +struct traits > + : traits +{ + typedef typename PlainObjectType::Packet Packet; + typedef typename PlainObjectType::Scalar Scalar; + typedef typename PlainObjectType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; +}; + +} // end namespace internal + + + +template +class TensorCwiseNullaryOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; + typedef typename PlainObjectType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const NullaryOp& func = NullaryOp()) + : m_functor(func) {} + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + // todo: add tensor dimension to be able to do some sanity checks + const NullaryOp m_functor; +}; + + namespace internal { template @@ -160,6 +203,72 @@ class TensorCwiseBinaryOp : public TensorBase +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSelectOp type; +}; + +} // end namespace internal + + +template +class TensorSelectOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + const IfXprType& ifExpression() const { return m_condition; } + + const ThenXprType& thenExpression() const { return m_then; } + + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 09b0fe66d..03ac8d516 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -17,8 +17,10 @@ template class TensorFi template class TensorMap; template class TensorBase; +template class 
TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; +template class TensorSelectOp; template class TensorDevice; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3fc9c5335..3a2ff5b30 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -45,33 +45,37 @@ template class TensorMap : public Tensor static const int Options = Options_; + static const std::size_t NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), PacketAccess = true, }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array({{firstDimension}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == PlainObjectType::NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + inline TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename PlainObjectType::Dimensions& dimensions() const { return m_dimensions; } + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC @@ -80,7 +84,7 @@ template class TensorMap : public Tensor EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -96,12 +100,12 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) const { - static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } @@ -159,7 +163,7 @@ template class TensorMap : public Tensor #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -175,12 +179,12 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } @@ -247,8 +251,8 @@ template class TensorMap : public Tensor } private: - typename PlainObjectType::Scalar* m_data; - typename PlainObjectType::Dimensions m_dimensions; + Scalar* m_data; + Dimensions m_dimensions; }; } // end namespace Eigen From 6fa6cdd2b988da98cbdd2b1a5fd2fd3b9d56a4b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 4 Jun 2014 09:21:48 -0700 Subject: [PATCH 005/214] Added support for tensor contractions Updated expression evaluation mechanism to also compute the size of the tensor result Misc fixes and improvements. 
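The contract() API added here pairs explicit dimensions of the two operands rather than inferring them from shapes. A minimal sketch of the intended usage, assuming the `DimensionPair` typedef (a `std::pair` of indices) and the `contract()` method added to TensorBase below; the shapes and names are illustrative.

    #include <Eigen/CXX11/Tensor>

    // Illustrative only: contracting a 2x3 tensor with a 3x4 tensor over the
    // shared dimension of size 3 computes the equivalent of a matrix
    // product, leaving a 2x4 result.
    static void contraction_sketch()
    {
      typedef Eigen::Tensor<float, 2> Mat;
      Mat a(Eigen::array<Eigen::DenseIndex, 2>(2, 3));
      Mat b(Eigen::array<Eigen::DenseIndex, 2>(3, 4));
      a.setRandom();
      b.setRandom();

      // Pair dimension 1 of the lhs with dimension 0 of the rhs.
      Eigen::array<Mat::DimensionPair, 1> dims;
      dims[0] = Mat::DimensionPair(1, 0);

      Mat result(Eigen::array<Eigen::DenseIndex, 2>(2, 4));
      result = a.contract(b, dims);
    }

The evaluator stitches the paired dimensions recursively (see partialStitch below), accumulating scalar products coefficient by coefficient, so no intermediate matrix is materialized.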
--- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Core/util/EmulateCXX11Meta.h | 2 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 38 ++- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 50 ++-- .../CXX11/src/Tensor/TensorContraction.h | 229 ++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 29 ++- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 44 +++- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 36 +-- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- .../src/Tensor/TensorForwardDeclarations.h | 2 + .../Eigen/CXX11/src/Tensor/TensorStorage.h | 11 +- 14 files changed, 370 insertions(+), 96 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 323d9edff..d4e8d3a15 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -39,6 +39,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index ab869177c..636063f9e 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -23,6 +23,8 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + static const std::size_t size = n; + T values[n]; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index d8ff3f584..e034f8c03 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -81,7 +81,7 @@ class Tensor : public TensorBase > typedef typename Base::PacketReturnType PacketReturnType; enum { - IsAligned = bool(EIGEN_ALIGN), + IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), PacketAccess = true, }; @@ -94,11 +94,11 @@ class Tensor : public TensorBase > TensorStorage m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -116,13 +116,13 @@ class Tensor : public TensorBase > } #endif - inline const Scalar& coeff(const array& 
indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline const Scalar& coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; @@ -138,13 +138,13 @@ class Tensor : public TensorBase > } #endif - inline Scalar& coeffRef(const array& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) { eigen_internal_assert(checkIndexRange(indices)); return m_storage.data()[linearizedIndex(indices)]; } - inline Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_storage.data()[index]; @@ -160,19 +160,19 @@ class Tensor : public TensorBase > } #endif - inline const Scalar& operator()(const array& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { eigen_assert(checkIndexRange(indices)); return coeff(indices); } - inline const Scalar& operator()(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return coeff(index); } - inline const Scalar& operator[](Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const { // The bracket operator is only for vectors, use the parenthesis operator instead. EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -189,19 +189,19 @@ class Tensor : public TensorBase > } #endif - inline Scalar& operator()(const array& indices) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { eigen_assert(checkIndexRange(indices)); return coeffRef(indices); } - inline Scalar& operator()(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_assert(index >= 0 && index < size()); return coeffRef(index); } - inline Scalar& operator[](Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) { // The bracket operator is only for vectors, use the parenthesis operator instead EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -223,11 +223,10 @@ class Tensor : public TensorBase > #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template inline Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage() + : m_storage(internal::array_prod(array{{firstDimension, otherDimensions...}}), array{{firstDimension, otherDimensions...}}) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - resize(array{{firstDimension, otherDimensions...}}); } #endif @@ -237,7 +236,6 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) @@ -306,7 +304,7 @@ class Tensor : public TensorBase > array_zip_and_reduce(indices, m_storage.dimensions()); } - inline Index linearizedIndex(const array& indices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const { if (Options&RowMajor) { return m_storage.dimensions().IndexOfRowMajor(indices); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index e69ff6188..da1eb62cb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -53,7 +53,6 @@ template struct TensorAssign { typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC static inline void run(Derived1& dst, const Derived2& src) { TensorEvaluator evalDst(dst); @@ -63,7 +62,7 @@ struct TensorAssign static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; static const int PacketSize = unpacket_traits::PacketReturnType>::size; - static const int VectorizedSize = (size / PacketSize) * PacketSize; + const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evalDst.template writePacket(i, evalSrc.template packet(i)); @@ -148,7 +147,7 @@ struct TensorAssignMultiThreaded // GPU: the evaluation of the expressions is offloaded to a GPU. -#ifdef EIGEN_USE_GPU +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template __global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { const int index = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 8a88ba806..c5c711313 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,13 +30,16 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - Derived& setZero() { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { return setConstant(Scalar(0)); } - Derived& setConstant(const Scalar& val) { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { return derived() = constant(val); } - Derived& setRandom() { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { return derived() = random(); } @@ -45,13 +48,13 @@ class TensorBase EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> constant(const Scalar& value) const { return TensorCwiseNullaryOp, const Derived> - (internal::scalar_constant_op(value)); + (derived(), internal::scalar_constant_op(value)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(); + return TensorCwiseNullaryOp, const Derived>(derived()); } // Coefficient-wise unary operators @@ -124,77 +127,86 @@ class TensorBase // Coefficient-wise binary operators. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator+(const OtherDerived& other) const { + operator+(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator-(const OtherDerived& other) const { + operator-(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator*(const OtherDerived& other) const { + operator*(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator/(const OtherDerived& other) const { + operator/(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMax(const OtherDerived& other) const { + cwiseMax(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMin(const OtherDerived& other) const { + cwiseMin(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } // Comparisons and tests. 
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { + operator<(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { + operator<=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { + operator>(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { + operator>=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { + operator==(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { + operator!=(const OtherDerived& other) const { return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + // Contractions. + typedef std::pair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + // Coefficient-wise ternary operators. - template + template inline const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const{ + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..d424df36e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +} // end namespace internal + + + +template +class TensorContractionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorContractionOp XprType; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + { + Index index = 0; + Index stride = 1; + m_shiftright = 1; + + int skipped = 0; + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].first == i) { + skip = true; + m_leftOffsets[2*skipped] = stride; + m_leftOffsets[2*skipped+1] = stride * left_dims[i]; + m_stitchsize[skipped] = left_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = left_dims[i]; + m_shiftright *= left_dims[i]; + } else { + ++skipped; + } + stride *= left_dims[i]; + } + + stride = 1; + skipped = 0; + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) 
{ + if (op.indices()[j].second == i) { + skip = true; + m_rightOffsets[2*skipped] = stride; + m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = right_dims[i]; + } else { + ++skipped; + } + stride *= right_dims[i]; + } + + // Scalar case + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + m_dimensions[0] = 1; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + const Index startLeft = index % m_shiftright; + const Index startRight = index / m_shiftright; + CoeffReturnType result = CoeffReturnType(0); + partialStitch(startLeft, startRight, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + private: + EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { + Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); + Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); + + for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { + const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; + const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; + if (StitchIndex < Indices::size-1) { + partialStitch(left, right, StitchIndex+1, accum); + } else { + accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); + } + } + } + + private: + array m_leftOffsets; + array m_rightOffsets; + array m_stitchsize; + Index m_shiftright; + Dimensions m_dimensions; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 71890e187..dbe60a165 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -59,7 +59,7 @@ template class TensorDevice class TensorDevice { public: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index ded6ca604..d7f5ab7c9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -37,17 +37,14 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU struct GpuDevice { - // todo: support for multiple gpu; - GpuDevice() { - cudaStreamCreate(&stream_); - } - ~GpuDevice() { - cudaStreamDestroy(stream_); - } - const cudaStream_t& stream() const { return stream_; } + // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. + GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } + + const cudaStream_t& stream() const { return *stream_; } private: - cudaStream_t stream_; + // TODO: multigpu. 
+ const cudaStream_t* stream_; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 43e9d6550..c92b8c679 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -35,14 +35,14 @@ namespace Eigen { namespace internal { template struct dget { - static const std::size_t value = internal::get::value; - }; + static const std::size_t value = get::value; +}; template struct fixed_size_tensor_index_linearization_helper { - template + template EIGEN_DEVICE_FUNC static inline Index run(array const& indices, const Dimensions& dimensions) { @@ -55,7 +55,7 @@ struct fixed_size_tensor_index_linearization_helper template struct fixed_size_tensor_index_linearization_helper { - template + template EIGEN_DEVICE_FUNC static inline Index run(array const& indices, const Dimensions&) { @@ -93,11 +93,11 @@ struct Sizes : internal::numeric_list { return *this; } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } @@ -139,11 +139,11 @@ template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); } @@ -180,13 +180,18 @@ struct tensor_index_linearization_helper template struct DSizes : array { typedef array Base; + static const std::size_t count = NumDims; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { return internal::array_prod(*static_cast(this)); } - DSizes() { } - explicit DSizes(const array& a) : Base(a) { } + EIGEN_DEVICE_FUNC DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } DSizes& operator = (const array& other) { *static_cast(this) = other; @@ -194,10 +199,10 @@ struct DSizes : array { } // A constexpr would be so much better here - size_t IndexOfColMajor(const array& indices) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } - size_t IndexOfRowMajor(const array& indices) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e0c0863b7..ab2513cea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -21,7 +21,6 @@ namespace Eigen { * * TODO: add support for more types of expressions, in particular expressions * leading to lvalues (slicing, reshaping, etc...) 
- * TODO: add support for vectorization */ template @@ -32,16 +31,19 @@ struct TensorEvaluator typedef typename Derived::Packet Packet; typedef typename Derived::Scalar CoeffReturnType; typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, }; - TensorEvaluator(Derived& m) - : m_data(const_cast(m.data())) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_data[index]; } @@ -64,29 +66,34 @@ struct TensorEvaluator protected: Scalar* m_data; + Dimensions m_dims; }; // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator > { - typedef TensorCwiseNullaryOp XprType; + typedef TensorCwiseNullaryOp XprType; enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, }; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_functor(op.functor()) + : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -101,6 +108,7 @@ struct TensorEvaluator > private: const NullaryOp m_functor; + TensorEvaluator m_argImpl; }; @@ -117,7 +125,7 @@ struct TensorEvaluator > PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) { } @@ -125,6 +133,9 @@ struct TensorEvaluator > typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -156,7 +167,7 @@ struct TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_functor(op.functor()), m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) @@ -165,6 +176,13 @@ struct TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. 
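+ // Both operands of a coefficient-wise binary op are assumed to have
+ // identical dimensions, so reporting the left dimensions is sufficient:
+ // e.g. for Tensor<float, 3> a(2,3,7) and b(2,3,7), (a + b) reports (2,3,7).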
+ return m_leftImpl.dimensions(); + } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -196,7 +214,7 @@ struct TensorEvaluator TensorEvaluator::PacketAccess*/, }; - TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) : m_condImpl(op.ifExpression()), m_thenImpl(op.thenExpression()), m_elseImpl(op.elseExpression()) @@ -205,7 +223,13 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 94cfae05c..60908ee94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -28,13 +28,13 @@ namespace Eigen { * */ namespace internal { -template -struct traits > - : traits +template +struct traits > + : traits { - typedef typename PlainObjectType::Packet Packet; - typedef typename PlainObjectType::Scalar Scalar; - typedef typename PlainObjectType::Nested XprTypeNested; + typedef typename XprType::Packet Packet; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -42,27 +42,31 @@ struct traits > -template -class TensorCwiseNullaryOp : public TensorBase > +template +class TensorCwiseNullaryOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; - typedef typename PlainObjectType::PacketReturnType PacketReturnType; - typedef TensorCwiseNullaryOp Nested; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef TensorCwiseNullaryOp Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const NullaryOp& func = NullaryOp()) - : m_functor(func) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; } protected: - // todo: add tensor dimension to be able to do some sanity checks + typename XprType::Nested m_xpr; const NullaryOp m_functor; }; @@ -71,7 +75,7 @@ class TensorCwiseNullaryOp : public TensorBase struct traits > - : traits + : traits { typedef typename result_of< UnaryOp(typename XprType::Scalar) @@ -207,7 +211,7 @@ class TensorCwiseBinaryOp : public TensorBase struct traits > - : traits + : traits { typedef typename traits::Scalar Scalar; typedef typename internal::packet_traits::type Packet; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index dcc7ccd65..789c04238 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -52,7 +52,7 @@ class TensorFixedSize : public TensorBase dimensions() const { return m_storage.dimensions(); } + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 03ac8d516..239b5cb67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,6 +21,8 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; +template class TensorContractionOp; template class TensorDevice; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 64098343e..c9d6517eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -53,7 +53,7 @@ class TensorStorage EIGEN_STRONG_INLINE const T *data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return m_dimensions; } + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); } @@ -111,7 +111,8 @@ class TensorStorage(m_data, internal::array_prod(m_dimensions)); } void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); } - const DSizes& dimensions() const {return m_dimensions;} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const {return m_dimensions;} void conservativeResize(DenseIndex size, const array& nbDimensions) { @@ -132,10 +133,10 @@ class TensorStorage Date: Thu, 5 Jun 2014 10:49:34 -0700 Subject: [PATCH 006/214] Created additional tests for the tensor code. 
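The contraction tests below exercise two paths: immediate evaluation, by
constructing a TensorEvaluator and calling evalTo() on a raw buffer, and
lazy evaluation, by assigning the contraction expression to a tensor. Note
that evalTo() accumulates into its destination (buffer[i] += coeff(i)),
which is why the tests zero-initialize the result tensors first. A minimal
sketch of the assignment path, assuming the contract() API introduced in
the previous patches:

    // r(i,j) = sum_k a(k,i) * b(k,j): contract dimension 0 of a
    // with dimension 0 of b.
    typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
    Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
    a.setRandom(); b.setRandom();
    Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}});
    Eigen::Tensor<float, 2> r(3, 3);
    r = a.contract(b, dims);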
--- unsupported/test/CMakeLists.txt | 2 + unsupported/test/cxx11_tensor_comparisons.cpp | 84 +++++++++ unsupported/test/cxx11_tensor_contraction.cpp | 163 ++++++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 17 +- unsupported/test/cxx11_tensor_expr.cpp | 149 ++++++++++++++-- unsupported/test/cxx11_tensor_fixed_size.cpp | 14 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 7 +- 7 files changed, 406 insertions(+), 30 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_comparisons.cpp create mode 100644 unsupported/test/cxx11_tensor_contraction.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index abc3375e5..d6072c9f3 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,6 +102,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_comparison "-std=c++0x") + ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_device "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp new file mode 100644 index 000000000..186f56ac3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -0,0 +1,84 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_orderings() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor lt(2,3,7); + Tensor le(2,3,7); + Tensor gt(2,3,7); + Tensor ge(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + + lt = mat1 < mat2; + le = mat1 <= mat2; + gt = mat1 > mat2; + ge = mat1 >= mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k)); + VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k)); + VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k)); + VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k)); + } + } + } +} + + +static void test_equality() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + if (random() < 0.5) { + mat2(i,j,k) = mat1(i,j,k); + } + } + } + } + + Tensor eq(2,3,7); + Tensor ne(2,3,7); + eq = (mat1 == mat2); + ne = (mat1 != mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k)); + VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_comparisons() +{ + CALL_SUBTEST(test_orderings()); + CALL_SUBTEST(test_equality()); +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp new file mode 100644 index 000000000..1c89dfdd1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -0,0 +1,163 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +typedef Tensor::DimensionPair DimPair; + + +static void test_evals() +{ + Tensor mat1(2, 3); + Tensor mat2(2, 3); + Tensor mat3(3, 2); + + mat1.setRandom(); + mat2.setRandom(); + mat3.setRandom(); + + Tensor mat4(3,3); + mat4.setZero(); + Eigen::array dims3({{DimPair(0, 0)}}); + TensorEvaluator eval(mat1.contract(mat2, dims3)); + eval.evalTo(mat4.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 3); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2)); + VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0)); + VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1)); + VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2)); + + Tensor mat5(2,2); + mat5.setZero(); + Eigen::array dims4({{DimPair(1, 1)}}); + TensorEvaluator eval2(mat1.contract(mat2, dims4)); + eval2.evalTo(mat5.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2)); + VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2)); + VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2)); + + Tensor mat6(2,2); + mat6.setZero(); + Eigen::array dims6({{DimPair(1, 0)}}); + TensorEvaluator eval3(mat1.contract(mat3, dims6)); + eval3.evalTo(mat6.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); + + VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1)); + VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0)); + VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1)); +} + + +static void test_scalar() +{ + Tensor vec1({6}); + Tensor vec2({6}); + + vec1.setRandom(); + vec2.setRandom(); + + Tensor scalar(1); + scalar.setZero(); + Eigen::array dims({{DimPair(0, 0)}}); + TensorEvaluator eval(vec1.contract(vec2, dims)); + eval.evalTo(scalar.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + + float expected = 0.0f; + for (int i = 0; i < 6; ++i) { + expected += vec1(i) * vec2(i); + } + VERIFY_IS_APPROX(scalar(0), expected); +} + + +static void test_multidims() +{ + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2, 
2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2, 2, 2); + mat3.setZero(); + Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); + TensorEvaluator eval(mat1.contract(mat2, dims)); + eval.evalTo(mat3.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 2); + VERIFY_IS_EQUAL(eval.dimensions()[2], 2); + + VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) + + mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) + + mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) + + mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) + + mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1)); + VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) + + mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1)); + VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) + + mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1)); + VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) + + mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1)); + VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) + + mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); +} + + +static void test_expr() +{ + Tensor mat1(2, 3); + Tensor mat2(3, 2); + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2,2); + + Eigen::array dims({{DimPair(1, 0)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1)); + VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0)); + VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); +} + + +void test_cxx11_tensor_contraction() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_scalar()); + CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_expr()); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index 9eb1d0420..365b109c7 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -15,7 +15,7 @@ #include "main.h" -#include +#include using Eigen::Tensor; using Eigen::RowMajor; @@ -39,8 +39,12 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { - GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out) { } - + GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { + cudaStreamCreate(&stream_); + } + ~GPUContext() { + cudaStreamDestroy(stream_); + } const Eigen::TensorMap >& in1() const { return in1_; } const Eigen::TensorMap >& in2() const { return in2_; } Eigen::TensorDevice >, Eigen::GpuDevice> out() { return TensorDevice >, Eigen::GpuDevice>(gpu_device_, out_); } @@ -49,6 +53,7 @@ struct GPUContext { const Eigen::TensorMap >& in1_; const Eigen::TensorMap >& in2_; Eigen::TensorMap >& out_; + cudaStream_t stream_; 
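+ // gpu_device_ is initialized with the address of stream_ before the
+ // stream itself is created in the constructor body; this is safe because
+ // GpuDevice only stores the pointer and dereferences it later.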
Eigen::GpuDevice gpu_device_; }; @@ -57,7 +62,7 @@ struct GPUContext { template static void test_contextual_eval(Context* context) { - context->out() = context->in1() + context->in2() * 3.14f; + context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } static void test_cpu() { @@ -73,7 +78,7 @@ static void test_cpu() { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } @@ -111,7 +116,7 @@ static void test_gpu() { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index e0124da8c..e85fcbfa9 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -28,10 +28,10 @@ static void test_1d() float data3[6]; TensorMap> vec3(data3, 6); - vec3 = vec1.cwiseSqrt(); + vec3 = vec1.sqrt(); float data4[6]; TensorMap> vec4(data4, 6); - vec4 = vec2.cwiseSqrt(); + vec4 = vec2.square(); VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); @@ -40,12 +40,12 @@ static void test_1d() VERIFY_IS_APPROX(vec3(4), sqrtf(23.0)); VERIFY_IS_APPROX(vec3(5), sqrtf(42.0)); - VERIFY_IS_APPROX(vec4(0), sqrtf(0.0)); - VERIFY_IS_APPROX(vec4(1), sqrtf(1.0)); - VERIFY_IS_APPROX(vec4(2), sqrtf(2.0)); - VERIFY_IS_APPROX(vec4(3), sqrtf(3.0)); - VERIFY_IS_APPROX(vec4(4), sqrtf(4.0)); - VERIFY_IS_APPROX(vec4(5), sqrtf(5.0)); + VERIFY_IS_APPROX(vec4(0), 0.0f); + VERIFY_IS_APPROX(vec4(1), 1.0f); + VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f); + VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f); + VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f); + VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f); vec3 = vec1 + vec2; VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); @@ -79,8 +79,8 @@ static void test_2d() Tensor mat3(2,3); Tensor mat4(2,3); - mat3 = mat1.cwiseAbs(); - mat4 = mat2.cwiseAbs(); + mat3 = mat1.abs(); + mat4 = mat2.abs(); VERIFY_IS_APPROX(mat3(0,0), 0.0f); VERIFY_IS_APPROX(mat3(0,1), 1.0f); @@ -102,7 +102,7 @@ static void test_3d() Tensor mat1(2,3,7); Tensor mat2(2,3,7); - float val = 0.0; + float val = 1.0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -118,28 +118,147 @@ static void test_3d() Tensor mat4(2,3,7); mat4 = mat2 * 3.14f; Tensor mat5(2,3,7); - mat5 = mat1.cwiseSqrt().cwiseSqrt(); + mat5 = mat1.inverse().log(); Tensor mat6(2,3,7); - mat6 = mat2.cwiseSqrt() * 3.14f; + mat6 = mat2.pow(0.5f) * 3.14f; + Tensor mat7(2,3,7); + mat7 = mat1.cwiseMax(mat5 * 2.0f).exp(); + Tensor mat8(2,3,7); + mat8 = (-mat2).exp() * 3.14f; - val = 0.0; + val = 1.0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { VERIFY_IS_APPROX(mat3(i,j,k), val + val); VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f); - VERIFY_IS_APPROX(mat5(i,j,k), sqrtf(sqrtf(val))); + VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val)); VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); + VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); + VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f); val += 
1.0; } } } } +static void test_constants() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.constant(3.14f); + mat3 = mat1.cwiseMax(7.3f).exp(); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), 3.14f); + VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f))); + val += 1.0; + } + } + } +} + + +static void test_functors() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + float val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + mat1(i,j,k) = val; + val += 1.0; + } + } + } + mat2 = mat1.inverse().unaryExpr(&asinf); + mat3 = mat1.unaryExpr(&tanhf); + + val = 1.0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k))); + VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k))); + val += 1.0; + } + } + } +} + +static void test_type_casting() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + mat1.setRandom(); + mat2.setRandom(); + + mat3 = mat1.template cast(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0); + } + } + } + + mat3 = mat2.template cast(); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), static_cast(mat2(i,j,k))); + } + } + } +} + +static void test_select() +{ + Tensor selector(2,3,7); + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor result(2,3,7); + + selector.setRandom(); + mat1.setRandom(); + mat2.setRandom(); + result = (selector > selector.constant(0.5f)).select(mat1, mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? 
mat1(i,j,k) : mat2(i,j,k)); + } + } + } +} + void test_cxx11_tensor_expr() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_constants()); + CALL_SUBTEST(test_functors()); + CALL_SUBTEST(test_type_casting()); + CALL_SUBTEST(test_select()); } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 214f6951d..d270486f2 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -33,10 +33,10 @@ static void test_1d() float data3[6]; TensorMap > > vec3(data3, 6); - vec3 = vec1.cwiseSqrt(); + vec3 = vec1.sqrt(); float data4[6]; TensorMap, RowMajor> > vec4(data4, 6); - vec4 = vec2.cwiseSqrt(); + vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); @@ -92,8 +92,8 @@ static void test_2d() TensorFixedSize> mat3; TensorFixedSize, RowMajor> mat4; - mat3 = mat1.cwiseAbs(); - mat4 = mat2.cwiseAbs(); + mat3 = mat1.abs(); + mat4 = mat2.abs(); VERIFY_IS_EQUAL((mat3.size()), 2*3); // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); @@ -136,9 +136,9 @@ static void test_3d() } TensorFixedSize > mat3; - mat3 = mat1.cwiseSqrt(); + mat3 = mat1.sqrt(); TensorFixedSize, RowMajor> mat4; - mat4 = mat2.cwiseSqrt(); + mat4 = mat2.sqrt(); VERIFY_IS_EQUAL((mat3.size()), 2*3*7); // VERIFY_IS_EQUAL((mat3.dimension(0)), 2); @@ -173,7 +173,7 @@ static void test_array() } TensorFixedSize > mat3; - mat3 = mat1.cwisePow(3.5f); + mat3 = mat1.pow(3.5f); val = 0.0; for (int i = 0; i < 2; ++i) { diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index c9de71da3..b371e8a71 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -12,6 +12,7 @@ #include "main.h" #include +#include "thread/threadpool.h" using Eigen::Tensor; @@ -24,8 +25,10 @@ void test_cxx11_tensor_thread_pool() in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); - out.device(thread_pool_device) = in1 + in2 * 3.14; + ThreadPool thread_pool(2); + thread_pool.StartWorkers(); + Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3); + out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { From a961d72e65fc537fe571845407b4e2ee0554bd49 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 16:25:16 -0700 Subject: [PATCH 007/214] Added support for convolution and reshaping of tensors. 
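Both operations follow "valid"-style semantics: convolving a dimension of
size n with a kernel of size k shrinks it to n - k + 1, and a reshape only
reinterprets the existing coefficients under new dimensions. A short usage
sketch, assuming the convolve() and reshape() methods added below:

    // 1-D valid convolution: out(i) = sum_j in(i + j) * kernel(j)
    Eigen::Tensor<float, 1> in(10), kernel(3);
    in.setRandom(); kernel.setRandom();
    Eigen::array<Eigen::Tensor<float, 1>::Index, 1> dims({0});
    Eigen::Tensor<float, 1> out(8);   // 10 - 3 + 1 coefficients
    out = in.convolve(kernel, dims);

    // Reinterpret the 8 results as a 2x4 tensor.
    Eigen::array<Eigen::Tensor<float, 2>::Index, 2> shape({2, 4});
    Eigen::Tensor<float, 2> m(2, 4);
    m = out.reshape(shape);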
--- unsupported/Eigen/CXX11/Tensor | 2 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++ .../CXX11/src/Tensor/TensorConvolution.h | 206 ++++++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 119 ++++++++++ unsupported/test/cxx11_tensor_convolution.cpp | 70 ++++++ 6 files changed, 413 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h create mode 100644 unsupported/test/cxx11_tensor_convolution.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d4e8d3a15..c67020581 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -40,6 +40,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index c5c711313..932e5c82d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -203,6 +203,13 @@ class TensorBase return TensorContractionOp(derived(), other.derived(), dims); } + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + // Coefficient-wise ternary operators. template inline const TensorSelectOp @@ -210,6 +217,13 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + // Morphing operators (slicing tbd). + template + inline const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + // Select the device on which to evaluate the expression. template TensorDevice device(const DeviceType& device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 000000000..ca2e0e562 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,206 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
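+ // e.g. the intent is that an input and a kernel with different scalar
+ // types (say float and double) promote to the wider type.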
+ typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + kernelExpression() const { return m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = TensorEvaluator::Dimensions::count; + static const int KernelDims = Indices::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + { + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; + } else { + m_inputStride[0] = 1; + } + } + + for (int i = 0; i < KernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; + } else { + m_outputStride[0] = 1; + } + } + } + + 
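+ // Stride bookkeeping, illustrated on the 3x3 input convolved along
+ // dimension 0 with a length-2 kernel used in the tests: input strides
+ // are {1, 3}; the output is 2x3 with strides {1, 2}; the kernel stride
+ // is {1}; and m_indexStride[0] = m_inputStride[0] = 1 is the step taken
+ // in the input when the kernel slides by one coefficient.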
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index startInput = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + + CoeffReturnType result = CoeffReturnType(0); + convolve(startInput, 0, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex < KernelDims-1) { + convolve(input, kernel, DimIndex+1, accum); + } else { + + accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel); + } + } + } + + private: + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + Dimensions m_dimensions; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 239b5cb67..b8833362c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,9 +21,9 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorReductionOp; template class TensorContractionOp; - +template class TensorConvolutionOp; +template class TensorReshapingOp; template class TensorDevice; // Move to internal? diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 000000000..3e089fe1e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
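+ // (Reshaping has a single operand, so no actual promotion occurs here:
+ // the scalar, storage kind and index types are forwarded from XprType.)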
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReshapingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReshapingOp type; +}; + +} // end namespace internal + + + +template +class TensorReshapingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorReshapingOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + TensorEvaluator(const XprType& op) + : m_impl(op.expression()), m_dimensions(op.dimensions()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const NewDimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + private: + NewDimensions m_dimensions; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp new file mode 100644 index 000000000..95e40f64f --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
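+// The tests below check "valid" convolutions: each convolved dimension
+// shrinks from n to n - k + 1, and every output coefficient is compared
+// against an explicit sum of input * kernel products.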
+ +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_evals() +{ + Tensor input(3, 3); + Tensor kernel(2); + + input.setRandom(); + kernel.setRandom(); + + Tensor result(2,3); + result.setZero(); + Eigen::array::Index, 1> dims3({0}); + + TensorEvaluator eval(input.convolve(kernel, dims3)); + eval.evalTo(result.data()); + EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + VERIFY_IS_EQUAL(eval.dimensions()[0], 2); + VERIFY_IS_EQUAL(eval.dimensions()[1], 3); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 +} + + +static void test_expr() +{ + Tensor input(3, 3); + Tensor kernel(2, 2); + input.setRandom(); + kernel.setRandom(); + + Tensor result(2,2); + Eigen::array dims({0, 1}); + result = input.convolve(kernel, dims); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); +} + + +void test_cxx11_tensor_convolution() +{ + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_expr()); +} From 79085e08e9512f678b4584df49d1b2835b40117f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:16:13 -0700 Subject: [PATCH 008/214] Fixed a typo --- unsupported/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6072c9f3..e67e61263 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,7 +102,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") - ei_add_test(cxx11_tensor_comparison "-std=c++0x") + ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") From 29aebf96e62f4fb5e4b1f3fb475e299df2e7a02e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Jun 2014 20:18:44 -0700 Subject: [PATCH 009/214] Created the pblend packet primitive and implemented it using SSE and AVX instructions. 
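pblend is a per-lane select. For each lane i of the packet,

    result[i] = ifPacket.select[i] ? thenPacket[i] : elsePacket[i];

which is what the generic (scalar, size-1) implementation reduces to. The
SIMD versions build a mask of the lanes whose selector entry is zero and
use blendv-style instructions (or the or/and/andnot bit pattern when
SSE4.1 is not available) to pull those lanes from the "else" packet.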
--- Eigen/src/Core/GenericPacketMath.h | 14 ++++++++++ Eigen/src/Core/arch/AVX/PacketMath.h | 15 ++++++++++ Eigen/src/Core/arch/SSE/Complex.h | 8 +++++- Eigen/src/Core/arch/SSE/PacketMath.h | 41 ++++++++++++++++++++++++++-- test/packetmath.cpp | 16 +++++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 98313c68f..0869dd49f 100755 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -54,6 +54,7 @@ struct default_packet_traits HasMax = 1, HasConj = 1, HasSetLinear = 1, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, @@ -429,6 +430,19 @@ ptranspose(PacketBlock& /*kernel*/) { // Nothing to do in the scalar case, i.e. a 1x1 matrix. } +/*************************************************************************** + * Selector, i.e. vector of N boolean values used to select (i.e. blend) + * words from 2 packets. +***************************************************************************/ +template struct Selector { + bool select[N]; +}; + +template EIGEN_DEVICE_FUNC inline Packet +pblend(const Selector::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + return ifPacket.select[0] ? thenPacket : elsePacket; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 8b8307d75..688ff91e4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -59,6 +59,7 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 0, HasSqrt = 0 + HasBlend = 1, }; }; template<> struct packet_traits : default_packet_traits @@ -73,6 +74,7 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 0 + HasBlend = 1, }; }; @@ -557,6 +559,19 @@ ptranspose(PacketBlock& kernel) { kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); } +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 758183c18..0bc03cf9e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -44,7 +44,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; #endif @@ -472,6 +473,11 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = 
pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 6912f3bc3..1124b24df 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -108,7 +108,8 @@ template<> struct packet_traits : default_packet_traits HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -123,7 +124,8 @@ template<> struct packet_traits : default_packet_traits HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasBlend = 1 }; }; #endif @@ -135,7 +137,9 @@ template<> struct packet_traits : default_packet_traits // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; @@ -809,6 +813,37 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + } // end namespace internal } // end namespace Eigen diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 9dab07522..663ab886d 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -261,6 +261,22 @@ template void packetmath() VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose"); } } + + if (internal::packet_traits::HasBlend) { + Packet thenPacket = internal::pload(data1); + Packet elsePacket = internal::pload(data2); + EIGEN_ALIGN_DEFAULT internal::Selector selector; + for (int i = 0; i < PacketSize; ++i) { + selector.select[i] = i; + } + + Packet blend = internal::pblend(selector, thenPacket, elsePacket); + EIGEN_ALIGN_DEFAULT Scalar result[size]; + internal::pstore(result, blend); + for (int i = 0; i < PacketSize; ++i) { + VERIFY(isApproxAbs(result[i], (selector.select[i] ? 
data1[i] : data2[i]), refvalue)); + } + } } template void packetmath_real() From 8c8ae2d8193809744f5952713287639817e2b442 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 7 Jun 2014 11:24:38 -0700 Subject: [PATCH 010/214] Fixed a typo --- Eigen/src/Core/arch/AVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 688ff91e4..74d3746d9 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -58,8 +58,8 @@ template<> struct packet_traits : default_packet_traits HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0 - HasBlend = 1, + HasSqrt = 0, + HasBlend = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -73,8 +73,8 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, HasDiv = 1, - HasExp = 0 - HasBlend = 1, + HasExp = 0, + HasBlend = 1 }; }; From fe102248ac8f78e33064caeb5cdea6fc41af637c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:19:21 -0700 Subject: [PATCH 011/214] Fixed the threadpool test --- unsupported/test/cxx11_tensor_thread_pool.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index b371e8a71..2e67b2064 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -12,7 +12,6 @@ #include "main.h" #include -#include "thread/threadpool.h" using Eigen::Tensor; @@ -25,9 +24,7 @@ void test_cxx11_tensor_thread_pool() in1.setRandom(); in2.setRandom(); - ThreadPool thread_pool(2); - thread_pool.StartWorkers(); - Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, 3); + Eigen::ThreadPoolDevice thread_pool_device(3); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { From 2859a31ac80af86fa58e5347be50d32fd07bcd3c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:42:34 -0700 Subject: [PATCH 012/214] Fixed compilation error --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a2ff5b30..3a06170fa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -54,18 +54,18 @@ template class TensorMap : public Tensor PacketAccess = true, }; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
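// e.g. a rank-3 map must be given exactly three sizes:
//   TensorMap<Tensor<float, 3>> m(data, 2, 3, 7);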
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif inline TensorMap(PointerArgType dataPtr, const array& dimensions) From 36a2b2e9dc9368356b3f327a1fb00616397c1e0e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:43:51 -0700 Subject: [PATCH 013/214] Prevent the generation of unlaunchable cuda kernels when compiling in debug mode. --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index c92b8c679..3e5687915 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -73,7 +73,7 @@ struct Sizes : internal::numeric_list { typedef internal::numeric_list Base; static const std::size_t total_size = internal::arg_prod(Indices...); - static std::size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { return internal::arg_prod(Indices...); } @@ -119,7 +119,7 @@ template ::value; - static size_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { return internal::arg_prod::value; } @@ -156,7 +156,8 @@ namespace internal { template struct tensor_index_linearization_helper { - static inline Index run(array const& indices, array const& dimensions) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const& dimensions) { return array_get(indices) + array_get(dimensions) * @@ -167,7 +168,8 @@ struct tensor_index_linearization_helper template struct tensor_index_linearization_helper { - static inline Index run(array const& indices, array const&) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const&) { return array_get(indices); } From a669052f12d6d71ba815764d6419726d64fef675 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:45:30 -0700 Subject: [PATCH 014/214] Improved support for rvalues in tensor expressions. 
--- .../Eigen/CXX11/src/Tensor/TensorBase.h | 58 ++++++++++++++----- .../CXX11/src/Tensor/TensorContraction.h | 4 ++ .../CXX11/src/Tensor/TensorConvolution.h | 4 ++ .../Eigen/CXX11/src/Tensor/TensorExpr.h | 8 +++ .../src/Tensor/TensorForwardDeclarations.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 6 +- 7 files changed, 71 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 932e5c82d..e447a5d40 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -22,7 +22,7 @@ namespace Eigen { */ template -class TensorBase +class TensorBase { public: typedef typename internal::traits::Scalar Scalar; @@ -30,19 +30,6 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = random(); - } - // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -224,14 +211,53 @@ class TensorBase return TensorReshapingOp(derived(), newDimensions); } + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + + +template +class TensorBase : public TensorBase { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + // Select the device on which to evaluate the expression. 
template TensorDevice device(const DeviceType& device) { return TensorDevice(device, derived()); } - protected: - template friend class TensorBase; + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d424df36e..d371eb76d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -35,6 +35,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index ca2e0e562..501e9a522 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -35,6 +35,10 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 60908ee94..de66da13f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -36,6 +36,10 @@ struct traits > typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + + enum { + Flags = 0, + }; }; } // end namespace internal @@ -153,6 +157,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8833362c..1fb90478f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,7 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; -template class TensorBase; +template::value> class TensorBase; template class TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; @@ -29,6 +29,10 @@ template class TensorDevice; // Move to internal? template struct TensorEvaluator; +namespace internal { +template struct TensorAssign; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3e089fe1e..7d5f9271e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { template -struct traits > +struct traits > : public traits { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename XprType::Scalar Scalar; @@ -81,6 +81,7 @@ template struct TensorEvaluator > { typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; enum { IsAligned = TensorEvaluator::IsAligned, @@ -95,7 +96,7 @@ struct TensorEvaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const NewDimensions& dimensions() const { return m_dimensions; } + const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 2de698a57..40f805741 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -52,7 +52,7 @@ struct traits > typedef DenseIndex Index; enum { Options = Options_, - Flags = compute_tensor_flags::ret, + Flags = compute_tensor_flags::ret | LvalueBit, }; }; @@ -63,6 +63,10 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | LvalueBit, + }; }; From a77458a8ff2a83e716add62253eb50ef64980b21 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 10:06:57 -0700 Subject: [PATCH 015/214] Fixes compilation errors triggered when compiling the tensor contraction code with cxx11 enabled. --- .../CXX11/src/Core/util/CXX11Workarounds.h | 6 ++++++ .../CXX11/src/Core/util/EmulateCXX11Meta.h | 17 +++++++++++++---- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 16 ++++++++-------- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index f102872ae..423ca4be4 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,12 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 636063f9e..1d3164d6a 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -182,23 +182,32 @@ array repeat(t v) { } template -t array_prod(const array& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; for (size_t i = 0; i < n; ++i) { prod *= a[i]; } return prod; } template -t array_prod(const array& /*a*/) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } -template inline T& array_get(array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template inline const T& array_get(const array& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const T& array_get(const array& a) { return a[I]; } + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + struct sum_op { template static inline bool run(A a, B b) { return a + b; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d371eb76d..5149de1bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -107,7 +107,7 @@ struct TensorEvaluator XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -128,7 +128,7 @@ struct TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { skip = true; m_leftOffsets[2*skipped] = stride; @@ -151,7 +151,7 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { skip = true; m_rightOffsets[2*skipped] = stride; @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -209,7 +209,7 @@ struct TensorEvaluator::value-1) { partialStitch(left, right, StitchIndex+1, accum); } else { accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); @@ -218,9 +218,9 @@ struct TensorEvaluator m_leftOffsets; - array m_rightOffsets; - array m_stitchsize; + array::value> m_leftOffsets; + array::value> m_rightOffsets; + array::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; TensorEvaluator m_leftImpl; From 925fb6b93710b95082ba44d30405289dff3707eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 09:14:44 -0700 Subject: [PATCH 016/214] TensorEval are now typed on the device: this will make it possible to use partial template specialization to optimize the strategy of each evaluator for each device type. Started work on partial evaluations. --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 42 +++++----- .../Eigen/CXX11/src/Tensor/TensorBase.h | 14 +++- .../CXX11/src/Tensor/TensorContraction.h | 26 +++---- .../CXX11/src/Tensor/TensorConvolution.h | 20 ++--- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 28 ++++++- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++++---------- .../src/Tensor/TensorForwardDeclarations.h | 9 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++-- 9 files changed, 129 insertions(+), 102 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index da1eb62cb..633a7a31b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -32,15 +32,15 @@ namespace Eigen { namespace internal { // Default strategy: the expressions are evaluated with a single cpu thread. 
-template::PacketAccess & TensorEvaluator::PacketAccess> +template::PacketAccess & TensorEvaluator::PacketAccess> struct TensorAssign { typedef typename Derived1::Index Index; EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); for (Index i = 0; i < size; ++i) { evalDst.coeffRef(i) = evalSrc.coeff(i); @@ -49,19 +49,19 @@ struct TensorAssign }; -template -struct TensorAssign +template +struct TensorAssign { typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { @@ -116,12 +116,12 @@ struct TensorAssignMultiThreaded typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, DefaultDevice()); + TensorEvaluator evalSrc(src, Defaultevice()); const Index size = dst.size(); - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; + static const int PacketSize = Vectorizable ? 
unpacket_traits::PacketReturnType>::size : 1; int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); @@ -131,7 +131,7 @@ struct TensorAssignMultiThreaded vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -167,19 +167,19 @@ struct TensorAssignGpu typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); const int block_size = std::min(size, 32*32); const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); const int remaining_items = size % block_size; if (remaining_items > 0) { const int peel_start_offset = num_blocks * block_size; const int peel_block_size = std::min(size, 32); const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index e447a5d40..6b53d2a3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -198,19 +198,25 @@ class TensorBase } // Coefficient-wise ternary operators. - template - inline const TensorSelectOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Morphing operators (slicing tbd). - template - inline const TensorReshapingOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } + // Force the evaluation of the expression. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 5149de1bb..cadbabda2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,31 +102,31 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorContractionOp XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { Index index = 0; Index stride = 1; m_shiftright = 1; int skipped = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { @@ -148,8 +148,8 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -223,8 +223,8 @@ struct TensorEvaluator::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 501e9a522..a554b8260 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -94,27 +94,27 @@ class TensorConvolutionOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorConvolutionOp XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; + static const int NumDims = TensorEvaluator::Dimensions::count; static const int KernelDims = Indices::size; typedef typename XprType::Index Index; 
typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions()) { - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); for (int i = 0; i < NumDims; ++i) { if (i > 0) { @@ -200,8 +200,8 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; Dimensions m_dimensions; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index dbe60a165..ce524a818 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -31,7 +31,7 @@ template class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other); + internal::TensorAssign::run(m_expression, other, m_device); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index d7f5ab7c9..142edda14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -15,6 +15,12 @@ namespace Eigen { // Default device for the machine (typically a single cpu core) struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } }; @@ -22,14 +28,19 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } - /*ThreadPool* threadPool() const { return pool_; }*/ + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } private: // todo: NUMA, ... size_t num_threads_; - /*ThreadPool* pool_;*/ }; #endif @@ -40,7 +51,16 @@ struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } - const cudaStream_t& stream() const { return *stream_; } + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + void* result; + cudaMalloc(&result, num_bytes); + return result; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + cudaFree(buffer); + } private: // TODO: multigpu. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ab2513cea..80fe06957 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,7 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) */ -template +template struct TensorEvaluator { typedef typename Derived::Index Index; @@ -38,7 +38,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -73,8 +73,8 @@ struct TensorEvaluator // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; @@ -84,14 +84,14 @@ struct TensorEvaluator > }; EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op) - : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -108,32 +108,32 @@ struct TensorEvaluator > private: const NullaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseUnaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -150,33 +150,33 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseBinaryOp -------------------- -template -struct TensorEvaluator > +template +struct 
TensorEvaluator, Device> { typedef TensorCwiseBinaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_leftImpl(op.lhsExpression()), - m_rightImpl(op.rhsExpression()) + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -196,34 +196,34 @@ struct TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; // -------------------- SelectOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorSelectOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_condImpl(op.ifExpression()), - m_thenImpl(op.thenExpression()), - m_elseImpl(op.elseExpression()) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -248,9 +248,9 @@ struct TensorEvaluator } private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 1fb90478f..27bfe1d73 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,16 +21,17 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; -template class TensorDevice; +template class TensorForcedEvalOp; -// Move to internal? 
-template struct TensorEvaluator; +template class TensorDevice; +template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template struct TensorAssign; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7d5f9271e..e9e74581f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -77,19 +77,19 @@ class TensorReshapingOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) - : m_impl(op.expression()), m_dimensions(op.dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } typedef typename XprType::Index Index; @@ -111,7 +111,7 @@ struct TensorEvaluator > private: NewDimensions m_dimensions; - TensorEvaluator m_impl; + TensorEvaluator m_impl; }; From aa664eabb912a1b96e417e9a8d9c98f423b7fc23 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 10:31:29 -0700 Subject: [PATCH 017/214] Fixed a few compilation errors. --- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/test/CMakeLists.txt | 9 +++---- unsupported/test/cxx11_tensor_contraction.cpp | 26 ++++++++++++------- unsupported/test/cxx11_tensor_convolution.cpp | 7 ++--- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index a554b8260..c4cfe0cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ struct TensorEvaluator XprType; static const int NumDims = TensorEvaluator::Dimensions::count; - static const int KernelDims = Indices::size; + static const int KernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 4a151bfa7..34130a192 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -95,9 +95,8 @@ ei_add_test(bdcsvd) option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." ON) if(EIGEN_TEST_CXX11) - # FIXME: add C++11 compiler switch in some portable way - # (MSVC doesn't need any for example, so this will - # clash there) + # It should be safe to always run these tests as there is some fallback code for + # older compiler that don't support cxx11. 
ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") @@ -107,7 +106,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") - ei_add_test(cxx11_tensor_device "-std=c++0x") +# ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") +# ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 1c89dfdd1..fc67d500b 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -11,6 +11,7 @@ #include +using Eigen::DefaultDevice; using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; @@ -29,9 +30,10 @@ static void test_evals() Tensor mat4(3,3); mat4.setZero(); Eigen::array dims3({{DimPair(0, 0)}}); - TensorEvaluator eval(mat1.contract(mat2, dims3)); + typedef TensorEvaluator Evaluator; + Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice()); eval.evalTo(mat4.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 3); VERIFY_IS_EQUAL(eval.dimensions()[1], 3); @@ -48,9 +50,10 @@ static void test_evals() Tensor mat5(2,2); mat5.setZero(); Eigen::array dims4({{DimPair(1, 1)}}); - TensorEvaluator eval2(mat1.contract(mat2, dims4)); + typedef TensorEvaluator Evaluator2; + Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice()); eval2.evalTo(mat5.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval2.dimensions()[0], 2); VERIFY_IS_EQUAL(eval2.dimensions()[1], 2); @@ -62,9 +65,10 @@ static void test_evals() Tensor mat6(2,2); mat6.setZero(); Eigen::array dims6({{DimPair(1, 0)}}); - TensorEvaluator eval3(mat1.contract(mat3, dims6)); + typedef TensorEvaluator Evaluator3; + Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice()); eval3.evalTo(mat6.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval3.dimensions()[0], 2); VERIFY_IS_EQUAL(eval3.dimensions()[1], 2); @@ -86,9 +90,10 @@ static void test_scalar() Tensor scalar(1); scalar.setZero(); Eigen::array dims({{DimPair(0, 0)}}); - TensorEvaluator eval(vec1.contract(vec2, dims)); + typedef TensorEvaluator Evaluator; + Evaluator eval(vec1.contract(vec2, dims), DefaultDevice()); eval.evalTo(scalar.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE); float expected = 0.0f; for (int i = 0; i < 6; ++i) { @@ -109,9 +114,10 @@ static void test_multidims() Tensor mat3(2, 2, 2); mat3.setZero(); Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); - TensorEvaluator eval(mat1.contract(mat2, dims)); + typedef TensorEvaluator Evaluator; + Evaluator eval(mat1.contract(mat2, dims), DefaultDevice()); eval.evalTo(mat3.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, 
YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 2); VERIFY_IS_EQUAL(eval.dimensions()[1], 2); VERIFY_IS_EQUAL(eval.dimensions()[2], 2); diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index 95e40f64f..bafe73edd 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -12,7 +12,7 @@ #include using Eigen::Tensor; - +using Eigen::DefaultDevice; static void test_evals() { @@ -26,9 +26,10 @@ static void test_evals() result.setZero(); Eigen::array::Index, 1> dims3({0}); - TensorEvaluator eval(input.convolve(kernel, dims3)); + typedef TensorEvaluator Evaluator; + Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); eval.evalTo(result.data()); - EIGEN_STATIC_ASSERT(TensorEvaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE); VERIFY_IS_EQUAL(eval.dimensions()[0], 2); VERIFY_IS_EQUAL(eval.dimensions()[1], 3); From 38ab7e6ed0491bd5a0c639f218d5ea4728bf1e81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 09:56:51 -0700 Subject: [PATCH 018/214] Reworked the expression evaluation mechanism in order to make it possible to efficiently compute convolutions and contractions in the future: * The scheduling of computation is moved out the the assignment code and into a new TensorExecutor class * The assignment itself is now a regular node on the expression tree * The expression evaluators start by recursively evaluating all their subexpressions if needed --- unsupported/Eigen/CXX11/Tensor | 4 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 270 ++++++++---------- .../CXX11/src/Tensor/TensorContraction.h | 8 + .../CXX11/src/Tensor/TensorConvolution.h | 9 + .../Eigen/CXX11/src/Tensor/TensorDevice.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 146 ++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 56 +++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 194 +++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 142 +++++++++ .../src/Tensor/TensorForwardDeclarations.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorMap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 7 + 14 files changed, 695 insertions(+), 174 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c67020581..7e504b302 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,8 +42,12 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h 
index 7f614bbe8..09601fc7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -236,7 +236,9 @@ class Tensor : public TensorBase > // FIXME: we need to resize the tensor to fix the dimensions of the other. // Unfortunately this isn't possible yet when the rhs is an expression. // resize(other.dimensions()); - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 633a7a31b..a2a925775 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorAssign @@ -21,172 +17,134 @@ namespace Eigen { * * \brief The tensor assignment class. * - * This class is responsible for triggering the evaluation of the expressions - * used on the lhs and rhs of an assignment operator and copy the result of - * the evaluation of the rhs expression at the address computed during the - * evaluation lhs expression. - * - * TODO: vectorization. For now the code only uses scalars - * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. */ namespace internal { - -// Default strategy: the expressions are evaluated with a single cpu thread. -template::PacketAccess & TensorEvaluator::PacketAccess> -struct TensorAssign +template +struct traits > { - typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - for (Index i = 0; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; - -template -struct TensorAssign +template +struct eval, Eigen::Dense> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evalDst.template writePacket(i, evalSrc.template packet(i)); - } - for (Index i = VectorizedSize; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef const TensorAssignOp& type; }; - - -// Multicore strategy: the index space is partitioned and each core is assigned to a partition -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - for (Index i = first; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } -}; - -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - - Index i = first; - static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { - static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = RhsEval::IsAligned ? Aligned : Unaligned; - eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { - dst.template writePacket(i, src.template packet(i)); - } - } - - for (; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } -}; - -template -struct TensorAssignMultiThreaded +template +struct nested, 1, typename eval >::type> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) - { - TensorEvaluator evalDst(dst, DefaultDevice()); - TensorEvaluator evalSrc(src, Defaultevice()); - const Index size = dst.size(); - - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; - - int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); - } - - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } - - if (numblocks * blocksize < size) { - EvalRange, TensorEvaluator, Index>::run(evalDst, evalSrc, numblocks * blocksize, size); - } - } + typedef TensorAssignOp type; }; -#endif + +} // end namespace internal -// GPU: the evaluation of the expressions is offloaded to a GPU. 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - evalDst.coeffRef(index) = evalSrc.coeff(index); -} -template -__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - evalDst.coeffRef(index) = evalSrc.coeff(index); + +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorAssignOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_leftImpl.evalSubExprsIfNeeded(); + m_rightImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + } -template -struct TensorAssignGpu -{ - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); - } - } -}; -#endif - -} // end namespace internal - -} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cadbabda2..b2e12fd15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,6 +184,14 @@ struct TensorEvaluator class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -48,7 +51,10 @@ template class TensorDevice EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -67,13 +73,15 @@ template class TensorDevice template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignGpu::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType& m_expression; + ExpressionType m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..db716a80e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template +class TensorEvalToOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + Scalar* m_buffer; +}; + + + +template +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 80fe06957..5c8b079da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -38,27 +38,32 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); return m_data[index]; } - template + template EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template + template EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -95,13 +100,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -137,13 +145,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_argImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -184,12 +199,21 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -230,12 +254,24 @@ struct TensorEvaluator // TODO: use then or 
else impl instead if they happen to be known at compile time.
     return m_condImpl.dimensions();
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
+    m_condImpl.evalSubExprsIfNeeded();
+    m_thenImpl.evalSubExprsIfNeeded();
+    m_elseImpl.evalSubExprsIfNeeded();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_condImpl.cleanup();
+    m_thenImpl.cleanup();
+    m_elseImpl.cleanup();
+  }
+
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  PacketReturnType packet(Index index) const
   {
     static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..3e41f3290
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
+namespace Eigen {
+
+/** \class TensorExecutor
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor executor class.
+  *
+  * This class is responsible for launching the evaluation of the expression
+  * on the specified computing device.
+  */
+namespace internal {
+
+// Default strategy: the expression is evaluated with a single cpu thread.
+template::PacketAccess> +struct TensorExecutor +{ + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last,) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits::size; + if (last - first > PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + + static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + + int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + TensorEvaluator single_threaded_eval(expr, DefaultDevice()); + + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize)); + } + + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } + + if (numblocks * blocksize < size) { + EvalRange, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr); + } + + evaluator.cleanup(); + } +}; +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. 
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void EigenMetaKernelNoCheck(Evaluator eval) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + eval.evalScalar(index); +} +template +__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) { + const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + eval.evalScalar(index); + } +} + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + const int block_size = std::min(size, 32*32); + const int num_blocks = size / block_size; + EigenMetaKernelNoCheck > <<>>(evaluator); + + const int remaining_items = size % block_size; + if (remaining_items > 0) { + const int peel_start_offset = num_blocks * block_size; + const int peel_block_size = std::min(size, 32); + const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; + EigenMetaKernelPeel > <<>>(evaluator, peel_start_offset, size); + } + evaluator.cleanup(); + } +}; +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 789c04238..d42167da9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -200,7 +200,9 @@ class TensorFixedSize : public TensorBase::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 000000000..6f6641de6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +template +class TensorForcedEvalOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC ~TensorEvaluator() { + eigen_assert(!m_buffer); + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); + + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 27bfe1d73..c0dffbd0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,13 +25,16 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorAssignOp; + +template class 
TensorEvalToOp; template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template class TensorExecutor; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a06170fa..c97135b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -246,7 +246,9 @@ template class TensorMap : public Tensor EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) { - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e9e74581f..764bba4e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -98,6 +98,13 @@ struct TensorEvaluator, Device> const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); From f80c8e17eb042fc95767417eeca26cd3fa0c6ad6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 10:12:12 -0700 Subject: [PATCH 019/214] Silenced a compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 58b1808a3..4bdf74286 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -208,9 +208,9 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; - Dimensions m_dimensions; TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; }; From 774c3c1e0aca307e484b00997b735ee5964d96d4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 10:20:28 -0700 Subject: [PATCH 020/214] Created additional unit tests for the tensor code and improved existing ones. 
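One of the new tests (test_forced_contextual_eval) exercises eval() in the middle of a larger expression. A standalone sketch of the same pattern (hypothetical sizes, default single-threaded device):

#include <unsupported/Eigen/CXX11/Tensor>  // include path may vary per setup

int main() {
  Eigen::Tensor<float, 3> in1(2, 3, 7), in2(2, 3, 7), out(2, 3, 7);
  in1.setRandom();
  in2.setRandom();

  // eval() forces (in1 + in2) into a temporary buffer; the surrounding
  // expression is then evaluated lazily against that buffer.
  out = (in1 + in2).eval() * 3.14f + in1.constant(2.718f);
  return 0;
}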
--- unsupported/test/CMakeLists.txt | 3 + unsupported/test/cxx11_tensor_device.cpp | 28 ++++++++- unsupported/test/cxx11_tensor_lvalue.cpp | 42 +++++++++++++ unsupported/test/cxx11_tensor_morphing.cpp | 72 ++++++++++++++++++++++ 4 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_lvalue.cpp create mode 100644 unsupported/test/cxx11_tensor_morphing.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 34130a192..7458128fb 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -105,7 +105,10 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") +# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") + ei_add_test(cxx11_tensor_morphing "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index 365b109c7..caf2e9735 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -65,6 +65,12 @@ static void test_contextual_eval(Context* context) context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } +template +static void test_forced_contextual_eval(Context* context) +{ + context->out() = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); +} + static void test_cpu() { Eigen::Tensor in1(Eigen::array(2,3,7)); Eigen::Tensor in2(Eigen::array(2,3,7)); @@ -72,9 +78,9 @@ static void test_cpu() { in1.setRandom(); in2.setRandom(); + CPUContext context(in1, in2, out); test_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { @@ -82,6 +88,15 @@ static void test_cpu() { } } } + + test_forced_contextual_eval(&context); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + } + } + } } static void test_gpu() { @@ -111,7 +126,6 @@ static void test_gpu() { GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -120,6 +134,16 @@ static void test_gpu() { } } } + + test_forced_contextual_eval(&context); + cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + } + } + } } diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp new file mode 100644 index 000000000..071f5b406 --- /dev/null +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + + +static void test_compound_assignment() +{ + Tensor mat1(2,3,7); + Tensor mat2(2,3,7); + Tensor mat3(2,3,7); + + mat1.setRandom(); + mat2.setRandom(); + mat3 = mat1; + mat3 += mat2; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_lvalue() +{ + CALL_SUBTEST(test_compound_assignment()); +} diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp new file mode 100644 index 000000000..21af9e0b5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -0,0 +1,72 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_reshape() +{ + Tensor tensor1(2,3,1,7,1); + tensor1.setRandom(); + + Tensor tensor2(2,3,7); + Tensor tensor3(6,7); + Tensor tensor4(2,21); + + Tensor::Dimensions dim1{{2,3,7}}; + tensor2 = tensor1.reshape(dim1); + Tensor::Dimensions dim2{{6,7}}; + tensor3 = tensor1.reshape(dim2); + Tensor::Dimensions dim3{{2,21}}; + tensor4 = tensor1.reshape(dim1).reshape(dim3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); + } + } + } +} + + +static void test_reshape_in_expr() { + MatrixXf m1(2,3*5*7*11); + MatrixXf m2(3*5*7*11,13); + m1.setRandom(); + m2.setRandom(); + MatrixXf m3 = m1 * m2; + + TensorMap> tensor1(m1.data(), 2,3,5,7,11); + TensorMap> tensor2(m2.data(), 3,5,7,11,13); + Tensor::Dimensions newDims1{{2,3*5*7*11}}; + Tensor::Dimensions newDims2{{3*5*7*11,13}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; + Tensor tensor3(2,13); + tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); + + Map res(tensor3.data(), 2, 13); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 13; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + +void test_cxx11_tensor_morphing() +{ + CALL_SUBTEST(test_simple_reshape()); + CALL_SUBTEST(test_reshape_in_expr()); +} From 47981c5925caa8316205ea84b17616dd69073678 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:07:57 -0700 Subject: [PATCH 021/214] Added support for tensor slicing --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 343 +++++++++++++++++- 1 file changed, 327 insertions(+), 16 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 764bba4e6..55954a3a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -20,10 +20,9 @@ namespace Eigen { * */ namespace internal { -template -struct traits > : public traits +template +struct traits > : public traits { - // Type promotion to handle the case where the types of the lhs and 
the rhs are different. typedef typename XprType::Scalar Scalar; typedef typename internal::packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; @@ -32,24 +31,24 @@ struct traits > : public traits::type _Nested; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorReshapingOp& type; + typedef const TensorReshapingOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorReshapingOp type; + typedef TensorReshapingOp type; }; } // end namespace internal -template -class TensorReshapingOp : public TensorBase > +template +class TensorReshapingOp : public TensorBase, WriteAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -71,16 +70,27 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + protected: typename XprType::Nested m_xpr; const NewDimensions m_dims; }; -template -struct TensorEvaluator, Device> +// Eval as rvalue +template +struct TensorEvaluator, Device> { - typedef TensorReshapingOp XprType; + typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { @@ -88,7 +98,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } @@ -96,7 +106,7 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { m_impl.evalSubExprsIfNeeded(); @@ -116,12 +126,313 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + protected: + NewDimensions m_dimensions; + TensorEvaluator m_impl; +}; + + +// Eval as lvalue +// TODO(bsteiner): share the code with the evaluator for rvalue reshapes. 
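The lvalue evaluator that follows is what lets a reshape view appear on the left-hand side of an assignment. A sketch of the pattern it enables (hypothetical sizes, mirroring the tests added later in this series):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> tensor(2, 3, 7);
  tensor.setRandom();

  Eigen::Tensor<float, 2> tensor2d(6, 7);
  Eigen::Tensor<float, 3>::Dimensions dim{{2, 3, 7}};
  // Each coefficient of tensor is written through the reshaped
  // view into tensor2d's storage.
  tensor2d.reshape(dim) = tensor;
  return 0;
}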
+template +struct TensorEvaluator, Device> +{ + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return m_impl.coeffRef(index); + } + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + m_impl.template writePacket(index, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + private: NewDimensions m_dimensions; TensorEvaluator m_impl; }; +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSlicingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSlicingOp type; +}; + +} // end namespace internal + + + +template +class TensorSlicingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> 
+{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } else { + m_inputStrides[0] = 1; + } + } + + const Sizes& output_dims = op.sizes(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + } else { + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + private: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + const StartIndices m_offsets; + TensorEvaluator m_impl; +}; + + +// Eval as lvalue +// TODO(bsteiner): share the code with the evaluator for rvalue slices. 
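To make the stride walk in coeff() above concrete, here is a self-contained sketch of the same output-index to input-index mapping, with one worked value (plain C++; the rank is fixed at 3 and all names are illustrative, not Eigen's):

#include <cassert>

// Maps a linear index into a slice of extents out_dims, taken at offsets
// from a column-major tensor of extents in_dims, to a linear input index.
int sliceIndex(const int out_dims[3], const int in_dims[3],
               const int offsets[3], int index) {
  int out_strides[3], in_strides[3];
  out_strides[0] = in_strides[0] = 1;
  for (int i = 1; i < 3; ++i) {
    out_strides[i] = out_strides[i - 1] * out_dims[i - 1];
    in_strides[i]  = in_strides[i - 1] * in_dims[i - 1];
  }
  int input_index = 0;
  for (int i = 2; i >= 0; --i) {  // outermost dimension first
    const int idx = index / out_strides[i];
    input_index += (idx + offsets[i]) * in_strides[i];
    index -= idx * out_strides[i];
  }
  return input_index;
}

int main() {
  const int in_dims[3]  = {4, 5, 6};
  const int out_dims[3] = {2, 2, 2};
  const int offsets[3]  = {1, 2, 3};
  // Slice element (0,0,0) is input element (1,2,3), i.e.
  // 1*1 + 2*4 + 3*20 = 69 in column-major order.
  assert(sliceIndex(out_dims, in_dims, offsets, 0) == 69);
  return 0;
}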
+template +struct TensorEvaluator, Device> +{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } else { + m_inputStrides[0] = 1; + } + } + + const Sizes& output_dims = op.sizes(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + } else { + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef Sizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeffRef(inputIndex); + } + + private: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + const StartIndices m_offsets; + TensorEvaluator m_impl; +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H From bc072c5cba4cb6e9e7a6fd5f1e8f0e1231203223 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:08:45 -0700 Subject: [PATCH 022/214] Added support for tensor slicing --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 6b53d2a3d..527d47c57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,11 +204,16 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } - // Morphing operators (slicing tbd). + // Morphing operators. 
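In use, the morphing operators look like this (hypothetical sizes; the DSizes/array construction mirrors the style of the unit tests in this series, and exact index types may differ by Eigen version):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(4, 5, 6);
  input.setRandom();

  Eigen::DSizes<Eigen::DenseIndex, 3> offsets(Eigen::array<Eigen::DenseIndex, 3>(1, 2, 3));
  Eigen::DSizes<Eigen::DenseIndex, 3> extents(Eigen::array<Eigen::DenseIndex, 3>(2, 2, 2));

  Eigen::Tensor<float, 3> patch(2, 2, 2);
  patch = input.slice(offsets, extents);  // read through a slice
  input.slice(offsets, extents) = patch;  // write through a slice
  return 0;
}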
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); } // Force the evaluation of the expression. @@ -257,6 +262,17 @@ class TensorBase : public TensorBase, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + // Select the device on which to evaluate the expression. template TensorDevice device(const DeviceType& device) { From 7d53633e05986c61ce90e7fc36862d529c0cc036 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 7 Jul 2014 14:10:36 -0700 Subject: [PATCH 023/214] Added support for tensor slicing --- unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index c0dffbd0c..5d6e7776a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,6 +25,7 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorSlicingOp; template class TensorAssignOp; template class TensorEvalToOp; From c285fda7f40ca161e6c8e66481d9a68e50613c48 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:30:48 -0700 Subject: [PATCH 024/214] Extended the functionality of the TensorDeviceType classes --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 142edda14..b9c8c19fe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -21,6 +21,12 @@ struct DefaultDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } }; @@ -28,7 +34,7 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. 
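The memcpy/memset additions give DefaultDevice, ThreadPoolDevice and GpuDevice a common minimal interface. A toy stand-in showing that contract (illustrative only, not part of Eigen):

#include <cstddef>
#include <cstdlib>
#include <cstring>

// Illustrative stand-in implementing the same four-function contract
// (allocate / deallocate / memcpy / memset) the patch establishes.
struct ToyDevice {
  void* allocate(std::size_t num_bytes) const { return std::malloc(num_bytes); }
  void deallocate(void* buffer) const { std::free(buffer); }
  void memcpy(void* dst, const void* src, std::size_t n) const { std::memcpy(dst, src, n); }
  void memset(void* buffer, int c, std::size_t n) const { std::memset(buffer, c, n); }
};

int main() {
  ToyDevice d;
  void* buf = d.allocate(64);
  d.memset(buf, 0, 64);
  d.deallocate(buf);
  return 0;
}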
#ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { @@ -37,6 +43,12 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } private: // todo: NUMA, ... @@ -47,20 +59,61 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU +static int m_numMultiProcessors = 0; +static int m_maxThreadsPerBlock = 0; +static int m_maxThreadsPerMultiProcessor = 0; + +static inline int getNumCudaMultiProcessors() { + if (m_numMultiProcessors == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_numMultiProcessors = deviceProp.multiProcessorCount; + } + return m_numMultiProcessors; +} +static inline int maxCudaThreadsPerBlock() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + } + return m_maxThreadsPerBlock; +} +static inline int maxCudaThreadsPerMultiProcessor() { + if (m_maxThreadsPerBlock == 0) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + m_numMultiProcessors = deviceProp.multiProcessorCount; + m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; + m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; + } + return m_maxThreadsPerMultiProcessor; +} + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { void* result; cudaMalloc(&result, num_bytes); return result; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const { cudaFree(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_); + } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + cudaMemsetAsync(buffer, c, n, *stream_); + } private: // TODO: multigpu. From cc1bacea5b6b532728a001f8cfcf762e5385dcef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:39:28 -0700 Subject: [PATCH 025/214] Improved the efficiency of the tensor evaluation code on thread pools and gpus. 
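For intuition about the thread-pool strategy this patch tunes, the block partitioning arithmetic can be run standalone (hypothetical values; mirrors the computation in the executor):

#include <algorithm>
#include <cmath>
#include <iostream>

int main() {
  const int size = 1000;      // evaluator.dimensions().TotalSize()
  const int num_threads = 8;  // device.numThreads()
  const int PacketSize = 4;   // e.g. four floats per SSE packet

  // Round the per-thread block up, then down to a packet multiple.
  int blocksz = static_cast<int>(
      std::ceil(static_cast<double>(size) / num_threads)) + PacketSize - 1;
  const int blocksize = std::max(PacketSize, blocksz - (blocksz % PacketSize));
  const int numblocks = size / blocksize;

  std::cout << "blocksize = " << blocksize                  // 128
            << ", numblocks = " << numblocks                // 7
            << ", tail = " << size - numblocks * blocksize  // 104, run inline
            << std::endl;
  return 0;
}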
--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 3e41f3290..f50f839fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -77,17 +77,17 @@ struct TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator& evaluator, const Index first, const Index last) { + static void run(Evaluator* evaluator, const Index first, const Index last) { eigen_assert(last > first); for (Index i = first; i < last; ++i) { - evaluator.evalScalar(i); + evaluator->evalScalar(i); } } }; template struct EvalRange { - static void run(Evaluator& evaluator, const Index first, const Index last,) { + static void run(Evaluator* evaluator, const Index first, const Index last) { eigen_assert(last > first); Index i = first; @@ -96,12 +96,12 @@ struct EvalRange { eigen_assert(first % PacketSize == 0); Index lastPacket = last - (last % PacketSize); for (; i < lastPacket; i += PacketSize) { - evaluator.evalPacket(i); + evaluator->evalPacket(i); } } for (; i < last; ++i) { - evaluator.evalScalar(i); + evaluator->evalScalar(i); } } }; @@ -112,24 +112,23 @@ struct TensorExecutor typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { - TensorEvaluator evaluator(expr, device); + typedef TensorEvaluator Evaluator; + Evaluator evaluator(expr, device); evaluator.evalSubExprsIfNeeded(); const Index size = evaluator.dimensions().TotalSize(); - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - TensorEvaluator single_threaded_eval(expr, DefaultDevice()); - Index i = 0; vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -137,7 +136,7 @@ struct TensorExecutor } if (numblocks * blocksize < size) { - EvalRange, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr); + EvalRange::run(&evaluator, numblocks * blocksize, size); } evaluator.cleanup(); @@ -149,15 +148,11 @@ struct TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. 
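The hunk below replaces the main-plus-peel kernel pair with a single kernel built around a grid-stride loop. The pattern in isolation, as a hedged sketch (a toy saxpy kernel, illustrative only):

// Toy kernel showing the grid-stride loop pattern: a fixed-size grid
// covers any number of elements, so no separate peel kernel is needed.
__global__ void ToySaxpy(float a, const float* x, float* y, int n) {
  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int step_size = blockDim.x * gridDim.x;  // total threads in the grid
  for (int i = first_index; i < n; i += step_size) {
    y[i] = a * x[i] + y[i];
  }
}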
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernelNoCheck(Evaluator eval) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - eval.evalScalar(index); -} -template -__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - eval.evalScalar(index); +__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; + const int step_size = blockDim.x * gridDim.x; + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); } } @@ -169,19 +164,12 @@ struct TensorExecutor { TensorEvaluator evaluator(expr, device); evaluator.evalSubExprsIfNeeded(); + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); const Index size = evaluator.dimensions().TotalSize(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck > <<>>(evaluator); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel > <<>>(evaluator, peel_start_offset, size); - } + EigenMetaKernel > <<>>(evaluator, size); + eigen_assert(cudaGetLastError() == cudaSuccess); evaluator.cleanup(); } }; From ea0906dfd877b3be91b5b0a28d2040ec360b1d3a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 8 Jul 2014 16:43:28 -0700 Subject: [PATCH 026/214] Improved evaluation of tensor expressions when used as rvalues --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 5c8b079da..ac9829ce9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,6 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) 
*/ +// Generic evaluator template struct TensorEvaluator { @@ -38,7 +39,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -75,6 +76,49 @@ struct TensorEvaluator }; +// Default evaluator for rvalues +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Packet Packet; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename Derived::Packet PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = Derived::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) + : m_data(m.data()), m_dims(m.dimensions()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); + return m_data[index]; + } + + template EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt(m_data + index); + } + + protected: + const Scalar* m_data; + Dimensions m_dims; +}; + + + // -------------------- CwiseNullaryOp -------------------- From 25b2f6624d092ed99d0c4936de0c83c9ea4a024d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 9 Jul 2014 12:48:34 -0700 Subject: [PATCH 027/214] Improved the speed of slicing operations. 
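The gain here: for a column-major layout both m_outputStrides[0] and m_inputStrides[0] equal 1, so the innermost step of the index walk needs no division. A tiny sketch of the equivalence (arbitrary values):

#include <cassert>

int main() {
  const int offset0 = 2;
  const int index = 5;  // remainder after processing the outer dimensions

  // Before: one more division/multiplication pair for dimension 0.
  const int idx = index / 1;
  const int before = (idx + offset0) * 1;

  // After: the division is peeled away.
  const int after = index + offset0;

  assert(before == after);
  return 0;
}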
--- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 55954a3a7..f6f67afa7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -320,11 +320,12 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeff(inputIndex); } @@ -399,11 +400,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeff(inputIndex); } @@ -416,11 +418,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + inputIndex += (index + m_offsets[0]); return m_impl.coeffRef(inputIndex); } From ffd3654f6738bab79db010e02cd67660ecca62c1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:09:46 -0700 Subject: [PATCH 028/214] Vectorized the evaluation of expressions involving tensor slices. --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 108 ++++++++++++++++-- 1 file changed, 98 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index f6f67afa7..3b42c8514 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -273,8 +273,10 @@ struct TensorEvaluator, Devi static const int NumDims = internal::array_size::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
+ IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -329,11 +331,40 @@ struct TensorEvaluator, Devi return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } private: Dimensions m_dimensions; @@ -353,8 +384,8 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::value; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -409,11 +440,38 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { @@ -427,6 
+485,36 @@ struct TensorEvaluator, Device> return m_impl.coeffRef(inputIndex); } + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + m_impl.template writePacket(inputIndices[0], x); + } + else { + CoeffReturnType values[packetSize]; + internal::pstore(values, x); + m_impl.coeffRef(inputIndices[0]) = values[0]; + m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + coeffRef(index+i) = values[i]; + } + } + } + private: Dimensions m_dimensions; array m_outputStrides; From 9b7a6f0122f6817a3c12bc75803d4270cd9db507 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:27:27 -0700 Subject: [PATCH 029/214] Added tests for tensor slicing --- unsupported/test/cxx11_tensor_morphing.cpp | 132 ++++++++++++++++++++- 1 file changed, 130 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 21af9e0b5..fbfdaadb7 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -52,8 +52,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - typedef Tensor::DimensionPair DimPair; - array contract_along{{DimPair(1, 0)}}; + array::DimensionPair, 1> contract_along{{1, 0}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -65,8 +64,137 @@ static void test_reshape_in_expr() { } } + +static void test_reshape_as_lvalue() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2d(6,7); + Tensor::Dimensions dim{{2,3,7}}; + tensor2d.reshape(dim) = tensor; + + Tensor tensor5d(2,3,1,7,1); + tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + } + } + } +} + + +static void test_simple_slice() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor slice1(1,1,1,1,1); + Eigen::DSizes indices(Eigen::array(1,2,3,4,5)); + Eigen::DSizes sizes(Eigen::array(1,1,1,1,1)); + slice1 = tensor.slice(indices, sizes); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + Tensor slice2(1,1,2,2,3); + Eigen::DSizes indices2(Eigen::array(1,1,3,4,5)); + Eigen::DSizes sizes2(Eigen::array(1,1,2,2,3)); + slice2 = tensor.slice(indices2, sizes2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } +} + + +static void test_slice_in_expr() { + MatrixXf m1(7,7); + MatrixXf m2(3,3); + m1.setRandom(); + 
m2.setRandom(); + + MatrixXf m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); + + TensorMap> tensor1(m1.data(), 7, 7); + TensorMap> tensor2(m2.data(), 3, 3); + Tensor tensor3(3,1); + array::DimensionPair, 1> contract_along{{1, 0}}; + + Eigen::DSizes indices1(Eigen::array(1,2)); + Eigen::DSizes sizes1(Eigen::array(3,3)); + Eigen::DSizes indices2(Eigen::array(0,2)); + Eigen::DSizes sizes2(Eigen::array(3,1)); + tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); + + Map res(tensor3.data(), 3, 1); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 1; ++j) { + VERIFY_IS_APPROX(res(i,j), m3(i,j)); + } + } +} + + +static void test_slice_as_lvalue() +{ + Tensor tensor1(2,2,7); + tensor1.setRandom(); + Tensor tensor2(2,2,7); + tensor2.setRandom(); + Tensor tensor3(4,3,5); + tensor3.setRandom(); + Tensor tensor4(4,3,2); + tensor4.setRandom(); + + Tensor result(4,5,7); + Eigen::DSizes sizes12(Eigen::array(2,2,7)); + Eigen::DSizes first_slice(Eigen::array(0,0,0)); + result.slice(first_slice, sizes12) = tensor1; + Eigen::DSizes second_slice(Eigen::array(2,0,0)); + result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; + + Eigen::DSizes sizes3(Eigen::array(4,3,5)); + Eigen::DSizes third_slice(Eigen::array(0,2,0)); + result.slice(third_slice, sizes3) = tensor3; + + Eigen::DSizes sizes4(Eigen::array(4,3,2)); + Eigen::DSizes fourth_slice(Eigen::array(0,2,5)); + result.slice(fourth_slice, sizes4) = tensor4; + + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k)); + VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k)); + } + } + } + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k)); + } + for (int k = 5; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5)); + } + } + } +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); CALL_SUBTEST(test_reshape_in_expr()); + CALL_SUBTEST(test_reshape_as_lvalue()); + + CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_slice_in_expr()); + CALL_SUBTEST(test_slice_as_lvalue()); } From 40bb98e76acbe6e077903e15896c100ee6cced39 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 10 Jul 2014 11:29:51 -0700 Subject: [PATCH 030/214] Added primitives to compare tensor dimensions --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3e5687915..3b169a06f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -210,6 +210,60 @@ struct DSizes : array { }; +namespace internal { + +template struct array_size > { + static const size_t value = NumDims; +}; +template struct array_size > { + static const size_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template struct array_size > { +static const size_t value = Sizes::count; +}; +template struct array_size > { +static const size_t value = Sizes::count; +}; +#else +template struct array_size > { + static const size_t value = Sizes::count; +}; +template struct array_size > { + static const size_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { + return get::Base>::value; +}; + +#endif + + +template +struct 
sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) & + sizes_match_up_to_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_up_to_dim { + static inline bool run(Dims1& dims1, Dims2& dims2) { + return (array_get<0>(dims1) == array_get<0>(dims2)); + } +}; + +template +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + if (array_size::value != array_size::value) { + return false; + } + return sizes_match_up_to_dim::value-1>::run(dims1, dims2); +} + +} // end namespace internal + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H From f7bb7ee3f36474163da7c7f6f88306d553238df2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Jul 2014 10:31:21 -0700 Subject: [PATCH 031/214] Fixed the assignment operator of the Tensor and TensorMap classes. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 11 +++++++++++ unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 9 ++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 09601fc7d..547bb74d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -229,6 +229,17 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { + // FIXME: we need to resize the tensor to fix the dimensions of the other. + // Unfortunately this isn't possible yet when the rhs is an expression. + // resize(other.dimensions()); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index c97135b63..417717b90 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -241,9 +241,16 @@ template class TensorMap : public Tensor } #endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const OtherDerived& other) { typedef TensorAssignOp Assign; From 1f371e78e659d6e5fd781aea93b6b9c7a0604aeb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 22 Jul 2014 10:32:40 -0700 Subject: [PATCH 032/214] Added a few tests to validate the behavior of the assignment operator. 
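In a nutshell, the property these tests pin down, as a rough sketch (the real tests below also cover TensorFixedSize and TensorMap; assumes the usual #include <Eigen/CXX11/Tensor> and <cassert>):

    Eigen::Tensor<int, 1> src(5), dst(5);
    src.setRandom();
    dst.setRandom();
    int* dst_buffer = dst.data();
    dst = src;                         // coefficient-wise copy via TensorAssignOp
    assert(dst.data() == dst_buffer);  // same-type assignment reuses the storage
    for (int i = 0; i < 5; ++i) {
      assert(dst(i) == src(i));        // every value now matches the source
    }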
--- unsupported/test/cxx11_tensor_assign.cpp | 43 ++++++++++++++++++++++++ unsupported/test/cxx11_tensor_simple.cpp | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index c88872950..b024bed19 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -186,10 +186,53 @@ static void test_3d() } } +static void test_same_type() +{ + Tensor orig_tensor(5); + Tensor dest_tensor(5); + orig_tensor.setRandom(); + dest_tensor.setRandom(); + int* orig_data = orig_tensor.data(); + int* dest_data = dest_tensor.data(); + dest_tensor = orig_tensor; + VERIFY_IS_EQUAL(orig_tensor.data(), orig_data); + VERIFY_IS_EQUAL(dest_tensor.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i)); + } + + TensorFixedSize > orig_array; + TensorFixedSize > dest_array; + orig_array.setRandom(); + dest_array.setRandom(); + orig_data = orig_array.data(); + dest_data = dest_array.data(); + dest_array = orig_array; + VERIFY_IS_EQUAL(orig_array.data(), orig_data); + VERIFY_IS_EQUAL(dest_array.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest_array(i), orig_array(i)); + } + + int orig[5] = {1, 2, 3, 4, 5}; + int dest[5] = {6, 7, 8, 9, 10}; + TensorMap > orig_map(orig, 5); + TensorMap > dest_map(dest, 5); + orig_data = orig_map.data(); + dest_data = dest_map.data(); + dest_map = orig_map; + VERIFY_IS_EQUAL(orig_map.data(), orig_data); + VERIFY_IS_EQUAL(dest_map.data(), dest_data); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(dest[i], i+1); + } +} + void test_cxx11_tensor_assign() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); + CALL_SUBTEST(test_same_type()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 1f76033ea..1455f2a4c 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -244,7 +244,7 @@ static void test_simple_assign() epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1; epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1; - Tensor e2(2,3,1); + Tensor e2(3,3,3); e2.setZero(); VERIFY_IS_EQUAL((e2(1,2,0)), 0); From 2116e261fb27c795d153f171467cf7912ff3eec5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 25 Jul 2014 09:47:59 -0700 Subject: [PATCH 033/214] Made sure that the data stored in fixed-size tensors is aligned. --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index c9d6517eb..0c4f8a3d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -38,7 +38,7 @@ template Date: Thu, 31 Jul 2014 17:39:04 -0700 Subject: [PATCH 034/214] The tensor assignment code now resizes the destination tensor as needed.
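Sketch of the new behavior (illustrative sizes): the assignment evaluator is asked for the dimensions of the right-hand side, and the destination is resized before the copy runs.

    Eigen::Tensor<float, 2> a(2, 3);
    a.setRandom();
    Eigen::Tensor<float, 2> b(5, 7);   // deliberately mismatched
    b = a;                             // b is resized to 2x3, then filled
    assert(b.dimension(0) == 2 && b.dimension(1) == 3);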
--- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 547bb74d1..fdbe8df4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -79,6 +79,7 @@ class Tensor : public TensorBase > }; static const int Options = Options_; + static const std::size_t NumIndices = NumIndices_; typedef DSizes Dimensions; @@ -232,11 +233,9 @@ class Tensor : public TensorBase > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { - // FIXME: we need to resize the tensor to fix the dimensions of the other. - // Unfortunately this isn't possible yet when the rhs is an expression. - // resize(other.dimensions()); typedef TensorAssignOp Assign; Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } @@ -244,11 +243,9 @@ class Tensor : public TensorBase > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) { - // FIXME: we need to resize the tensor to fix the dimensions of the other. - // Unfortunately this isn't possible yet when the rhs is an expression. - // resize(other.dimensions()); typedef TensorAssignOp Assign; Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } From 439feca139a093292923e14c085352e5dd2239a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:22:05 -0700 Subject: [PATCH 035/214] Reworked the TensorExecutor code to support in place evaluation. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 106 ++++++++++-------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index f50f839fc..d6e2ab1a2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -11,7 +11,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #ifdef EIGEN_USE_THREADS -#include +#include " #endif namespace Eigen { @@ -28,45 +28,49 @@ namespace internal { // Default strategy: the expression is evaluated with a single cpu thread. 
template::PacketAccess> -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device()) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - - const Index size = evaluator.dimensions().TotalSize(); - for (Index i = 0; i < size; ++i) { - evaluator.evalScalar(i); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } } - evaluator.cleanup(); } }; template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int VectorizedSize = (size / PacketSize) * PacketSize; - const Index size = evaluator.dimensions().TotalSize(); - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } } - for (Index i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); - } - evaluator.cleanup(); } }; @@ -107,38 +111,40 @@ struct EvalRange { }; template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; Evaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = evaluator.dimensions().TotalSize(); - const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; - static const int PacketSize = Vectorizable ? 
unpacket_traits::size : 1; + int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; - int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + } - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } + + if (numblocks * blocksize < size) { + EvalRange::run(&evaluator, numblocks * blocksize, size); + } } - - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } - - if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); - } - evaluator.cleanup(); } }; @@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { } template -struct TensorExecutor +class TensorExecutor { + public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - evaluator.evalSubExprsIfNeeded(); - const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); - const int block_size = maxCudaThreadsPerBlock(); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); - const Index size = evaluator.dimensions().TotalSize(); - EigenMetaKernel > <<>>(evaluator, size); - eigen_assert(cudaGetLastError() == cudaSuccess); + const Index size = evaluator.dimensions().TotalSize(); + EigenMetaKernel > <<>>(evaluator, size); + assert(cudaGetLastError() == cudaSuccess); + } evaluator.cleanup(); } }; From b1892ab14d8ac94bef233d0cef0ef7df1e9a592e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:26:44 -0700 Subject: [PATCH 036/214] Added support for in place evaluation to simple tensor expressions. Use memcpy to speed up tensor copies whenever possible.
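The fast path this enables, sketched with illustrative sizes: when the right-hand side of an assignment is already materialized in memory, its evaluator is handed the destination buffer and fills it with a single memcpy instead of a coefficient loop.

    Eigen::Tensor<float, 3> a(20, 30, 40);
    a.setRandom();
    Eigen::Tensor<float, 3> b(20, 30, 40);
    b = a;  // a's evaluator receives b.data(), copies all 20*30*40 floats in
            // one memcpy, and returns false so no per-coefficient loop runs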
--- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 12 +++- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 55 ++++++++++++++----- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index a2a925775..3bfe80c9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -102,6 +102,7 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -112,9 +113,14 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_leftImpl.evalSubExprsIfNeeded(); - m_rightImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + eigen_assert(internal::dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ac9829ce9..0f969036c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -39,13 +39,20 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) - : m_data(const_cast(m.data())), m_dims(m.dimensions()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(const_cast(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + return false; + } + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -70,9 +77,12 @@ struct TensorEvaluator return internal::pstoret(m_data + index, x); } + Scalar* data() const { return m_data; } + protected: Scalar* m_data; Dimensions m_dims; + const Device& m_device; }; @@ -98,7 +108,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -112,6 +122,8 @@ struct TensorEvaluator return 
internal::ploadt(m_data + index); } + const Scalar* data() const { return m_data; } + protected: const Scalar* m_data; Dimensions m_dims; @@ -138,13 +150,14 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -158,6 +171,8 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } + Scalar* data() const { return NULL; } + private: const NullaryOp m_functor; TensorEvaluator m_argImpl; @@ -183,14 +198,16 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_argImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); @@ -207,6 +224,8 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: const UnaryOp m_functor; TensorEvaluator m_argImpl; @@ -233,6 +252,7 @@ struct TensorEvaluator::Dimensions Dimensions; @@ -243,9 +263,10 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: const BinaryOp m_functor; TensorEvaluator m_leftImpl; @@ -289,6 +312,7 @@ struct TensorEvaluator { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; @@ -299,10 +323,11 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_condImpl.evalSubExprsIfNeeded(); - m_thenImpl.evalSubExprsIfNeeded(); - m_elseImpl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); @@ -327,6 +352,8 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } + Scalar* data() const { return NULL; } + private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; From 1aa2bf82741f2f51fbf0a29ff95e0d017f6962a3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:27:58 -0700 Subject: [PATCH 037/214] Support for in place evaluation of expressions containing slicing and reshaping operations --- 
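Note: a reshape is a no-op on the underlying buffer, so forwarding the destination pointer through TensorReshapingOp keeps the memcpy fast path alive, while a slice is not contiguous in its source, so the slicing evaluators below deliberately pass NULL down and stay coefficient-wise. A sketch of an expression that benefits (illustrative sizes):

    Eigen::Tensor<float, 3> src(2, 3, 7);
    src.setRandom();
    Eigen::Tensor<float, 2>::Dimensions flat{{6, 7}};
    Eigen::Tensor<float, 2> dst(6, 7);
    dst = src.reshape(flat);  // the reshape evaluator hands dst's buffer to
                              // src, which can fill it with a single copy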
.../Eigen/CXX11/src/Tensor/TensorMorphing.h | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3b42c8514..2b1b503cf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -103,13 +103,14 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -126,6 +127,8 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + Scalar* data() const { return NULL; } + protected: NewDimensions m_dimensions; TensorEvaluator m_impl; @@ -150,13 +153,14 @@ struct TensorEvaluator, Device> { } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -182,6 +186,8 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } + Scalar* data() const { return NULL; } + private: NewDimensions m_dimensions; TensorEvaluator m_impl; @@ -306,14 +312,16 @@ struct TensorEvaluator, Devi } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -366,6 +374,8 @@ struct TensorEvaluator, Devi } } + Scalar* data() const { return NULL; } + private: Dimensions m_dimensions; array m_outputStrides; @@ -415,14 +425,16 @@ struct TensorEvaluator, Device> } typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + 
return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -515,6 +527,8 @@ struct TensorEvaluator, Device> } } + Scalar* data() const { return NULL; } + private: Dimensions m_dimensions; array m_outputStrides; From 72e75297089e7c141108696195763c024571974d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:29:40 -0700 Subject: [PATCH 038/214] Fixed a typo. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d6e2ab1a2..faf965df8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -11,7 +11,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #ifdef EIGEN_USE_THREADS -#include " +#include #endif namespace Eigen { From f8fad09301106c574ed88ffde52e15483d14673f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:33:18 -0700 Subject: [PATCH 039/214] Updated the convolution and contraction evaluators to follow the new EvalSubExprsIfNeeded API. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 7 ++++--- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2e12fd15..8d7a1351e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,9 +184,10 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); @@ -151,11 +151,12 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:36:33 -0700 Subject: [PATCH 040/214] Added missing APIs. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8d7a1351e..b2969337f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -173,7 +173,7 @@ struct TensorEvaluator::value> m_leftOffsets; array::value> m_rightOffsets; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 8864c5329..e3068dcae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -148,6 +148,7 @@ struct TensorEvaluator m_inputStride; array m_outputStride; From f1d8c13dbcbe38938dcd727f9b50339a981197c3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:40:26 -0700 Subject: [PATCH 041/214] Fixed misc typos.
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2969337f..897d73806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,7 +184,7 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:44:47 -0700 Subject: [PATCH 042/214] Added ability to get the nth element from an abstract array type. --- .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 1d3164d6a..4c6b95773 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -181,6 +181,15 @@ array repeat(t v) { return array; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list& a) { + return get >::value; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list& a) { + return get >::value; } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -196,8 +205,8 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const T& array_get(const array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } From eeb43f9e2b7ac56af685d8fc494685df8227a53f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:22:47 -0700 Subject: [PATCH 043/214] Added support for padding, striding, and shuffling --- unsupported/Eigen/CXX11/Tensor | 3 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 15 ++ .../src/Tensor/TensorForwardDeclarations.h | 3 + .../Eigen/CXX11/src/Tensor/TensorPadding.h | 163 +++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 168 +++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorStriding.h | 172 ++++++++++++++++++ 6 files changed, 524 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7e504b302..0775d440a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,6 +42,9 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 527d47c57..0295fcdbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++
b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -215,6 +215,21 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return TensorPaddingOp(derived(), padding); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } // Force the evaluation of the expression. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 5d6e7776a..baa5968bc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -26,6 +26,9 @@ template class template class TensorConvolutionOp; template class TensorReshapingOp; template class TensorSlicingOp; +template class TensorPaddingOp; +template class TensorShufflingOp; +template class TensorStridingOp; template class TensorAssignOp; template class TensorEvalToOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 000000000..45558d7dd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,163 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only 0-padding is supported. 
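+ *
+ * A usage sketch (illustrative sizes; the real coverage is in the
+ * cxx11_tensor_padding.cpp test added later in this series):
+ * \code
+ * Eigen::Tensor<float, 2> t(3, 4);
+ * t.setRandom();
+ * Eigen::array<std::pair<int, int>, 2> paddings;
+ * paddings[0] = std::make_pair(1, 2);  // 1 zero before, 2 zeros after dim 0
+ * paddings[1] = std::make_pair(0, 0);  // dimension 1 is left untouched
+ * Eigen::Tensor<float, 2> padded;
+ * padded = t.pad(paddings);            // 6 x 4, zero outside the original block
+ * \endcode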
+ * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPaddingOp type; +}; + +} // end namespace internal + + + +template +class TensorPaddingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims) + : m_xpr(expr), m_padding_dims(padding_dims) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return m_padding_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()) + { + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return m_impl.coeff(inputIndex); + } + + /* template + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + PaddingDimensions m_padding; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 000000000..4dfc99203 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorShufflingOp type; +}; + +} // end namespace internal + + + +template +class TensorShufflingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle) + : m_xpr(expr), m_shuffle(shuffle) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shuffle() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), 
device), m_shuffle(op.shuffle()) + { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = input_dims[m_shuffle[i]]; + } + + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[m_shuffle[i]]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[m_shuffle[0]]; + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + Shuffle m_shuffle; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 000000000..7acdbfc72 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,172 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. 
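+ *
+ * A usage sketch (illustrative sizes; mirrors the cxx11_tensor_striding.cpp
+ * test added later in this series):
+ * \code
+ * Eigen::Tensor<float, 2> t(6, 9);
+ * t.setRandom();
+ * Eigen::array<ptrdiff_t, 2> strides;
+ * strides[0] = 2;
+ * strides[1] = 3;
+ * Eigen::Tensor<float, 2> s;
+ * s = t.stride(strides);  // 3 x 3 result, with s(i,j) == t(2*i, 3*j)
+ * \endcode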
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingOp type; +}; + +} // end namespace internal + + + +template +class TensorStridingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = /*TensorEvaluator::PacketAccess*/false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + if (i > 0) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } else { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + } + } + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] *= op.strides()[i]; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims 
- 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + return m_impl.coeff(inputIndex); + } + + /* template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + }*/ + + Scalar* data() const { return NULL; } + + protected: + // Strides m_strides; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H From 8c8db49331a89236be7fdf045279504dd7d1797a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:25:22 -0700 Subject: [PATCH 044/214] Added a few regression tests --- unsupported/test/CMakeLists.txt | 3 + unsupported/test/cxx11_tensor_padding.cpp | 54 +++++++++ unsupported/test/cxx11_tensor_shuffling.cpp | 116 ++++++++++++++++++++ unsupported/test/cxx11_tensor_striding.cpp | 71 ++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_padding.cpp create mode 100644 unsupported/test/cxx11_tensor_shuffling.cpp create mode 100644 unsupported/test/cxx11_tensor_striding.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 406564673..cd2063848 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -109,6 +109,9 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") + ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_shuffling "-std=c++0x") + ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp new file mode 100644 index 000000000..d93bb1883 --- /dev/null +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_padding() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array, 4> paddings; + paddings[0] = make_pair(0, 0); + paddings[1] = make_pair(2, 1); + paddings[2] = make_pair(3, 4); + paddings[3] = make_pair(0, 0); + + Tensor padded; + padded = tensor.pad(paddings); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(tensor(i,j-2,k-3,l), padded(i,j,k,l)); + } else { + VERIFY_IS_EQUAL(0.0f, padded(i,j,k,l)); + } + } + } + } + } +} + + +void test_cxx11_tensor_padding() +{ + CALL_SUBTEST(test_simple_padding()); +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp new file mode 100644 index 000000000..92dd01a52 --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -0,0 +1,116 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_shuffling() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +static void test_expr_shuffling() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array shuffles; + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor expected; + expected = tensor.shuffle(shuffles); + + Tensor result(5,7,3,2); + + array src_slice_dim(Eigen::array(2,3,1,7)); + array src_slice_start(Eigen::array(0,0,0,0)); + array dst_slice_dim(Eigen::array(1,7,3,2)); + array dst_slice_start(Eigen::array(0,0,0,0)); + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles); + src_slice_start[2] += 1; + dst_slice_start[0] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 3); + VERIFY_IS_EQUAL(result.dimension(3), 
2); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_shuffling() +{ + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); +} diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp new file mode 100644 index 000000000..502569d1d --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -0,0 +1,71 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_striding() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + + Tensor no_stride; + no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + Tensor stride; + stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 1; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_striding() +{ + CALL_SUBTEST(test_simple_striding()); +} From 756292f8aa124c842d1e6d9beeb0c416c0d9a7f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 14 Aug 2014 00:32:59 -0700 Subject: [PATCH 045/214] Fixed compilation errors --- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_padding.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cd2063848..520935105 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,7 +110,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") - ei_add_test(cxx11_tensor_shuffling "-std=c++0x") +# ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index d93bb1883..cb010f512 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -18,11 +18,11 @@ static void test_simple_padding() Tensor tensor(2,3,5,7); tensor.setRandom(); - array, 4> 
-  paddings[0] = make_pair(0, 0);
-  paddings[1] = make_pair(2, 1);
-  paddings[2] = make_pair(3, 4);
-  paddings[3] = make_pair(0, 0);
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
 
   Tensor<float, 4> padded;
   padded = tensor.pad(paddings);
 

From 33c702c79fe227a5b22229c26af276d359a6cb1d Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 14 Aug 2014 22:13:21 -0700
Subject: [PATCH 046/214] Added support for fast integer divisions by a
 constant

Sped up tensor slicing by a factor of 3 by using these fast integer
divisions.
---
 unsupported/Eigen/CXX11/Tensor              |  1 +
 .../Eigen/CXX11/src/Tensor/TensorIntDiv.h   | 82 +++++++++++++++++++
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 26 +++---
 unsupported/test/CMakeLists.txt             |  1 +
 unsupported/test/cxx11_tensor_intdiv.cpp    | 77 +++++++++++++++++
 5 files changed, 177 insertions(+), 10 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
 create mode 100644 unsupported/test/cxx11_tensor_intdiv.cpp

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0775d440a..82552c3c2 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -34,6 +34,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 000000000..cf97031be
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+
+namespace Eigen {
+
+/** \internal
+  *
+  * \class TensorIntDiv
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Fast integer division by a constant.
+  *
+  * See the paper from Granlund and Montgomery for explanation.
+ * (at http://dx.doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +template +struct TensorIntDivisor { + public: + TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 1 <= divider <= 2^31-1 + TensorIntDivisor(const T divider) { + static const int N = 32; + eigen_assert(divider > 0); + eigen_assert(divider <= (1<<(N-1)) - 1); + + // fast ln2 + const int leading_zeros = __builtin_clz(divider); + const int l = N - (leading_zeros+1); + + multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; + shift1 = (std::min)(1, l); + shift2 = (std::max)(0, l-1); + } + + // Must have 0 <= numerator <= 2^32-1 + T divide(const T numerator) const { + static const int N = 32; + eigen_assert(numerator >= 0); + eigen_assert(numerator <= (1ull<> 32; + uint32_t t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + uint64_t multiplier; + int32_t shift1; + int32_t shift2; +}; + + +template +static T operator / (const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 2b1b503cf..ca3735d64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -305,8 +305,10 @@ struct TensorEvaluator, Devi for (int i = 0; i < NumDims; ++i) { if (i > 0) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } else { m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; } } } @@ -331,7 +333,7 @@ struct TensorEvaluator, Devi { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -349,8 +351,8 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -379,6 +381,7 @@ struct TensorEvaluator, Devi private: Dimensions m_dimensions; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; @@ -418,9 +421,11 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumDims; ++i) { if (i > 0) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } else { m_outputStrides[0] = 1; - } + m_fastOutputStrides[0] = 1; + } } } @@ -444,7 +449,7 @@ struct TensorEvaluator, Device> { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -460,8 +465,8 @@ struct 
TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -489,7 +494,7 @@ struct TensorEvaluator, Device> { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } @@ -504,8 +509,8 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; indices[0] -= idx0 * m_outputStrides[i]; @@ -532,6 +537,7 @@ struct TensorEvaluator, Device> private: Dimensions m_dimensions; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 520935105..e2204827e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,6 +106,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp new file mode 100644 index 000000000..a510dc695 --- /dev/null +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
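+// (Editor's note: the following summary and usage sketch are illustrative
+// additions, not part of the original patch.) TensorIntDivisor replaces an
+// integer division by a precomputed multiply plus two shifts: the divisor is
+// fixed once at construction, and the overloaded operator/ defined in
+// TensorIntDiv.h then computes quotients without a hardware divide, e.g.
+//
+//   Eigen::internal::TensorIntDivisor<int32_t> fast_div(7);
+//   int32_t q = 42 / fast_div;  // same result as 42 / 7, i.e. 6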
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_signed_32bit()
+{
+  for (int32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t> div(i);
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_unsigned_32bit()
+{
+  for (uint32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
+
+    for (uint32_t j = 0; j < 25000; ++j) {
+      const uint32_t fast_div = j / div;
+      const uint32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_signed_64bit()
+{
+  for (int64_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
+
+    for (int64_t j = 0; j < 25000; ++j) {
+      const int64_t fast_div = j / div;
+      const int64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+static void test_unsigned_64bit()
+{
+  for (uint64_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
+
+    for (uint64_t j = 0; j < 25000; ++j) {
+      const uint64_t fast_div = j / div;
+      const uint64_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_intdiv()
+{
+  CALL_SUBTEST(test_signed_32bit());
+  CALL_SUBTEST(test_unsigned_32bit());
+  CALL_SUBTEST(test_signed_64bit());
+  CALL_SUBTEST(test_unsigned_64bit());
+}

From 9ac3c821ea3b956634116bcdf80bfab7d9a00d91 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 19 Aug 2014 16:57:10 -0700
Subject: [PATCH 047/214] Improved the speed of convolutions when running on
 cuda devices

---
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 632 +++++++++++++++++-
 1 file changed, 622 insertions(+), 10 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4158271c3..7d0a21c3b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -20,6 +20,126 @@ namespace Eigen {
  *
  */
 namespace internal {
+
+
+template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
+ public:
+  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+              const array<Index, NumKernelDims>& indices) {
+
+    array<Index, NumDims> dimensions = input_dims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = indices[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      dimensions[index] = result_dim;
+    }
+
+    array<Index, NumDims> inputStrides;
+    array<Index, NumDims> outputStrides;
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > 0) {
+        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+      } else {
+        inputStrides[0] = 1;
+        outputStrides[0] = 1;
+      }
+    }
+
+    array<Index, NumDims> cudaInputDimensions;
+    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> tmp = dimensions;
+    array<Index, NumDims> ordering;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      ordering[i] = indices[i];
+      tmp[indices[i]] = -1;
+      cudaInputDimensions[i] = input_dims[ordering[i]];
+      cudaOutputDimensions[i] = dimensions[ordering[i]];
+    }
+    int written = NumKernelDims;
+    for (int i = 0; i < NumDims; ++i) {
+      if (tmp[i] >= 0) {
+        ordering[written] = i;
+        cudaInputDimensions[written] = input_dims[i];
+        cudaOutputDimensions[written] = dimensions[i];
+        ++written;
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[ordering[i]];
+      m_outputStrides[i] = outputStrides[ordering[i]];
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      if (i > 
NumKernelDims) { + m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1]; + m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1]; + } else { + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_cudaInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_cudaOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { + return i * m_inputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { + return i * m_outputStrides[0]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { + return i * m_inputStrides[0] + j*m_inputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { + return i * m_outputStrides[0] + j * m_outputStrides[1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2]; + } + + private: + static const size_t NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_cudaInputStrides; + array m_cudaOutputStrides; +}; + + + template struct traits > { @@ -75,15 +195,15 @@ class TensorConvolutionOp : public TensorBase::type& inputExpression() const { return m_input_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& kernelExpression() const { return m_kernel_xpr; } @@ -99,8 +219,8 @@ struct TensorEvaluator XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; - static const int KernelDims = internal::array_size::value; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -111,7 +231,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -124,7 +244,8 @@ struct TensorEvaluator m_inputStride; array m_outputStride; - array m_indexStride; - array m_kernelStride; + array m_indexStride; + array m_kernelStride; TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; Dimensions m_dimensions; }; + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
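+// (Editor's note: summary added for clarity; it paraphrases the CUDA kernels
+// below.) Each thread block first stages the slab of input it needs -- the
+// output tile extent plus kernel extent minus one along every convolved
+// dimension -- into shared memory, then every thread accumulates output
+// coefficients from that staged copy. This way each input coefficient is
+// fetched from global memory only once per block instead of once per output.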
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + + + + +template +__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + + +template +__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { + extern __shared__ float s[]; + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + + +template +__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { + extern __shared__ float s[]; + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() 
{ + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = sharedMemPerBlock(); + const int maxThreadsPerBlock = maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = getNumCudaMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + + int maxX; + dim3 block_size; + if (m_indices[0] == 0) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = (std::min)(inner_dim, numX); + const int maxP = (std::min)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = (std::min)(maxThreadsPerBlock, maxX); + block_size.y = (std::min)(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = (std::min)(inner_dim, numP); + maxX = (std::min)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = (std::min)(warpSize, maxX); + block_size.y = (std::min)(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, min(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + EigenConvolutionKernel1D, Index, InputDims, 4> <<>>(m_inputImpl, 
indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + EigenConvolutionKernel1D, Index, InputDims, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + EigenConvolutionKernel1D, Index, InputDims, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + case 2: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = (std::min)(inner_dim, numX); + const int maxY = (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = (std::min)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = (std::min)(1024, maxX); + block_size.y = (std::min)(1024/block_size.x, maxY); + block_size.z = (std::min)(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, min(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0], m_indices[1]); + const array kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + EigenConvolutionKernel2D, Index, InputDims, 4, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, 4, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + EigenConvolutionKernel2D, Index, InputDims, 7, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, 7, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; + } + default: { + EigenConvolutionKernel2D, Index, InputDims, Eigen::Dynamic, 
Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + case 3: { + const int kernel_size_x = m_kernelImpl.dimensions()[0]; + const int kernel_size_y = m_kernelImpl.dimensions()[1]; + const int kernel_size_z = m_kernelImpl.dimensions()[2]; + + const int numX = dimensions()[m_indices[0]]; + const int numY = dimensions()[m_indices[1]]; + const int numZ = dimensions()[m_indices[2]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = (std::min)(128, (std::min)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = (std::min)(32, maxX); + block_size.y = (std::min)(32, maxY); + block_size.z = (std::min)(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[0], m_indices[1], m_indices[2]); + const array kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + EigenConvolutionKernel3D, Index, InputDims> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + cudaError_t error = cudaGetLastError(); + assert(error == cudaSuccess); + break; + } + + default: { + assert(false && "not supported yet"); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + assert(m_buf); + assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H From 3d298da2696ac956a430f6fbef93bf65ada0d304 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 20 Aug 2014 17:00:50 -0700 Subject: [PATCH 048/214] Added support for broadcasting --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 6 + .../CXX11/src/Tensor/TensorBroadcasting.h | 186 ++++++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 1 + unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_broadcasting.cpp | 114 +++++++++++ 6 files 
changed, 309 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h create mode 100644 unsupported/test/cxx11_tensor_broadcasting.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 82552c3c2..ebe6419e8 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,6 +42,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0295fcdbc..da5148a5b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,6 +204,12 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp(derived(), broadcast); + } + // Morphing operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 000000000..3b2a9c8b9 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,186 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. 
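+  *
+  * (Editor's note: illustrative example, not part of the original patch.)
+  * Each broadcast factor tells how many times the expression is repeated
+  * along the corresponding dimension, e.g.:
+  * \code
+  * Eigen::Tensor<float, 2> t(2, 3);
+  * t.setRandom();
+  * Eigen::array<ptrdiff_t, 2> bcast;
+  * bcast[0] = 3;  // repeat 3 times along dimension 0
+  * bcast[1] = 2;  // repeat twice along dimension 1
+  * Eigen::Tensor<float, 2> r = t.broadcast(bcast);  // r is 6 x 6
+  * \endcode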
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorBroadcastingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorBroadcastingOp type; +}; + +} // end namespace internal + + + +template +class TensorBroadcastingOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Broadcast& broadcast = op.broadcast(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * broadcast[i]; + } + + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index % m_impl.dimensions()[0]); + return m_impl.coeff(inputIndex); + } + + // Ignore the LoadMode and always use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + const Index innermostLoc = index % m_impl.dimensions()[0]; + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. + if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeff(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index baa5968bc..afbcc9486 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -22,6 +22,7 @@ template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; template class TensorReductionOp; +template class TensorBroadcastingOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e2204827e..164388746 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -109,6 +109,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") + ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") # ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp new file mode 100644 index 000000000..9663912a4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -0,0 +1,114 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_broadcasting() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + + Tensor no_broadcast; + no_broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); + VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l)); + } + } + } + } + + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 1; + broadcasts[3] = 4; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 4); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 28); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 28; ++l) { + VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l)); + } + } + } + } +} + + +static void test_vectorized_broadcasting() +{ + Tensor tensor(8,3,5); + tensor.setRandom(); + array broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; + + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +void test_cxx11_tensor_broadcasting() +{ + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); +} From fb5c1e9097886616d40a0988af5ca706292e54eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 23 Aug 2014 13:18:30 -0700 Subject: [PATCH 049/214] Optimized and cleaned up the tensor morphing code --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 218 +++++------------- 1 file changed, 63 insertions(+), 155 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ca3735d64..d9a6b3f1b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - Scalar* data() const { return NULL; } + Scalar* data() const { return m_impl.data(); } protected: NewDimensions m_dimensions; @@ -136,10 +136,12 @@ struct TensorEvaluator, Device> // Eval as lvalue -// TODO(bsteiner): share 
the code with the evaluator for rvalue reshapes. template -struct TensorEvaluator, Device> + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + { + typedef TensorEvaluator, Device> Base; typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; @@ -149,7 +151,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + : Base(op, device) { } typedef typename XprType::Index Index; @@ -157,40 +159,15 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return m_impl.coeffRef(index); + return this->m_impl.coeffRef(index); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - m_impl.template writePacket(index, x); + this->m_impl.template writePacket(index, x); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - Scalar* data() const { return NULL; } - - private: - NewDimensions m_dimensions; - TensorEvaluator m_impl; }; @@ -286,7 +263,7 @@ struct TensorEvaluator, Devi }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { for (int i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); @@ -321,24 +298,37 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_impl.evalSubExprsIfNeeded(NULL); + if (data && m_impl.data()) { + Index contiguous_values = 1; + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. 
+ if (contiguous_values > 2 * m_device.numThreads()) { + Scalar* src = m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy(data+i, src+offset, contiguous_values * sizeof(Scalar)); + } + return false; + } + } return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } template @@ -376,23 +366,37 @@ struct TensorEvaluator, Devi } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + return inputIndex; + } - private: Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; const StartIndices m_offsets; TensorEvaluator m_impl; + const Device& m_device; }; // Eval as lvalue -// TODO(bsteiner): share the code with the evaluator for rvalue slices. template struct TensorEvaluator, Device> + : public TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Base; typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; @@ -402,32 +406,8 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) - { - for (int i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } else { - m_inputStrides[0] = 1; - } - } - - const Sizes& output_dims = op.sizes(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } else { - m_outputStrides[0] = 1; - m_fastOutputStrides[0] = 1; - } - } - } + : Base(op, device) + { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -435,71 +415,9 @@ struct TensorEvaluator, Device> typedef typename XprType::PacketReturnType PacketReturnType; typedef Sizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index 
/ m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - static const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; - } - else { - CoeffReturnType values[packetSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - return m_impl.coeffRef(inputIndex); + return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_STRONG_INLINE @@ -509,38 +427,28 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - m_impl.template writePacket(inputIndices[0], x); + this->m_impl.template writePacket(inputIndices[0], x); } else { CoeffReturnType values[packetSize]; internal::pstore(values, x); - m_impl.coeffRef(inputIndices[0]) = values[0]; - m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; for (int i = 1; i < packetSize-1; ++i) { - coeffRef(index+i) = values[i]; + 
this->coeffRef(index+i) = values[i]; } } } - - Scalar* data() const { return NULL; } - - private: - Dimensions m_dimensions; - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - const StartIndices m_offsets; - TensorEvaluator m_impl; }; From 36fffe48f7231e07915ec231d33cf46faa0fa918 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 23 Aug 2014 14:35:41 -0700 Subject: [PATCH 050/214] Misc API improvements and cleanups --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 9 +++++ .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 26 +++++++++++++ unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_morphing.cpp | 37 ++++++++++--------- 4 files changed, 55 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index b9c8c19fe..ef5e11537 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -27,6 +27,10 @@ struct DefaultDevice { EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + + EIGEN_STRONG_INLINE size_t numThreads() const { + return 1; + } }; @@ -115,6 +119,11 @@ struct GpuDevice { cudaMemsetAsync(buffer, c, n, *stream_); } + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME: 32 is a placeholder, not the device's actual parallelism. + return 32; + } + private: // TODO: multigpu. const cudaStream_t* stream_; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3b169a06f..5a113dc19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -195,6 +195,32 @@ struct DSizes : array { } EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + DSizes& operator = (const array& other) { *static_cast(this) = other; return *this; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 164388746..615ff3e6d 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,7 +110,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") -# ei_add_test(cxx11_tensor_morphing "-std=c++0x") + ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index fbfdaadb7..2a6a97856 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ 
b/unsupported/test/cxx11_tensor_morphing.cpp @@ -52,7 +52,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{1, 0}}; + array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -74,7 +74,8 @@ static void test_reshape_as_lvalue() Tensor::Dimensions dim{{2,3,7}}; tensor2d.reshape(dim) = tensor; - Tensor tensor5d(2,3,1,7,1); + float scratch[2*3*1*7*1]; + TensorMap> tensor5d(scratch, 2,3,1,7,1); tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor; for (int i = 0; i < 2; ++i) { @@ -94,14 +95,14 @@ static void test_simple_slice() tensor.setRandom(); Tensor slice1(1,1,1,1,1); - Eigen::DSizes indices(Eigen::array(1,2,3,4,5)); - Eigen::DSizes sizes(Eigen::array(1,1,1,1,1)); + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); Tensor slice2(1,1,2,2,3); - Eigen::DSizes indices2(Eigen::array(1,1,3,4,5)); - Eigen::DSizes sizes2(Eigen::array(1,1,2,2,3)); + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 2; ++j) { @@ -124,12 +125,12 @@ static void test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{1, 0}}; + array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; - Eigen::DSizes indices1(Eigen::array(1,2)); - Eigen::DSizes sizes1(Eigen::array(3,3)); - Eigen::DSizes indices2(Eigen::array(0,2)); - Eigen::DSizes sizes2(Eigen::array(3,1)); + Eigen::DSizes indices1(1,2); + Eigen::DSizes sizes1(3,3); + Eigen::DSizes indices2(0,2); + Eigen::DSizes sizes2(3,1); tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along); Map res(tensor3.data(), 3, 1); @@ -153,18 +154,18 @@ static void test_slice_as_lvalue() tensor4.setRandom(); Tensor result(4,5,7); - Eigen::DSizes sizes12(Eigen::array(2,2,7)); - Eigen::DSizes first_slice(Eigen::array(0,0,0)); + Eigen::DSizes sizes12(2,2,7); + Eigen::DSizes first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; - Eigen::DSizes second_slice(Eigen::array(2,0,0)); + Eigen::DSizes second_slice(2,0,0); result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2; - Eigen::DSizes sizes3(Eigen::array(4,3,5)); - Eigen::DSizes third_slice(Eigen::array(0,2,0)); + Eigen::DSizes sizes3(4,3,5); + Eigen::DSizes third_slice(0,2,0); result.slice(third_slice, sizes3) = tensor3; - Eigen::DSizes sizes4(Eigen::array(4,3,2)); - Eigen::DSizes fourth_slice(Eigen::array(0,2,5)); + Eigen::DSizes sizes4(4,3,2); + Eigen::DSizes fourth_slice(0,2,5); result.slice(fourth_slice, sizes4) = tensor4; for (int j = 0; j < 2; ++j) { From 2959045f2fe111f93b23517fd6f7afe49720a290 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 26 Aug 2014 09:47:18 -0700 Subject: [PATCH 051/214] Optimized the tensor padding code. 
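
The vectorized packet() path added below classifies each packet of output coefficients against the padding zones of every dimension: a packet that falls entirely inside a padding zone is materialized as a packet of zeros, a packet that falls entirely inside the payload is forwarded to the input evaluator as a single packet load, and a packet that straddles a boundary falls back to a coefficient-by-coefficient gather (packetWithPossibleZero). Reduced to a single dimension, the test looks roughly like the sketch below; the names are made up for illustration, this is not the code in the patch, and it assumes the range stays within the extent of the dimension.

    // Classify the coefficient range [first, last] of a dimension holding
    // `size` payload values preceded by `pad_left` zeros.
    //   -1 : entirely padding   -> emit pset1(0)
    //    1 : entirely payload   -> do one packet load from the input
    //    0 : straddles an edge  -> gather the coefficients one by one
    inline int classify_packet(int first, int last, int size, int pad_left) {
      const int payload_begin = pad_left;        // index of the first real value
      const int payload_end   = pad_left + size; // one past the last real value
      if (last < payload_begin || first >= payload_end) return -1;
      if (first >= payload_begin && last < payload_end) return 1;
      return 0;
    }
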
--- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 95 ++++++++++++++++--- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 45558d7dd..4482c0992 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -87,7 +87,7 @@ struct TensorEvaluator, Device enum { IsAligned = false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -100,15 +100,13 @@ struct TensorEvaluator, Device } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -128,7 +126,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i >= 0; --i) { + for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { return Scalar(0); @@ -136,21 +134,90 @@ struct TensorEvaluator, Device inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[0].first); return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const int first = index; + const int last = index + packetSize - 1; + const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const int lastPaddedRight = m_outputStrides[i+1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the two padding zones. 
+ const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index last = index + packetSize - 1; + const Index first = index; + const int lastPaddedLeft = m_padding[0].first; + const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const int lastPaddedRight = m_outputStrides[1]; + + if (last < lastPaddedLeft) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= firstPaddedRight && last < lastPaddedRight) { + // all the coefficients are in the padding zone. + return internal::pset1(Scalar(0)); + } + else if (first >= lastPaddedLeft && last < firstPaddedRight) { + // all the coefficients are between the two padding zones. + inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } Scalar* data() const { return NULL; } protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + PaddingDimensions m_padding; Dimensions m_dimensions; - array m_outputStrides; + array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; }; From b24fe22b1a4518f27ca064d496bfdb6c96d973ab Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 Sep 2014 11:38:13 -0700 Subject: [PATCH 052/214] Improved the performance of the tensor convolution code by a factor of about 4. 
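
Two changes below account for most of the speedup: preloadKernel() evaluates the kernel expression once into a contiguous buffer, so the innermost loop reads plain memory instead of re-evaluating an expression for every tap, and a packet path accumulates several outputs at a time with pmadd. Collapsed to one dimension, the scalar recursion amounts to the following sketch (illustrative names only, not the code in this patch):

    // 1-D "valid" convolution: out receives in_size - k_size + 1 coefficients.
    void conv1d(const float* in, int in_size, const float* k, int k_size, float* out) {
      for (int i = 0; i + k_size <= in_size; ++i) {
        float accum = 0.0f;
        for (int j = 0; j < k_size; ++j) {
          accum += in[i + j] * k[j];  // the packet path turns this into pmadd
        }
        out[i] = accum;
      }
    }
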
--- .../CXX11/src/Tensor/TensorConvolution.h | 147 +++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 7 +- 2 files changed, 111 insertions(+), 43 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 7d0a21c3b..4a5fd9c79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -226,22 +226,18 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; - } else { - m_inputStride[0] = 1; - } + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1]; } m_dimensions = m_inputImpl.dimensions(); @@ -251,7 +247,6 @@ struct TensorEvaluator 0) { m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1]; } else { @@ -260,16 +255,12 @@ struct TensorEvaluator 0) { - m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; - } else { - m_outputStride[0] = 1; - } + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1]; } } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -278,57 +269,126 @@ struct TensorEvaluator= 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - CoeffReturnType result = CoeffReturnType(0); - convolve(startInput, 0, 0, result); + convolve(firstInput(index), 0, NumKernelDims-1, result); return result; } - /* TODO: vectorization template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const { - assert(false); - }*/ + const int PacketSize = internal::unpacket_traits::size; + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex < NumKernelDims-1) { - 
convolve(input, kernel, DimIndex+1, accum); - } else { - - accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel); + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_DEFAULT Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); } } Scalar* data() const { return NULL; } private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + // No copy, no assignment TensorEvaluator(const TensorEvaluator&); TensorEvaluator& operator = (const TensorEvaluator&); @@ -341,6 +401,11 @@ struct TensorEvaluator m_inputImpl; TensorEvaluator m_kernelImpl; Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device& m_device; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index db716a80e..587cbd5ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -108,8 +108,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { @@ -134,6 +135,8 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + Scalar* data() const { return NULL; } + private: TensorEvaluator m_impl; const Device& m_device; From f50548e86af75fd8e0d1689a9fb4184cf1fec509 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 19:50:27 -0700 Subject: [PATCH 053/214] Added missing tensor copy constructors. As a result it is now possible to declare and initialize a tensor on the same line, as in: Tensor T = A + B; or Tensor T(A.reshape(new_shape)); --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 27 ++++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index fdbe8df4c..879057f38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -55,7 +55,7 @@ namespace Eigen { * change dramatically. 
* * - * \ref TopicStorageOrders + * \ref TopicStorageOrders */ template @@ -75,7 +75,7 @@ class Tensor : public TensorBase > enum { IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), - PacketAccess = true, + PacketAccess = (internal::packet_traits::size > 1), }; static const int Options = Options_; @@ -224,12 +224,31 @@ class Tensor : public TensorBase > } #endif - inline Tensor(const array& dimensions) - : m_storage(internal::array_prod(dimensions), dimensions) + inline explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { From d43f737b4ad52e84a3b4d954d9bfb4c40cf9e819 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 20:02:28 -0700 Subject: [PATCH 054/214] Added support for evaluation of tensor shuffling operations as lvalues --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 19 +++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 104 ++++++++++++++---- 2 files changed, 96 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index da5148a5b..2da8f8cc8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -222,19 +222,19 @@ class TensorBase return TensorSlicingOp(derived(), startIndices, sizes); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorPaddingOp + const TensorPaddingOp pad(const PaddingDimensions& padding) const { - return TensorPaddingOp(derived(), padding); + return TensorPaddingOp(derived(), padding); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp + const TensorShufflingOp shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); + return TensorShufflingOp(derived(), shuffle); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp + const TensorStridingOp stride(const Strides& strides) const { - return TensorStridingOp(derived(), strides); + return TensorStridingOp(derived(), strides); } // Force the evaluation of the expression. @@ -244,6 +244,7 @@ class TensorBase } protected: + template friend class Tensor; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } @@ -258,6 +259,7 @@ class TensorBase : public TensorBase::type PacketReturnType; + template friend class Tensor; template friend class TensorBase; EIGEN_DEVICE_FUNC @@ -293,6 +295,11 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); + } // Select the device on which to evaluate the expression. 
template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 4dfc99203..f7e7fc107 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorShufflingOp : public TensorBase, WriteAccessors> +class TensorShufflingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -94,33 +94,38 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shuffle()) + : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shuffle(); for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = input_dims[m_shuffle[i]]; + m_dimensions[i] = input_dims[shuffle[i]]; } + array inputStrides; + for (int i = 0; i < NumDims; ++i) { if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; } else { - m_inputStrides[0] = 1; + inputStrides[0] = 1; m_outputStrides[0] = 1; } } + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[shuffle[i]]; + } } - // typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -136,33 +141,90 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[m_shuffle[i]]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[m_shuffle[0]]; - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } Scalar* data() const { return NULL; } protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } + Dimensions m_dimensions; - Shuffle m_shuffle; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; }; +// 
Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H From 1abe4ed14c0012d85e833c5f507f282cf26edc36 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 4 Sep 2014 20:27:28 -0700 Subject: [PATCH 055/214] Created more regression tests --- test/main.h | 1 + unsupported/test/cxx11_tensor_assign.cpp | 26 ++ unsupported/test/cxx11_tensor_contraction.cpp | 166 +++++++++++ unsupported/test/cxx11_tensor_device.cpp | 281 +++++++++++++++--- unsupported/test/cxx11_tensor_shuffling.cpp | 47 +++ unsupported/test/cxx11_tensor_simple.cpp | 26 ++ 6 files changed, 511 insertions(+), 36 deletions(-) diff --git a/test/main.h b/test/main.h index 3295dcb71..763cec8f9 100644 --- a/test/main.h +++ b/test/main.h @@ -207,6 +207,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) +#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b)) #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b)) diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index b024bed19..f2b126413 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -228,6 +228,30 @@ static void test_same_type() } } +static void test_auto_resize() +{ + Tensor tensor1; + Tensor tensor2(3); + Tensor tensor3(5); + Tensor tensor4(7); + + Tensor new_tensor(5); + new_tensor.setRandom(); + + tensor1 = tensor2 = tensor3 = tensor4 = new_tensor; + + VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0)); + VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0)); + for (int i = 0; i < new_tensor.dimension(0); ++i) { + VERIFY_IS_EQUAL(tensor1(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor2(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor3(i), new_tensor(i)); + VERIFY_IS_EQUAL(tensor4(i), new_tensor(i)); + } +} + 
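+// (The chained assignment above exercises the fact that Tensor::operator=
+// resizes the destination to the dimensions of the right-hand side before
+// copying, so every tensor in the chain ends up with new_tensor's size
+// regardless of the size it was declared with.)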
void test_cxx11_tensor_assign() { @@ -235,4 +259,6 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_same_type()); + CALL_SUBTEST(test_auto_resize()); + } diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index fc67d500b..a37fcd967 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -141,6 +141,66 @@ static void test_multidims() } +static void test_holes() { + Tensor t1(2, 5, 7, 3); + Tensor t2(2, 7, 11, 13, 3); + t1.setRandom(); + t2.setRandom(); + + Eigen::array dims({{DimPair(0, 0), DimPair(3, 4)}}); + Tensor result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + VERIFY_IS_EQUAL(result.dimension(2), 7); + VERIFY_IS_EQUAL(result.dimension(3), 11); + VERIFY_IS_EQUAL(result.dimension(4), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 5; ++l) { + for (int m = 0; m < 5; ++m) { + VERIFY_IS_APPROX(result(i, j, k, l, m), + t1(0, i, j, 0) * t2(0, k, l, m, 0) + + t1(1, i, j, 0) * t2(1, k, l, m, 0) + + t1(0, i, j, 1) * t2(0, k, l, m, 1) + + t1(1, i, j, 1) * t2(1, k, l, m, 1) + + t1(0, i, j, 2) * t2(0, k, l, m, 2) + + t1(1, i, j, 2) * t2(1, k, l, m, 2)); + } + } + } + } + } +} + + +static void test_full_redux() +{ + Tensor t1(2, 2); + Tensor t2(2, 2, 2); + t1.setRandom(); + t2.setRandom(); + + Eigen::array dims({{DimPair(0, 0), DimPair(1, 1)}}); + Tensor result = t1.contract(t2, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) + + t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) + t1(1, 0) * t2(1, 0, 1) + + t1(0, 1) * t2(0, 1, 1) + t1(1, 1) * t2(1, 1, 1)); + + dims[0] = DimPair(1, 0); + dims[1] = DimPair(2, 1); + result = t2.contract(t1, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(0, 1, 0) + + t1(0, 1) * t2(0, 0, 1) + t1(1, 1) * t2(0, 1, 1)); + VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) + t1(1, 0) * t2(1, 1, 0) + + t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1)); +} + + static void test_expr() { Tensor mat1(2, 3); @@ -160,10 +220,116 @@ static void test_expr() } +static void test_out_of_order_contraction() +{ + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2); + + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(2, 2); + + Eigen::array dims({{DimPair(2, 0), DimPair(0, 2)}}); + mat3 = mat1.contract(mat2, dims); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + + Eigen::array dims2({{DimPair(0, 2), DimPair(2, 0)}}); + mat3 = mat1.contract(mat2, dims2); + + VERIFY_IS_APPROX(mat3(0, 0), + mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) + + mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(1, 0), + mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) + + 
mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1)); + VERIFY_IS_APPROX(mat3(0, 1), + mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) + + mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1)); + VERIFY_IS_APPROX(mat3(1, 1), + mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) + + mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1)); + +} + + +static void test_consistency() +{ + // this does something like testing (A*B)^T = (B^T * A^T) + + Tensor mat1(4, 3, 5); + Tensor mat2(3, 2, 1, 5, 4); + mat1.setRandom(); + mat2.setRandom(); + + Tensor mat3(5, 2, 1, 5); + Tensor mat4(2, 1, 5, 5); + + // contract on dimensions of size 4 and 3 + Eigen::array dims1({{DimPair(0, 4), DimPair(1, 0)}}); + Eigen::array dims2({{DimPair(4, 0), DimPair(0, 1)}}); + + mat3 = mat1.contract(mat2, dims1); + mat4 = mat2.contract(mat1, dims2); + + // check that these are equal except for ordering of dimensions + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + } + } +} + + +static void test_large_contraction() +{ + Tensor t_left(30, 50, 8, 31); + Tensor t_right(8, 31, 7, 20, 10); + Tensor t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + MatrixXf m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_scalar()); CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_holes()); + CALL_SUBTEST(test_full_redux()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_out_of_order_contraction()); + CALL_SUBTEST(test_consistency()); + CALL_SUBTEST(test_large_contraction()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index caf2e9735..f331cb481 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -22,17 +22,43 @@ using Eigen::RowMajor; // Context for evaluation on cpu struct CPUContext { - CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out) { } + CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(Eigen::array(2,2)), kernel_3d_(Eigen::array(2,2,2)) { + kernel_1d_(0) = 3.14f; + kernel_1d_(1) = 2.7f; + + kernel_2d_(Eigen::array(0,0)) = 3.14f; + kernel_2d_(Eigen::array(1,0)) = 2.7f; + kernel_2d_(Eigen::array(0,1)) = 0.2f; + kernel_2d_(Eigen::array(1,1)) = 7.0f; + + kernel_3d_(Eigen::array(0,0,0)) = 3.14f; + kernel_3d_(Eigen::array(0,1,0)) = 2.7f; + kernel_3d_(Eigen::array(0,0,1)) = 0.2f; + kernel_3d_(Eigen::array(0,1,1)) = 7.0f; + kernel_3d_(Eigen::array(1,0,0)) = -1.0f; + kernel_3d_(Eigen::array(1,1,0)) = -0.3f; + kernel_3d_(Eigen::array(1,0,1)) = -0.7f; + kernel_3d_(Eigen::array(1,1,1)) = -0.5f; + } + + const Eigen::DefaultDevice& device() const { return cpu_device_; } const Eigen::Tensor& in1() const { return in1_; } const Eigen::Tensor& in2() const { return in2_; } - 
Eigen::TensorDevice, Eigen::DefaultDevice> out() { return TensorDevice, Eigen::DefaultDevice>(cpu_device_, out_); } + Eigen::Tensor& out() { return out_; } + const Eigen::Tensor& kernel1d() const { return kernel_1d_; } + const Eigen::Tensor& kernel2d() const { return kernel_2d_; } + const Eigen::Tensor& kernel3d() const { return kernel_3d_; } private: const Eigen::Tensor& in1_; const Eigen::Tensor& in2_; Eigen::Tensor& out_; + Eigen::Tensor kernel_1d_; + Eigen::Tensor kernel_2d_; + Eigen::Tensor kernel_3d_; + Eigen::DefaultDevice cpu_device_; }; @@ -40,19 +66,45 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { GPUContext(const Eigen::TensorMap >& in1, Eigen::TensorMap >& in2, Eigen::TensorMap >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - cudaStreamCreate(&stream_); + assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + float kernel_1d_val[] = {3.14f, 2.7f}; + assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; + assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; + assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + + assert(cudaStreamCreate(&stream_) == cudaSuccess); } ~GPUContext() { - cudaStreamDestroy(stream_); + assert(cudaFree(kernel_1d_) == cudaSuccess); + assert(cudaFree(kernel_2d_) == cudaSuccess); + assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(cudaStreamDestroy(stream_) == cudaSuccess); } + + const Eigen::GpuDevice& device() const { return gpu_device_; } + const Eigen::TensorMap >& in1() const { return in1_; } const Eigen::TensorMap >& in2() const { return in2_; } - Eigen::TensorDevice >, Eigen::GpuDevice> out() { return TensorDevice >, Eigen::GpuDevice>(gpu_device_, out_); } + Eigen::TensorMap >& out() { return out_; } + Eigen::TensorMap > kernel1d() const { return Eigen::TensorMap >(kernel_1d_, 2); } + Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, Eigen::array(2, 2)); } + Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, Eigen::array(2, 2, 2)); } private: const Eigen::TensorMap >& in1_; const Eigen::TensorMap >& in2_; Eigen::TensorMap >& out_; + + float* kernel_1d_; + float* kernel_2d_; + float* kernel_3d_; + cudaStream_t stream_; Eigen::GpuDevice gpu_device_; }; @@ -62,49 +114,151 @@ struct GPUContext { template static void test_contextual_eval(Context* context) { - context->out() = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); + context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } template static void test_forced_contextual_eval(Context* context) { - context->out() = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); + context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } -static void test_cpu() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); +template +static void test_contraction(Context* context) +{ + Eigen::array, 2> 
dims; + dims[0] = std::make_pair(1, 1); + dims[1] = std::make_pair(2, 2); - in1.setRandom(); - in2.setRandom(); + Eigen::array shape(40, 50*70); + + Eigen::DSizes indices(0,0); + Eigen::DSizes sizes(40,40); + + context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); +} + + +template +static void test_1d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(40,49,70)); + + Eigen::array dims(1); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); +} + +template +static void test_2d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(40,49,69)); + + Eigen::array dims(1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); +} + +template +static void test_3d_convolution(Context* context) +{ + Eigen::DSizes indices(Eigen::array(0,0,0)); + Eigen::DSizes sizes(Eigen::array(39,49,69)); + + Eigen::array dims(0,1,2); + context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); +} + + +static void test_cpu() { + Eigen::Tensor in1(Eigen::array(40,50,70)); + Eigen::Tensor in2(Eigen::array(40,50,70)); + Eigen::Tensor out(Eigen::array(40,50,70)); + + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); CPUContext context(in1, in2, out); test_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); } } } + + test_contraction(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(Eigen::array(i,j,0)); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f) + + (in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + 
const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f) + + (in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + + in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { + continue; + } + VERIFY_IS_APPROX(expected, result); + } + } + } } static void test_gpu() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); - in1.setRandom(); - in2.setRandom(); + Eigen::Tensor in1(Eigen::array(40,50,70)); + Eigen::Tensor in2(Eigen::array(40,50,70)); + Eigen::Tensor out(Eigen::array(40,50,70)); + in1 = in1.random() + in1.constant(10.0f); + in2 = in2.random() + in2.constant(10.0f); std::size_t in1_bytes = in1.size() * sizeof(float); std::size_t in2_bytes = in2.size() * sizeof(float); @@ -120,32 +274,87 @@ static void test_gpu() { cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(2,3,7)); - Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(2,3,7)); - Eigen::TensorMap > gpu_out(d_out, Eigen::array(2,3,7)); + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(40,50,70)); + Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(40,50,70)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(40,50,70)); GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); - cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); } } } -} + test_contraction(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 40; ++j) { + const float result = out(Eigen::array(i,j,0)); + float expected = 0; + for (int k = 0; k < 50; ++k) { + for (int l = 0; l < 70; ++l) { + expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + } + } + VERIFY_IS_APPROX(expected, result); + } + } + + test_1d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + } + } + } + + test_2d_convolution(&context); + 
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + VERIFY_IS_APPROX(expected, result); + } + } + } + + test_3d_convolution(&context); + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); + assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + for (int i = 0; i < 39; ++i) { + for (int j = 0; j < 49; ++j) { + for (int k = 0; k < 69; ++k) { + const float result = out(Eigen::array(i,j,k)); + const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + + in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f + + in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + + in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + VERIFY_IS_APPROX(expected, result); + } + } + } +} void test_cxx11_tensor_device() diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 92dd01a52..5ab8b6821 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -106,11 +106,58 @@ static void test_expr_shuffling() } } } + + dst_slice_start[0] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[0] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } } +static void test_shuffling_as_value() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array shuffles; + shuffles[2] = 0; + shuffles[3] = 1; + shuffles[1] = 2; + shuffles[0] = 3; + Tensor shuffle(5,7,3,2); + shuffle.shuffle(shuffles) = tensor; + + VERIFY_IS_EQUAL(shuffle.dimension(0), 5); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + void test_cxx11_tensor_shuffling() { CALL_SUBTEST(test_simple_shuffling()); CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_shuffling_as_value()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 1455f2a4c..a70591c82 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -257,12 +257,38 @@ static void test_simple_assign() VERIFY_IS_EQUAL((e2(1,0,2)), -1); } +static void test_resize() +{ + Tensor epsilon; + epsilon.resize(2,3,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 2); + VERIFY_IS_EQUAL(epsilon.dimension(1), 3); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + + 
const int* old_data = epsilon.data(); + epsilon.resize(3,2,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 2); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 2ul*3*7); + VERIFY_IS_EQUAL(epsilon.data(), old_data); + + epsilon.resize(3,5,7); + VERIFY_IS_EQUAL(epsilon.dimension(0), 3); + VERIFY_IS_EQUAL(epsilon.dimension(1), 5); + VERIFY_IS_EQUAL(epsilon.dimension(2), 7); + VERIFY_IS_EQUAL(epsilon.dimensions().TotalSize(), 3ul*5*7); + VERIFY_IS_NOT_EQUAL(epsilon.data(), old_data); +} + void test_cxx11_tensor_simple() { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_simple_assign()); + CALL_SUBTEST(test_resize()); } /* From 74db22455ae0172faaae91321da0b303bb82369d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 5 Sep 2014 07:47:43 -0700 Subject: [PATCH 056/214] Misc fixes. --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 +++--- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/test/cxx11_tensor_padding.cpp | 38 ++++++++++++++++++- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d9a6b3f1b..28ae7b3c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -163,7 +163,7 @@ template { return this->m_impl.coeffRef(index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); @@ -314,7 +314,7 @@ struct TensorEvaluator, Devi Scalar* src = m_impl.data(); for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy(data+i, src+offset, contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); } return false; } @@ -334,7 +334,7 @@ struct TensorEvaluator, Devi template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -355,7 +355,7 @@ struct TensorEvaluator, Devi return rslt; } else { - CoeffReturnType values[packetSize]; + typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); for (int i = 1; i < packetSize-1; ++i) { @@ -420,10 +420,10 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; for (int i = NumDims - 1; i > 0; --i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 4482c0992..7da89458f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class 
TensorPaddingOp : public TensorBase > +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index cb010f512..6f74216dd 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -37,9 +37,42 @@ static void test_simple_padding() for (int k = 0; k < 12; ++k) { for (int l = 0; l < 7; ++l) { if (j >= 2 && j < 5 && k >= 3 && k < 8) { - VERIFY_IS_EQUAL(tensor(i,j-2,k-3,l), padded(i,j,k,l)); + VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); } else { - VERIFY_IS_EQUAL(0.0f, padded(i,j,k,l)); + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); } } } } } } +static void test_padded_expr() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Eigen::DSizes reshape_dims; + reshape_dims[0] = 12; + reshape_dims[1] = 84; + + Tensor result; + result = tensor.pad(paddings).reshape(reshape_dims); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 6; ++j) { + for (int k = 0; k < 12; ++k) { + for (int l = 0; l < 7; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(result(i+2*j,k+12*l), tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(result(i+2*j,k+12*l), 0.0f); } } } @@ -51,4 +84,5 @@ void test_cxx11_tensor_padding() { CALL_SUBTEST(test_simple_padding()); + CALL_SUBTEST(test_padded_expr()); } From efdff157493826bbcc023a85e08596fd58d7997a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Sat, 6 Sep 2014 13:28:24 -0700 Subject: [PATCH 057/214] Fixed a typo in the contraction code --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 897d73806..46624724c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } From 1c236f4c9ae78cc58156eebe3b2bb43588897af4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Oct 2014 20:21:42 -0700 Subject: [PATCH 058/214] Added tests for tensors of const values and tensors of strings --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- unsupported/test/CMakeLists.txt | 2 + .../test/cxx11_tensor_of_const_values.cpp | 105 +++++++++++++ unsupported/test/cxx11_tensor_of_strings.cpp | 142 ++++++++++++++++++ 4 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_of_const_values.cpp create mode 100644 unsupported/test/cxx11_tensor_of_strings.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 28ae7b3c6..13109f514 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -301,7 +301,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
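+ // The memcpy fast path below is only safe for arithmetic scalar types:
+ // a non-POD payload such as std::string owns heap storage and has to be
+ // copied through regular coefficient assignment instead.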
m_impl.evalSubExprsIfNeeded(NULL); - if (data && m_impl.data()) { + if (internal::is_arithmetic::value && data && m_impl.data()) { Index contiguous_values = 1; for (int i = 0; i < NumDims; ++i) { contiguous_values *= dimensions()[i]; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 615ff3e6d..8d4e7db66 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,6 +106,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp new file mode 100644 index 000000000..f179a0c21 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_const_values.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void test_assign() +{ + float data1[6]; + TensorMap> mat1(data1, 2, 3); + float data2[6]; + const TensorMap> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor rslt1; + rslt1 = mat1; + Tensor rslt2; + rslt2 = mat2; + + Tensor rslt3 = mat1; + Tensor rslt4 = mat2; + + Tensor rslt5(mat1); + Tensor rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(rslt1(i,j), static_cast(i + 2*j)); + VERIFY_IS_APPROX(rslt2(i,j), static_cast(-i - 2*j)); + VERIFY_IS_APPROX(rslt3(i,j), static_cast(i + 2*j)); + VERIFY_IS_APPROX(rslt4(i,j), static_cast(-i - 2*j)); + VERIFY_IS_APPROX(rslt5(i,j), static_cast(i + 2*j)); + VERIFY_IS_APPROX(rslt6(i,j), static_cast(-i - 2*j)); + } + } +} + + +static void test_plus() +{ + float data1[6]; + TensorMap> mat1(data1, 2, 3); + float data2[6]; + TensorMap> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + + Tensor sum1; + sum1 = mat1 + mat2; + Tensor sum2; + sum2 = mat2 + mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(sum1(i,j), 0.0f); + VERIFY_IS_APPROX(sum2(i,j), 0.0f); + } + } +} + + +static void test_plus_equal() +{ + float data1[6]; + TensorMap> mat1(data1, 2, 3); + float data2[6]; + TensorMap> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + data1[i] = i; + data2[i] = -i; + } + mat2 += mat1; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(mat2(i,j), 0.0f); + } + } +} + + +void test_cxx11_tensor_of_const_values() +{ + CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_plus()); + CALL_SUBTEST(test_plus_equal()); +} diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp new file mode 100644 index 000000000..0ffa341c4 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" +#include +#include + +using std::string; +using Eigen::Tensor; +using Eigen::TensorMap; + +static void test_assign() +{ + string data1[6]; + TensorMap> mat1(data1, 2, 3); + string data2[6]; + const TensorMap> mat2(data2, 2, 3); + + for (int i = 0; i < 6; ++i) { + std::ostringstream s1; + s1 << "abc" << i*3; + data1[i] = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5; + data2[i] = s2.str(); + } + + Tensor rslt1; + rslt1 = mat1; + Tensor rslt2; + rslt2 = mat2; + + Tensor rslt3 = mat1; + Tensor rslt4 = mat2; + + Tensor rslt5(mat1); + Tensor rslt6(mat2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]); + VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]); + VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]); + } + } +} + + +static void test_concat() +{ + Tensor t1(2, 3); + Tensor t2(2, 3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + t1(i, j) = s1.str(); + std::ostringstream s2; + s2 << "def" << i*5 + j*32; + t2(i, j) = s2.str(); + } + } + + Tensor result = t1.concatenate(t2, 1); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 6); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(result(i, j), t1(i, j)); + VERIFY_IS_EQUAL(result(i, j+3), t2(i, j)); + } + } +} + + +static void test_slices() +{ + Tensor data(2, 6); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + std::ostringstream s1; + s1 << "abc" << i + j*2; + data(i, j) = s1.str(); + } + } + + const Eigen::DSizes half_size{{2, 3}}; + const Eigen::DSizes first_half{{0, 0}}; + const Eigen::DSizes second_half{{0, 3}}; + + Tensor t1 = data.slice(first_half, half_size); + Tensor t2 = data.slice(second_half, half_size); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(data(i, j), t1(i, j)); + VERIFY_IS_EQUAL(data(i, j+3), t2(i, j)); + } + } +} + + +static void test_additions() +{ + Tensor data1(3); + Tensor data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = "abc"; + std::ostringstream s1; + s1 << i; + data2(i) = s1.str(); + } + + Tensor sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + std::ostringstream concat; + concat << "abc" << i; + string expected = concat.str(); + VERIFY_IS_EQUAL(sum(i), expected); + } +} + + +void test_cxx11_tensor_of_strings() +{ + // Beware: none of this is likely to ever work on a GPU. 
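+  // std::string allocates on the host heap and is not a POD type, so
+  // device-side evaluation of these expressions is not an option.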
+ CALL_SUBTEST(test_assign()); + CALL_SUBTEST(test_concat()); + CALL_SUBTEST(test_slices()); + CALL_SUBTEST(test_additions()); +} From 7caaf6453b7b1f58d953729380d596b2d9b27835 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Oct 2014 20:38:22 -0700 Subject: [PATCH 059/214] Added support for tensor reductions and concatenations --- unsupported/Eigen/CXX11/Tensor | 3 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 28 +++ .../CXX11/src/Tensor/TensorConcatenation.h | 217 +++++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 62 +++++ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 226 ++++++++++++++++++ unsupported/test/CMakeLists.txt | 4 +- .../test/cxx11_tensor_concatenation.cpp | 110 +++++++++ unsupported/test/cxx11_tensor_reduction.cpp | 147 ++++++++++++ 9 files changed, 798 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h create mode 100644 unsupported/test/cxx11_tensor_concatenation.cpp create mode 100644 unsupported/test/cxx11_tensor_reduction.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index ebe6419e8..11161a547 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -34,12 +34,15 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 2da8f8cc8..2f7c9ecda 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -204,12 +204,40 @@ class TensorBase return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } + // Reductions. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp broadcast(const Broadcast& broadcast) const { return TensorBroadcastingOp(derived(), broadcast); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + // Morphing operators. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 000000000..b8e43f484 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,217 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename promote_storage_type::ret Scalar; + typedef typename packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + enum { Flags = 0 }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Packet Packet; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int RightNumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + int i = 0; + for (; i < m_axis; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. 
+ eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1]; + m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). + array subs; + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. 
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + protected: + const Axis m_axis; + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index afbcc9486..bc67586a4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,8 +21,9 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorReductionOp; template class TensorBroadcastingOp; +template class TensorReductionOp; +template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 000000000..92984336c --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + +// Standard reduction functors +template struct SumReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SumReducer() : m_sum(0) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + m_sum += t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_sum; + } + + private: + T m_sum; +}; + +template struct MaxReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max((std::numeric_limits::min)()) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + if (t > m_max) { m_max = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_max; + } + + private: + T m_max; +}; + +template struct MinReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MinReducer() : m_min((std::numeric_limits::max)()) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { + if (t < m_min) { m_min = t; } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { + return m_min; + } + + private: + T m_min; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 000000000..eef992106 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,226 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +namespace Eigen { + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. 
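+ * Folds the input expression over a set of dimensions, using either one of
+ * the standard reducers (sum, max, min) or a user-supplied functor.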
+ * + */ + +namespace internal { +template +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReductionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReductionOp type; +}; + +} // end namespace internal + + +template +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumDims = (NumInputDims==NumReducedDims) ? 
1 : NumInputDims - NumReducedDims; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = false, // The code isn't vectorized properly yet + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()) + { + EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + + array reduced; + for (int i = 0; i < NumInputDims; ++i) { + reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedDims[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + m_dimensions[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + + array strides; + strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + strides[i] = strides[i-1] * input_dims[i-1]; + } + outputIndex = 0; + reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + m_reducedStrides[reduceIndex] = strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = strides[i]; + ++outputIndex; + } + } + + // Special case for full reductions + if (NumInputDims == NumReducedDims) { + m_dimensions[0] = 1; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Op reducer(m_reducer); + reduce(firstInput(index), 0, reducer); + return reducer.finalize(); + } + + // TODO(bsteiner): provide a more efficient implementation. 
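+  // The current packet path simply evaluates coeff() packetSize times and
+  // packs the results.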
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + return startInput; + } + + EIGEN_DEVICE_FUNC void reduce(Index firstIndex, int DimIndex, Op& reducer) const { + for (int j = 0; j < m_reducedDims[DimIndex]; ++j) { + const Index input = firstIndex + j * m_reducedStrides[DimIndex]; + if (DimIndex < NumReducedDims-1) { + reduce(input, DimIndex+1, reducer); + } else { + reducer.reduce(m_impl.coeff(input)); + } + } + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_preservedStrides; + array m_reducedStrides; + array m_reducedDims; + Op m_reducer; + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 8d4e7db66..e83d8b54e 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,14 +106,16 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") - ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") +# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") + ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_reduction "-std=c++0x") # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp new file mode 100644 index 000000000..8fd4f5f80 --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation.cpp @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_dimension_failures() +{ + Tensor left(2, 3, 1); + Tensor right(3, 3, 1); + left.setRandom(); + right.setRandom(); + + // Okay; other dimensions are equal. + Tensor concatenation = left.concatenate(right, 0); + + // Dimension mismatches. 
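+  // Dimension 0 differs between left and right (2 vs 3), so concatenating
+  // along any other axis has to trigger the assertion.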
+ VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2)); + + // Axis > NumDims or < 0. + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3)); + VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1)); +} + +static void test_static_dimension_failure() +{ + Tensor left(2, 3); + Tensor right(2, 3, 1); + +#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE + // Technically compatible, but we static assert that the inputs have same + // NumDims. + Tensor concatenation = left.concatenate(right, 0); +#endif + + // This can be worked around in this case. + Tensor concatenation = left + .reshape(Tensor::Dimensions{{2, 3, 1}}) + .concatenate(right, 0); + Tensor alternative = left + .concatenate(right.reshape(Tensor::Dimensions{{2, 3}}), 0); +} + +static void test_simple_concatenation() +{ + Tensor left(2, 3, 1); + Tensor right(2, 3, 1); + left.setRandom(); + right.setRandom(); + + Tensor concatenation = left.concatenate(right, 0); + VERIFY_IS_EQUAL(concatenation.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0)); + } + } + + concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation.dimension(2), 1); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + } + for (int j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0)); + } + } + + concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation.dimension(2), 2); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0)); + } + } +} + + +// TODO(phli): Add test once we have a real vectorized implementation. +// static void test_vectorized_concatenation() {} + + +void test_cxx11_tensor_concatenation() +{ + CALL_SUBTEST(test_dimension_failures()); + CALL_SUBTEST(test_static_dimension_failure()); + CALL_SUBTEST(test_simple_concatenation()); + // CALL_SUBTEST(test_vectorized_concatenation()); +} diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp new file mode 100644 index 000000000..27135b982 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" +#include +#include + +using Eigen::Tensor; + +static void test_simple_reductions() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), sum); + } + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float max_val = std::numeric_limits::lowest(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + max_val = (std::max)(max_val, tensor(k, i, l, j)); + } + } + VERIFY_IS_APPROX(result(i, j), max_val); + } + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float min_val = (std::numeric_limits::max)(); + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + min_val = (std::min)(min_val, tensor(k, l, i, j)); + } + } + VERIFY_IS_APPROX(result(i, j), min_val); + } + } +} + + +static void test_full_reductions() +{ + Tensor tensor(2,3); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + + Tensor result = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + float sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sum); + + result = tensor.square().sum(reduction_axis).sqrt(); + VERIFY_IS_EQUAL(result.dimension(0), 1); + + sum = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + sum += tensor(i, j) * tensor(i, j); + } + } + VERIFY_IS_APPROX(result(0), sqrtf(sum)); +} + + +struct UserReducer { + UserReducer(float offset) : offset_(offset), sum_(0.0f) {} + void reduce(const float val) { + sum_ += val * val; + } + float finalize() const { + return 1.0f / (sum_ + offset_); + } + + private: + float offset_; + float sum_; +}; + +static void test_user_defined_reductions() +{ + Tensor tensor(5,7); + tensor.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + + UserReducer reducer(10.0f); + Tensor result = tensor.reduce(reduction_axis, reducer); + VERIFY_IS_EQUAL(result.dimension(0), 5); + for (int i = 0; i < 5; ++i) { + float expected = 10.0f; + for (int j = 0; j < 7; ++j) { + expected += tensor(i, j) * tensor(i, j); + } + expected = 1.0f / expected; + VERIFY_IS_APPROX(result(i), expected); + } +} + + +void test_cxx11_tensor_reduction() +{ + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); +} From 5cc23199be743d0d1be85d709eb366e67e87a262 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:30:44 -0700 Subject: [PATCH 060/214] More tests to validate the const-correctness of the tensor code. 
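
The pattern these tests pin down, sketched with hypothetical data and
shapes (the actual tests are in the diff below):

    const float data[6] = {0, 1, 2, 3, 4, 5};
    // The mapped scalar type is const-qualified, so the map is read-only.
    TensorMap<Tensor<const float, 2> > input(data, 2, 3);
    Tensor<float, 2> output(2, 3);
    output = input + input;  // const tensors must stay usable as rhs operands
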
--- Eigen/src/Core/GenericPacketMath.h | 2 ++ Eigen/src/Core/util/XprHelper.h | 8 +++++ unsupported/test/CMakeLists.txt | 3 +- unsupported/test/cxx11_tensor_const.cpp | 39 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 unsupported/test/cxx11_tensor_const.cpp diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6ec29d0fd..e6fea5bba 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -95,6 +95,8 @@ template struct packet_traits : default_packet_traits }; }; +template struct packet_traits : packet_traits { }; + /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 7c77b2263..67ca49754 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -415,6 +415,14 @@ template struct promote_storage_type { typedef A ret; }; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type. * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType. diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83d8b54e..a47c7bc74 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -106,7 +106,8 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") -# ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_const "-std=c++0x") + ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp new file mode 100644 index 000000000..0ffb02afd --- /dev/null +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +using Eigen::Tensor; + + + + +static void test_simple_assign() +{ + Tensor random(2,3,7); + random.setRandom(); + + TensorMap > constant(random.data(), 2, 3, 7); + Tensor result(2,3,7); + result = constant; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k)); + } + } + } +} + +void test_cxx11_tensor_const() +{ + CALL_SUBTEST(test_simple_assign()); +} From 8b2afe33a165ff0cc5a7afd14fcfb06cdf703235 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 10:39:36 -0700 Subject: [PATCH 061/214] Fixes for the forced evaluation of tensor expressions More tests --- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 13 +++-- unsupported/test/CMakeLists.txt | 3 +- unsupported/test/cxx11_tensor_dimension.cpp | 51 +++++++++++++++++++ unsupported/test/cxx11_tensor_forced_eval.cpp | 51 +++++++++++++++++++ 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_dimension.cpp create mode 100644 unsupported/test/cxx11_tensor_forced_eval.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 6f6641de6..cb14cc7f7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -87,31 +87,28 @@ struct TensorEvaluator, Device> enum { IsAligned = true, - PacketAccess = true, + PacketAccess = (internal::packet_traits::size > 1), }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } - EIGEN_DEVICE_FUNC ~TensorEvaluator() { - eigen_assert(!m_buffer); - } - typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { - m_impl.evalSubExprsIfNeeded(); + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); m_impl.cleanup(); + return true; } EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate(m_buffer); @@ -129,6 +126,8 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } + Scalar* data() const { return m_buffer; } + private: TensorEvaluator m_impl; const ArgType m_op; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index a47c7bc74..5d8913dd8 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -101,10 +101,12 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") # ei_add_test(cxx11_tensor_assign "-std=c++0x") +# ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") + ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") 
ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") @@ -120,6 +122,5 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") # ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp new file mode 100644 index 000000000..fc0d29c50 --- /dev/null +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_dynamic_size() +{ + Eigen::DSizes dimensions(Eigen::array(2,3,7)); + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + +static void test_fixed_size() +{ + Eigen::Sizes<2,3,7> dimensions; + + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); + VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); + VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); +} + + +static void test_match() +{ + Eigen::DSizes dyn(Eigen::array(2,3,7)); + Eigen::Sizes<2,3,7> stat; + VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true); +} + + +void test_cxx11_tensor_dimension() +{ + CALL_SUBTEST(test_dynamic_size()); + CALL_SUBTEST(test_fixed_size()); + CALL_SUBTEST(test_match()); +} diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp new file mode 100644 index 000000000..529584a7b --- /dev/null +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include +#include + +using Eigen::MatrixXf; +using Eigen::Tensor; + +static void test_simple() +{ + MatrixXf m1(3,3); + MatrixXf m2(3,3); + m1.setRandom(); + m2.setRandom(); + + TensorMap> mat1(m1.data(), 3,3); + TensorMap> mat2(m2.data(), 3,3); + + Tensor mat3(3,3); + mat3 = mat1; + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 0)}}); + + mat3 = mat3.contract(mat2, dims).eval(); + + VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0)); + VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1)); + VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2)); + VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0)); + VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1)); + VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2)); + VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0)); + VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1)); + VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2)); +} + + +void test_cxx11_tensor_forced_eval() +{ + CALL_SUBTEST(test_simple()); +} From b7271dffb5b1ceeee4c8bd99402ff89dcce58d74 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 2 Oct 2014 16:51:57 -0700 Subject: [PATCH 062/214] Generalized the gebp apis --- .../Core/products/GeneralBlockPanelKernel.h | 423 +++++++++--------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 80 ++-- .../products/GeneralMatrixMatrixTriangular.h | 54 ++- .../Core/products/SelfadjointMatrixMatrix.h | 51 ++- .../Core/products/TriangularMatrixMatrix.h | 65 +-- .../Core/products/TriangularSolverMatrix.h | 49 +- Eigen/src/Core/util/BlasUtil.h | 108 ++++- unsupported/test/CMakeLists.txt | 2 +- 8 files changed, 474 insertions(+), 358 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7da52c2e8..090c8f4e6 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -667,7 +667,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template +template struct gebp_kernel { typedef gebp_traits Traits; @@ -676,14 +676,15 @@ struct gebp_kernel typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - + typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - + + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, @@ -693,14 +694,16 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template +template EIGEN_DONT_INLINE -void gebp_kernel - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, +void gebp_kernel + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, Index strideA, Index 
strideB, Index offsetA, Index offsetB) { Traits traits; @@ -743,15 +746,15 @@ void gebp_kernel traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0); - internal::prefetch(r1); - internal::prefetch(r2); - internal::prefetch(r3); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -813,48 +816,48 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r1+0*Traits::ResPacketSize); - R1 = ploadu(r1+1*Traits::ResPacketSize); - R2 = ploadu(r1+2*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); - pstoreu(r1+0*Traits::ResPacketSize, R0); - pstoreu(r1+1*Traits::ResPacketSize, R1); - pstoreu(r1+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r2+2*Traits::ResPacketSize); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r2+2*Traits::ResPacketSize, R2); - - R0 = ploadu(r3+0*Traits::ResPacketSize); - R1 = ploadu(r3+1*Traits::ResPacketSize); - R2 = ploadu(r3+2*Traits::ResPacketSize); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); - pstoreu(r3+0*Traits::ResPacketSize, R0); - pstoreu(r3+1*Traits::ResPacketSize, R1); - pstoreu(r3+2*Traits::ResPacketSize, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * 
Traits::ResPacketSize, R2); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C4); traits.initAcc(C8); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -912,19 +916,19 @@ void gebp_kernel ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r0+2*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - traits.acc(C8 , alphav, R2); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r0+2*Traits::ResPacketSize, R2); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); } } } - + //---------- Process 2 * LhsProgress rows at once ---------- if(mr>=2*Traits::LhsProgress) { @@ -946,15 +950,15 @@ void gebp_kernel traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -978,7 +982,7 @@ void gebp_kernel traits.madd(A1, B2, C6, B2); \ traits.madd(A0, B3, C3, T0); \ traits.madd(A1, B3, C7, B3) - + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1002,37 +1006,37 @@ void gebp_kernel blA += 2*Traits::LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1, R2, R3; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); - R2 = ploadu(r1+0*Traits::ResPacketSize); - R3 = ploadu(r1+1*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); traits.acc(C5, alphav, R3); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); - pstoreu(r1+0*Traits::ResPacketSize, R2); - pstoreu(r1+1*Traits::ResPacketSize, R3); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r2+1*Traits::ResPacketSize); - R2 = ploadu(r3+0*Traits::ResPacketSize); - R3 = ploadu(r3+1*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * 
Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); traits.acc(C7, alphav, R3); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r2+1*Traits::ResPacketSize, R1); - pstoreu(r3+0*Traits::ResPacketSize, R2); - pstoreu(r3+1*Traits::ResPacketSize, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - + // Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 traits.initAcc(C0); traits.initAcc(C4); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - internal::prefetch(r0+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1089,12 +1093,12 @@ void gebp_kernel ResPacket R0, R1; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r0+1*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r0+1*Traits::ResPacketSize, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); } } } @@ -1120,15 +1124,15 @@ void gebp_kernel traits.initAcc(C2); traits.initAcc(C3); - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = &res[(j2+1)*resStride + i]; - ResScalar* r2 = &res[(j2+2)*resStride + i]; - ResScalar* r3 = &res[(j2+3)*resStride + i]; - - internal::prefetch(r0+prefetch_res_offset); - internal::prefetch(r1+prefetch_res_offset); - internal::prefetch(r2+prefetch_res_offset); - internal::prefetch(r3+prefetch_res_offset); + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; @@ -1171,25 +1175,25 @@ void gebp_kernel blA += 1*LhsProgress; } #undef EIGEN_GEBGP_ONESTEP - + ResPacket R0, R1; ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*Traits::ResPacketSize); - R1 = ploadu(r1+0*Traits::ResPacketSize); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); - pstoreu(r0+0*Traits::ResPacketSize, R0); - pstoreu(r1+0*Traits::ResPacketSize, R1); - - R0 = ploadu(r2+0*Traits::ResPacketSize); - R1 = ploadu(r3+0*Traits::ResPacketSize); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); - pstoreu(r2+0*Traits::ResPacketSize, R0); - pstoreu(r3+0*Traits::ResPacketSize, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); } - + // 
Deal with remaining columns of the rhs for(Index j2=packet_cols4; j2 AccPacket C0; traits.initAcc(C0); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); // performs "inner" products const RhsScalar* blB = &blockB[j2*strideB+offsetB]; @@ -1241,9 +1245,9 @@ void gebp_kernel #undef EIGEN_GEBGP_ONESTEP ResPacket R0; ResPacket alphav = pset1(alpha); - R0 = ploadu(r0+0*Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0+0*Traits::ResPacketSize, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); } } } @@ -1259,7 +1263,7 @@ void gebp_kernel const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - + if( (SwappedTraits::LhsProgress % 4)==0 ) { // NOTE The following piece of code wont work for 512 bit registers @@ -1268,32 +1272,32 @@ void gebp_kernel straits.initAcc(C1); straits.initAcc(C2); straits.initAcc(C3); - + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); const Index endk = (depth/spk)*spk; const Index endk4 = (depth/(spk*4))*(spk*4); - + Index k=0; for(; k { SLhsPacket A0; SRhsPacket B_0; - + straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); straits.madd(A0,B_0,C0,B_0); - + blB += SwappedTraits::LhsProgress; blA += spk; } @@ -1317,10 +1321,10 @@ void gebp_kernel typedef typename conditional::half,SLhsPacket>::type SLhsPacketHalf; typedef typename conditional::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional::half,SAccPacket>::type SAccPacketHalf; - - SResPacketHalf R = pgather(&res[j2*resStride + i], resStride); + + SResPacketHalf R = res.template gatherPacket(i, j2); SResPacketHalf alphav = pset1(alpha); - + if(depth-endk>0) { // We have to handle the last row of the rhs which corresponds to a half-packet @@ -1336,14 +1340,14 @@ void gebp_kernel { straits.acc(predux4(C0), alphav, R); } - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } else { - SResPacket R = pgather(&res[j2*resStride + i], resStride); + SResPacket R = res.template gatherPacket(i, j2); SResPacket alphav = pset1(alpha); straits.acc(C0, alphav, R); - pscatter(&res[j2*resStride + i], R, resStride); + res.scatterPacket(i, j2, R); } } else // scalar path @@ -1355,25 +1359,25 @@ void gebp_kernel { LhsScalar A0; RhsScalar B_0, B_1; - + A0 = blA[k]; - + B_0 = blB[0]; B_1 = blB[1]; MADD(cj,A0,B_0,C0, B_0); MADD(cj,A0,B_1,C1, B_1); - + B_0 = blB[2]; B_1 = blB[3]; MADD(cj,A0,B_0,C2, B_0); MADD(cj,A0,B_1,C3, B_1); - + blB += 4; } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - res[(j2+2)*resStride + i] += alpha*C2; - res[(j2+3)*resStride + i] += alpha*C3; + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; } } } @@ -1394,7 +1398,7 @@ void gebp_kernel RhsScalar B_0 = blB[k]; MADD(cj, A0, B_0, C0, B_0); } - res[(j2+0)*resStride + i] += alpha*C0; + res(i, j2) += alpha * C0; } } } @@ -1417,15 +1421,16 @@ void gebp_kernel // // 32 33 34 35 ... // 36 36 38 39 ... 
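// Illustration only: a minimal scalar sketch of the panel packing that the
// numbering in the comment above describes. The real gemm_pack_lhs uses
// Packet loads, ptranspose and conjugation; `packLhsScalar` is a hypothetical
// name, and the stride/offset parameters of the actual kernel are omitted.
#include <algorithm>
#include <cstddef>
#include <vector>

// Pack a column-major rows x depth matrix into row panels of height mr; each
// panel is stored depth-major so the GEBP kernel can stream it sequentially.
template <typename Scalar>
void packLhsScalar(std::vector<Scalar>& blockA, const Scalar* lhs,
                   std::ptrdiff_t lhsStride, std::ptrdiff_t rows,
                   std::ptrdiff_t depth, std::ptrdiff_t mr) {
  blockA.clear();
  for (std::ptrdiff_t i = 0; i < rows; i += mr) {
    const std::ptrdiff_t panel = std::min(mr, rows - i);  // last panel may be short
    for (std::ptrdiff_t k = 0; k < depth; ++k)
      for (std::ptrdiff_t r = 0; r < panel; ++r)
        blockA.push_back(lhs[(i + r) + k * lhsStride]);
  }
}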
-template -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1436,30 +1441,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; - + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 : Pack2>1 ? (rows/Pack2)*Pack2 : 0; - + Index i=0; - + // Pack 3 packets if(Pack1>=3*PacketSize) { for(; i(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); - C = ploadu(&lhs(i+2*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1473,12 +1477,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); - B = ploadu(&lhs(i+1*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1491,11 +1495,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(&lhs(i+0*PacketSize, k)); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1508,11 +1512,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; @@ -1543,13 +1548,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 
0; - + // const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - + int pack = Pack1; Index i = 0; while(pack>0) @@ -1569,7 +1573,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu(&lhs(i+p+m, k)); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1594,15 +1598,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1685,27 +1690,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=4) { for(Index j2=packet_cols8; j2 kernel; - kernel.packet[0] = ploadu(&b0[k]); - kernel.packet[1] = ploadu(&b1[k]); - kernel.packet[2] = ploadu(&b2[k]); - kernel.packet[3] = ploadu(&b3[k]); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1] = dm1.loadPacket(k); + kernel.packet[2] = dm2.loadPacket(k); + kernel.packet[3] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); @@ -1716,10 +1721,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs -struct gemm_pack_rhs +template +struct gemm_pack_rhs { typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); EIGEN_UNUSED_VARIABLE(stride); @@ -1762,7 +1768,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - + // if(nr>=8) // { // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - blockB[count+2] = cj(b0[2]); - blockB[count+3] = cj(b0[3]); + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); count += 4; } } @@ -1825,10 +1831,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs::ReturnType ResScal static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; #ifdef EIGEN_HAS_OPENMP if(info) @@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth, // In order to reduce the chance that a thread has to wait for the other, // let's start by packing B'. - pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc); + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); // Pack A_k to A' in a parallel fashion: // each thread packs the sub block A_k,i to A'_i where i is the thread id. @@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth, // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - - pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length); + + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; @@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth, // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! 
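// Illustration only: the producer/consumer handshake used below, rewritten
// with std::atomic so the ordering is explicit. Eigen's GemmParallelInfo
// fields are plain integers managed by the parallelizer; `PanelInfo` and the
// two helper functions are hypothetical stand-ins, not the real API.
#include <atomic>

struct PanelInfo {
  std::atomic<long> sync{-1};  // k index of the last A'_i panel published
  std::atomic<int> users{0};   // readers still holding on to A'_i
};

// Producer (thread i), once its slice A'_i is packed for depth panel k:
inline void publishPanel(PanelInfo& info, long k) {
  info.sync.store(k, std::memory_order_release);
}

// Consumer, before multiplying against thread i's slice at depth panel k:
inline void waitForPanel(const PanelInfo& info, long k) {
  while (info.sync.load(std::memory_order_acquire) != k) {
    // spin: the wait is expected to be short, so blocking would cost more
  }
}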
- if(shift>0) - while(info[i].sync!=k) {} - gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + if (shift>0) { + while(info[i].sync!=k) { + } + } + + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } // Then keep going as usual with the remaining B' @@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth, const Index actual_nc = (std::min)(j+nc,cols)-j; // pack B_k,j to B' - pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc); + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); // C_j += A' * B' - gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha); + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } // Release all the sub blocks A'_i of A' for the current thread, @@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); // For each horizontal panel of the rhs, and corresponding panel of the lhs... - for(Index k2=0; k2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) - // Note that this panel will be read as many times as the number of blocks in the rhs's - // horizontal panel which is, in practice, a very low number. - pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows); - - // For each kc x nc block of the rhs's horizontal panel... - for(Index j2=0; j2 Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... 
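// Illustration only: the block structure of the serial path above, with the
// packing steps and the GEBP micro-kernel collapsed into plain scalar loops.
// kc/mc/nc play the role of blocking.kc()/mc()/nc(); matrices are column
// major and `blockedGemmSketch` is a hypothetical name, not Eigen code.
#include <algorithm>
#include <cstddef>

template <typename Scalar>
void blockedGemmSketch(const Scalar* A, std::ptrdiff_t lda,
                       const Scalar* B, std::ptrdiff_t ldb,
                       Scalar* C, std::ptrdiff_t ldc,
                       std::ptrdiff_t rows, std::ptrdiff_t cols,
                       std::ptrdiff_t depth, std::ptrdiff_t mc,
                       std::ptrdiff_t kc, std::ptrdiff_t nc, Scalar alpha) {
  for (std::ptrdiff_t i2 = 0; i2 < rows; i2 += mc) {       // vertical lhs panel
    const std::ptrdiff_t amc = std::min(mc, rows - i2);
    for (std::ptrdiff_t k2 = 0; k2 < depth; k2 += kc) {    // A' would be packed here
      const std::ptrdiff_t akc = std::min(kc, depth - k2);
      for (std::ptrdiff_t j2 = 0; j2 < cols; j2 += nc) {   // B' packed, gebp run here
        const std::ptrdiff_t anc = std::min(nc, cols - j2);
        for (std::ptrdiff_t j = 0; j < anc; ++j)
          for (std::ptrdiff_t i = 0; i < amc; ++i) {
            Scalar acc = Scalar(0);                        // scalar stand-in for gebp
            for (std::ptrdiff_t k = 0; k < akc; ++k)
              acc += A[(i2 + i) + (k2 + k) * lda] * B[(k2 + k) + (j2 + j) * ldb];
            C[(i2 + i) + (j2 + j) * ldc] += alpha * acc;   // accumulates across k2
          }
      }
    }
  }
}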
+ for(Index j2=0; j2m_nc; computeProductBlockingSizes(this->m_kc, this->m_mc, n); } - + m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 225b994d1..daa8a1d8a 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -58,13 +58,17 @@ struct general_matrix_matrix_triangular_product::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha) { - const_blas_data_mapper lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = depth; // cache block size along the K direction Index mc = size; // cache block size along the M direction Index nc = size; // cache block size along the N direction @@ -75,10 +79,10 @@ struct general_matrix_matrix_triangular_product pack_lhs; - gemm_pack_rhs pack_rhs; - gebp_kernel gebp; + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; tribb_kernel sybb; for(Index k2=0; k2 processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); + + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -129,13 +134,16 @@ struct tribb_kernel { typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; - + enum { BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + ResMapper res(_res, resStride); + gebp_kernel gebp_kernel; + Matrix buffer; // let's process the block per panel of actual_mc x BlockSize, @@ -146,7 +154,7 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0); // selfadjoint 
micro block @@ -154,12 +162,12 @@ struct tribb_kernel Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1 lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper LhsTransposeMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; + gebp_kernel gebp_kernel; symm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_rhs pack_rhs; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2 transposed packed copy @@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -414,15 +420,18 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix lhs(_lhs,lhsStride); - typedef gebp_traits Traits; + typedef const_blas_data_mapper LhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + Index kc = size; // cache block size along the K direction Index mc = rows; // cache block size along the M direction Index nc = cols; // cache block size along the N direction @@ -432,8 +441,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2& blocking) { // strip zeros @@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? 
k2>0 : k2 skip it @@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { // strip zeros @@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix lhs(_lhs,lhsStride); - const_blas_data_mapper rhs(_rhs,rhsStride); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; @@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix0) @@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix& blocking) { Index cols = otherSize; - const_blas_data_mapper tri(_tri,triStride); - blas_data_mapper other(_other,otherStride); + + typedef const_blas_data_mapper TriMapper; + typedef blas_data_mapper OtherMapper; + TriMapper tri(_tri, triStride); + OtherMapper other(_other, otherStride); typedef gebp_traits Traits; + enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), IsLower = (Mode&Lower) == Lower @@ -71,9 +75,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; - gemm_pack_rhs pack_rhs; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache // coherence when accessing the rhs elements @@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; - pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), + gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset); } } @@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? 
k2 : k2-kc), actual_kc, actual_mc); - gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); + gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0); } } } @@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix& blocking) { Index rows = otherSize; - const_blas_data_mapper rhs(_tri,triStride); - blas_data_mapper lhs(_other,otherStride); + + typedef blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + LhsMapper lhs(_other, otherStride); + RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; enum { @@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; - gebp_kernel gebp_kernel; - gemm_pack_rhs pack_rhs; - gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gebp_kernel gebp_kernel; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs); // triangular packing (we only pack the panels off the diagonal, // neglecting the blocks overlapping the diagonal @@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) { - gebp_kernel(&lhs(i2,absolute_j2), otherStride, + gebp_kernel(lhs.getSubMapper(i2,absolute_j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, Scalar(-1), @@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix0) - gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 0d8e2705a..25a62d528 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -18,13 +18,13 @@ namespace Eigen { namespace internal { // forward declarations -template +template struct gebp_kernel; -template +template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -117,32 +117,96 @@ template struct get_factor::R static EIGEN_STRONG_INLINE typename NumTraits::Real run(const Scalar& x) { return numext::real(x); } }; -// Lightweight helper class to access matrix coefficients. -// Yes, this is somehow redundant with Map<>, but this version is much much lighter, -// and so I hope better compilation performance (time and code quality). -template -class blas_data_mapper -{ + +template +class MatrixLinearMapper { public: - blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j) - { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); + } + + EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + pstoret(m_data + i, p); + } + protected: - Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + Scalar *m_data; +}; + +// Lightweight helper class to access matrix coefficients. +template +class blas_data_mapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef MatrixLinearMapper LinearMapper; + + EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + + EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride); + } + + EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; + } + + EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + + template + EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + + protected: + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; }; // lightweight helper class to access matrix coefficients (const version) template -class const_blas_data_mapper -{ +class const_blas_data_mapper : public blas_data_mapper { public: - const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {} - EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const - { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - protected: - const Scalar* EIGEN_RESTRICT m_data; - Index m_stride; + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper(data, stride) {} + + EIGEN_ALWAYS_INLINE const_blas_data_mapper getSubMapper(Index i, Index j) const { + return const_blas_data_mapper(&(this->operator()(i, j)), this->m_stride); + } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5d8913dd8..75423f516 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -48,7 +48,7 @@ if(MPFR_FOUND) include_directories(${MPFR_INCLUDES} ./mpreal) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) +# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() From 12693928228922ecf8fa3fcf14341d195e376a11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 10:16:59 -0700 Subject: [PATCH 063/214] Created the IndexPair type to store pair of tensor indices. CUDA doesn't support std::pair so we can't use them when targeting GPUs. Improved the performance on tensor contractions --- .../CXX11/src/Core/util/CXX11Workarounds.h | 4 +- .../CXX11/src/Tensor/TensorContraction.h | 741 +++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 7 + 3 files changed, 662 insertions(+), 90 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 3812ecd1f..227522ecb 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -69,11 +69,13 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK template struct array_size; +template struct array_size > { + static const size_t value = N; +}; template struct array_size > { static const size_t value = N; }; - /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 46624724c..1e6f276e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,6 +20,319 @@ namespace Eigen { * */ namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. 
+ */ +template +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + 
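// Illustration only: the offset-forwarding idea behind the sub-mapper above,
// stripped down to a toy column-major base mapper. The real sub-mapper also
// forwards packet and half-packet loads; `ToyMapper` and `ToySubMapper` are
// hypothetical names used only for this sketch.
struct ToyMapper {
  const float* data;
  long stride;                                   // column-major leading dimension
  float operator()(long i, long j) const { return data[i + j * stride]; }
};

struct ToySubMapper {
  const ToyMapper& base;
  long i0, j0;                                   // fixed block corner
  // Every access is the base access shifted by the stored offsets, so the
  // GEBP kernels can address a block in local (i, j) coordinates.
  float operator()(long i, long j) const { return base(i0 + i, j0 + j); }
  float operator()(long i) const { return base(i0 + i, j0); }  // linear column view
};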
+ EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + template struct traits > { @@ -53,6 +366,14 @@ struct nested, 1, typena typedef TensorContractionOp type; }; +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; +}; + } // end namespace internal @@ -102,143 +423,385 @@ template <> struct max_n_1<0> { }; -template 
-struct TensorEvaluator, Device> +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - typedef typename XprType::Index Index; + typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; - TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) { - Index index = 0; - Index stride = 1; - m_shiftright = 1; + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); - int skipped = 0; + array::Dimensions::count> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + } + + array::Dimensions::count> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + } + + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. 
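// Worked example (illustration only) of the bookkeeping the comment above
// sets up with strides: contract a (2,3,4) tensor with a (4,5) tensor over
// lhs dim 2 / rhs dim 0. The non-contracting lhs dims flatten into the row
// extent m, the contracted extent is k, the rhs non-contracting dims give n,
// and the output keeps dims (2,3,5).
#include <array>

int main() {
  const std::array<long, 3> lhs_dims{2, 3, 4};  // contract on lhs dim 2
  const std::array<long, 2> rhs_dims{4, 5};     // contract on rhs dim 0
  const long m = lhs_dims[0] * lhs_dims[1];     // 6 rows in the equivalent GEMM
  const long k = lhs_dims[2];                   // 4, must equal rhs_dims[0]
  const long n = rhs_dims[1];                   // 5 columns
  return (m == 6 && k == 4 && n == 5 && k == rhs_dims[0]) ? 0 : 1;
}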
+ m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < internal::array_size::value; j++) { if (op.indices()[j].first == i) { - skip = true; - m_leftOffsets[2*skipped] = stride; - m_leftOffsets[2*skipped+1] = stride * left_dims[i]; - m_stitchsize[skipped] = left_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = left_dims[i]; - m_shiftright *= left_dims[i]; - } else { - ++skipped; + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + } + dim_idx++; + nocontract_idx++; } - stride *= left_dims[i]; } - stride = 1; - skipped = 0; + nocontract_idx = 0; const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < internal::array_size::value; j++) { if (op.indices()[j].second == i) { - skip = true; - m_rightOffsets[2*skipped] = stride; - m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = right_dims[i]; - } else { - ++skipped; + if (!contracting) { + m_dimensions[dim_idx] = right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; } - stride *= right_dims[i]; } - // Scalar case + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. 
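// Illustration only: how the two rhs flags set below could be derived for a
// flat list of contracted rhs dimensions; the evaluator computes the same
// thing incrementally inside its stride loop. `classifyRhsDims` is a
// hypothetical helper, not part of Eigen.
#include <cstddef>
#include <vector>

struct RhsContractionFlags {
  bool inner_dim_contiguous = true;  // contracted rhs dims are exactly 0,1,2,...
  bool inner_dim_reordered = false;  // contracted rhs dims appear out of order
};

inline RhsContractionFlags classifyRhsDims(const std::vector<long>& rhs_dims) {
  RhsContractionFlags f;
  for (std::size_t i = 0; i < rhs_dims.size(); ++i) {
    if (rhs_dims[i] != static_cast<long>(i)) f.inner_dim_contiguous = false;
    if (i > 0 && rhs_dims[i] < rhs_dims[i - 1]) f.inner_dim_reordered = true;
  }
  return f;
}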
+ m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < internal::array_size::value; i++) { + Index left = op.indices()[i].first; + Index right = op.indices()[i].second; + + Index size = left_dims[left]; + eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + + if (i+1 < internal::array_size::value) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < op.indices()[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // Scalar case. We represent the result as a 1d tensor of size 1. if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - const Dimensions& dimensions() const { return m_dimensions; } - - void evalTo(typename XprType::Scalar* buffer) const { - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - const Index startLeft = index % m_shiftright; - const Index startRight = index / m_shiftright; - CoeffReturnType result = CoeffReturnType(0); - partialStitch(startLeft, startRight, 0, result); - return result; - } - - /* TODO: vectorization - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - assert(false); - }*/ - - private: - EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { - Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); - Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); - - for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { - const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; - const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; - if (StitchIndex < internal::array_size::value-1) { - partialStitch(left, right, StitchIndex+1, accum); - } else { - accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + 
static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } } } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - private: - array::value> m_leftOffsets; - array::value> m_rightOffsets; - array::value> m_stitchsize; - Index m_shiftright; Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + const Device& m_device; + Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; }; +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type 
RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + int sizeA = mc * kc; + int sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5a113dc19..11590b474 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,6 +29,13 @@ namespace Eigen { * \sa Tensor */ +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; // Boiler plate code From af2e5995e2ba48384024bbc8432bd6dbbebf71d2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:18:07 -0700 Subject: [PATCH 064/214] Improved support for CUDA devices. 
Improved contractions on GPU --- unsupported/Eigen/CXX11/Tensor | 1 + .../CXX11/src/Tensor/TensorContractionCuda.h | 1206 +++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 38 +- 3 files changed, 1237 insertions(+), 8 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 11161a547..b1bd2f676 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -44,6 +44,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 000000000..babe33fff --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,1206 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. 
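+  // A worked example of why the padding matters (illustrative arithmetic
+  // only, assuming the usual 32 banks of 4-byte words): the 8 threads that
+  // differ only in threadIdx.x store to float offsets 9 * x for x = 0..7,
+  // i.e. banks {0, 9, 18, 27, 4, 13, 22, 31}, which are all distinct, so
+  // the stores are conflict free. With unpadded 8-element rows the offsets
+  // 8 * x would hit banks {0, 8, 16, 24, 0, 8, 16, 24}, a 2-way conflict
+  // on every store.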
+ + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = Scalar(0); \ + lhs_pf1 = Scalar(0); \ + lhs_pf2 = Scalar(0); \ + lhs_pf3 = Scalar(0); \ + lhs_pf4 = Scalar(0); \ + lhs_pf5 = Scalar(0); \ + lhs_pf6 = Scalar(0); \ + lhs_pf7 = Scalar(0); \ + \ + rhs_pf0 = Scalar(0); \ + rhs_pf1 = Scalar(0); \ + rhs_pf2 = Scalar(0); \ + rhs_pf3 = Scalar(0); \ + rhs_pf4 = Scalar(0); \ + rhs_pf5 = Scalar(0); \ + rhs_pf6 = Scalar(0); \ + rhs_pf7 = Scalar(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + 
lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + 
rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
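+      // For orientation, the fully unrolled macros below compute this
+      // (non-compiled) loop-nest sketch, with each computePass(p) executing
+      // one iteration of the outer loop:
+      //
+      //   for (int p = 0; p < 8; ++p)      // little-k index within the tile
+      //     for (int i = 0; i < 8; ++i)    // m index within the tile
+      //       for (int j = 0; j < 8; ++j)  // n index within the tile
+      //         res(i, j) += lhs_element(i, p) * rhs_element(p, j);
+      //
+      // Complete unrolling keeps every operand in registers and exposes 64
+      // independent multiply-adds per pass to the scheduler.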
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. 
There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res + } + + + template +__global__ void +__launch_bounds__(512) + EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + 
EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } + } + + + + template +__device__ EIGEN_STRONG_INLINE void + EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float4* lhs_shmem4, float2* rhs_shmem2, + const Index m_size, const Index n_size, const Index k_size) { + typedef float Scalar; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + const Index lane = threadIdx.x + 8 * (threadIdx.y % 4); + + // prefetch registers + float4 lhs_pf0; + float4 lhs_pf1; + + float4 rhs_pf0; + float4 rhs_pf1; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // all of these indices assume float4 loading + // this thread loads the float4 starting at this index, and then also loads + // another float4 starting 32 columns to to the right + const Index horiz_block_idx = threadIdx.z / 2; + const Index vert_block_idx = threadIdx.x / 2 + 4 * (threadIdx.y % 2); + const Index horiz_idx_in_block = threadIdx.y / 2 + 4 * (threadIdx.z % 2); + const Index vert_idx_in_block = threadIdx.x % 2; + + // there's padding in both the LHS and RHS shared memory layouts. This padding + // allows for 0 bank conflicts on all shmem stores and loads. + // LHS padding: 1 float4 on each 8x8 block of floats + // RHS padding: 1 float2 on each block, and 12 additional float2s between vertical blocks + // 3 and 4 + + // storage indices + // lhs index with respect to float4s + const Index lhs_store_idx_base = + 136 * horiz_block_idx + + 17 * vert_block_idx + + 8 * vert_idx_in_block + + horiz_idx_in_block; + + // rhs index with respect to floats + const Index rhs_store_idx_base = + 552 * horiz_block_idx + + 66 * vert_block_idx + + 32 * (horiz_idx_in_block / 4) + (horiz_idx_in_block % 4) + + 16 * vert_idx_in_block + + ((vert_block_idx < 4) ? 0 : 24); + + const Index lhs_store_idx_0 = lhs_store_idx_base + 544 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 544 * 1; + + const Index rhs_store_idx_0 = (rhs_store_idx_base / 2) + ((lane < 16) ? 0 : 4); + const Index rhs_store_idx_1 = rhs_store_idx_0 + 2; + const Index rhs_store_idx_2 = rhs_store_idx_0 + 1104; + const Index rhs_store_idx_3 = rhs_store_idx_1 + 1104; + + // The below diagrams show which shmem index (with respect to floats) each element + // in an 8x8 input block gets packed into: + // LHS: + // 0 4 8 12 16 20 24 28 + // 1 5 9 13 17 21 25 29 + // 2 6 10 14 18 22 26 30 + // 3 7 11 15 19 23 27 31 + // 32 36 40 44 48 52 56 60 + // ... (pack as 2 rows of float4 indexed row major, each float4 is vertical) + // + // RHS: + // 0 1 2 3 32 33 34 35 + // 4 5 6 7 36 37 38 39 + // ... (pack as 2 cols of float4 indexed col major, each float4 is horizontal) + + // Each thread in a warp loads 2 float4s. This happens in 2 instructions. On each of these + // instruction, the warp loads 2 columns (2 cols * 64 elements / col = 128 elements = 32 threads + // * 4 elements/thread). 
For the LHS, we're able to store the loaded float4 directly into + // shmem (using a 128 bit store instruction). For the RHS, we need to transpose the data. + // This is done with warp shuffles. Furthermore, we only use 64 bit stores for the RHS, because + // 64 bits is only 2 columns (which is all we load in a warp), and the padding for the RHS + // doesn't meet 64 bit alignment requirements (namely, the 4 consecutive floats that we want + // to load on the RHS are 8 byte aligned, not 16 byte aligned, which is required for float4). + + const Index load_idx_vert = 4 * (threadIdx.x + 8 * (threadIdx.y % 2)); + const Index load_idx_horiz = (threadIdx.y / 2) + 4 * threadIdx.z; + + const Index lhs_vert = base_m + load_idx_vert; + const Index rhs_horiz_0 = base_n + load_idx_horiz; + const Index rhs_horiz_1 = base_n + load_idx_horiz + 32; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = internal::pset1(0); \ + lhs_pf1 = internal::pset1(0); \ + \ + rhs_pf0 = internal::pset1(0); \ + rhs_pf1 = internal::pset1(0); \ + \ + const Index lhs_horiz_0 = base_k + load_idx_horiz; \ + const Index lhs_horiz_1 = base_k + load_idx_horiz + 32; \ + if (!needs_edge_check || lhs_vert + 3 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs.loadPacket(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ + } \ + } else if (lhs_vert + 2 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ + \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ + lhs_pf1.z = lhs(lhs_vert + 2, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ + } \ + } else if (lhs_vert + 1 < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ + } \ + } else if (lhs_vert < m_size) { \ + if (lhs_horiz_1 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ + } \ +} \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (rhs_vert + 3 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ + } \ + } else if (rhs_vert + 2 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ + \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + rhs_pf0.z = 
rhs(rhs_vert + 2, rhs_horiz_0); \ + } \ + } else if (rhs_vert + 1 < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ + } \ + } else if (rhs_vert < k_size) { \ + if (!needs_edge_check || rhs_horiz_1 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ + } \ +} \ + \ + float swap_val0 = (lane < 16) ? rhs_pf0.z : rhs_pf0.x; \ + float swap_val1 = (lane < 16) ? rhs_pf0.w : rhs_pf0.y; \ + float swap_val2 = (lane < 16) ? rhs_pf1.z : rhs_pf1.x; \ + float swap_val3 = (lane < 16) ? rhs_pf1.w : rhs_pf1.y; \ + \ + swap_val0 = __shfl_xor(swap_val0, 16); \ + swap_val1 = __shfl_xor(swap_val1, 16); \ + swap_val2 = __shfl_xor(swap_val2, 16); \ + swap_val3 = __shfl_xor(swap_val3, 16); \ + \ + if (lane < 16) { \ + rhs_pf0.z = swap_val0; \ + rhs_pf0.w = swap_val1; \ + rhs_pf1.z = swap_val2; \ + rhs_pf1.w = swap_val3; \ + } else { \ + rhs_pf0.x = swap_val0; \ + rhs_pf0.y = swap_val1; \ + rhs_pf1.x = swap_val2; \ + rhs_pf1.y = swap_val3; \ + } \ +} \ + + +#define writeRegToShmem(_) \ + lhs_shmem4[lhs_store_idx_0] = lhs_pf0; \ + \ + rhs_shmem2[rhs_store_idx_0] = make_float2(rhs_pf0.x, rhs_pf0.z); \ + rhs_shmem2[rhs_store_idx_1] = make_float2(rhs_pf0.y, rhs_pf0.w); \ + \ + lhs_shmem4[lhs_store_idx_1] = lhs_pf1; \ + \ + rhs_shmem2[rhs_store_idx_2] = make_float2(rhs_pf1.x, rhs_pf1.z); \ + rhs_shmem2[rhs_store_idx_3] = make_float2(rhs_pf1.y, rhs_pf1.w); \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = Scalar(0); \ + Scalar res(i, 1) = Scalar(0); \ + Scalar res(i, 2) = Scalar(0); \ + Scalar res(i, 3) = Scalar(0); \ + Scalar res(i, 4) = Scalar(0); \ + Scalar res(i, 5) = Scalar(0); \ + Scalar res(i, 6) = Scalar(0); \ + Scalar res(i, 7) = Scalar(0); \ + + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + +#undef prefetchIntoRegisters +#undef writeRegoToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
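+      // Register layout in this vectorized variant (for orientation): the 8
+      // LHS values of a pass live in two float4 registers (_lcol0, _lcol1)
+      // and the 8 RHS values in four float2 registers (_rrow0 .. _rrow3);
+      // the lcolN / rrowN macros below alias their components so the same
+      // computePass / computeCol pattern as in the generic kernel applies
+      // unchanged.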
+ + float4 _lcol0; + float4 _lcol1; + float2 _rrow0; + float2 _rrow1; + float2 _rrow2; + float2 _rrow3; + +#define lcol0 _lcol0.x +#define lcol1 _lcol0.y +#define lcol2 _lcol0.z +#define lcol3 _lcol0.w +#define lcol4 _lcol1.x +#define lcol5 _lcol1.y +#define lcol6 _lcol1.z +#define lcol7 _lcol1.w +#define rrow0 _rrow0.x +#define rrow1 _rrow0.y +#define rrow2 _rrow1.x +#define rrow3 _rrow1.y +#define rrow4 _rrow2.x +#define rrow5 _rrow2.y +#define rrow6 _rrow3.x +#define rrow7 _rrow3.y + + // Now x corresponds to k, y to m, and z to n + const float4* lhs_block = &lhs_shmem4[threadIdx.x + 8 * (threadIdx.y % 2) + 17 * (threadIdx.y / 2)]; + const float2* rhs_block = &rhs_shmem2[2 * threadIdx.x + 16 * (threadIdx.z % 2) + 276 * (threadIdx.z / 2)]; + +#define lhs_element(i, k) lhs_block[68 * i + 136 * k] +#define rhs_element(k, j) rhs_block[33 * k + 1104 * j + ((k < 4) ? 0 : 12)] + +#define loadData(i) \ + _lcol0 = lhs_element(0, i); \ + _rrow0 = rhs_element(i, 0); \ + _rrow1 = *(&(rhs_element(i, 0)) + 1); \ + _lcol1 = lhs_element(1, i); \ + _rrow2 = rhs_element(i, 1); \ + _rrow3 = *(&(rhs_element(i, 1)) + 1); \ + +#define computeCol(j) \ + res(0, j) += lcol0 * rrow##j; \ + res(1, j) += lcol1 * rrow##j; \ + res(2, j) += lcol2 * rrow##j; \ + res(3, j) += lcol3 * rrow##j; \ + res(4, j) += lcol4 * rrow##j; \ + res(5, j) += lcol5 * rrow##j; \ + res(6, j) += lcol6 * rrow##j; \ + res(7, j) += lcol7 * rrow##j; \ + +#define computePass(i) \ + loadData(i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol0 +#undef lcol1 +#undef lcol2 +#undef lcol3 +#undef lcol4 +#undef lcol5 +#undef lcol6 +#undef lcol7 +#undef rrow0 +#undef rrow1 +#undef rrow2 +#undef rrow3 +#undef rrow4 +#undef rrow5 +#undef rrow6 +#undef rrow7 + +#undef computePass +#undef computeCol +#undef loadData +#undef lhs_element +#undef rhs_element + + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. 
There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (3) Copies the values into new registers using conditional logic. + +#define makeAssignments(i) \ + val0 = res(i, 0); \ + val1 = res(i, 1); \ + val2 = res(i, 2); \ + val3 = res(i, 3); \ + val4 = res(i, 4); \ + val5 = res(i, 5); \ + val6 = res(i, 6); \ + val7 = res(i, 7); \ + + Scalar val0; + Scalar val1; + Scalar val2; + Scalar val3; + Scalar val4; + Scalar val5; + Scalar val6; + Scalar val7; + + switch (threadIdx.x) { + case 0: + makeAssignments(0); + break; + case 1: + makeAssignments(1); + break; + case 2: + makeAssignments(2); + break; + case 3: + makeAssignments(3); + break; + case 4: + makeAssignments(4); + break; + case 5: + makeAssignments(5); + break; + case 6: + makeAssignments(6); + break; + case 7: + makeAssignments(7); + break; + } + +#undef res + + const Index vert_base = base_m + 4 * threadIdx.y + (threadIdx.x % 4) + 32 * (threadIdx.x / 4); + const Index horiz_base = base_n + 4 * threadIdx.z; + + if (!needs_edge_check || vert_base < m_size) { + if (!needs_edge_check || horiz_base + 35 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + output(vert_base, horiz_base + 34) = val6; + output(vert_base, horiz_base + 35) = val7; + } else if (horiz_base + 34 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + output(vert_base, horiz_base + 34) = val6; + } else if (horiz_base + 33 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + output(vert_base, horiz_base + 33) = val5; + } else if (horiz_base + 32 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + output(vert_base, horiz_base + 32) = val4; + } else if (horiz_base + 3 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + output(vert_base, horiz_base + 3) = val3; + } else if (horiz_base + 2 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + output(vert_base, horiz_base + 2) = val2; + } else if (horiz_base + 1 < n_size) { + output(vert_base, horiz_base + 0) = val0; + output(vert_base, horiz_base + 1) = val1; + } else if (horiz_base < n_size) { + output(vert_base, horiz_base + 0) = val0; + } + } + } + + + template +__global__ void + __launch_bounds__(512) + EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float4 lhs_shmem[(68 * 64) / 4]; + __shared__ float2 rhs_shmem[((66 * 8 + 24) * 8) / 2]; + + const Index 
m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } + } + + + template + struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + 
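+    // Launch-geometry sketch (the concrete sizes are illustrative): each
+    // CUDA block computes one 64x64 tile of the m-by-n output with
+    // 8 * 8 * 8 = 512 threads. For m = 100, n = 130 this launches
+    // ((100 + 63) / 64) * ((130 + 63) / 64) = 2 * 3 = 6 blocks; interior
+    // tiles take the unchecked kernel path inside the kernel, edge tiles
+    // the bounds-checked one.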
+ + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same::value && + internal::is_same::value) { + EigenFloatContractionKernel + <<m_device.stream()>>>(lhs, rhs, output, m, n, k); + } else { + EigenContractionKernel + <<m_device.stream()>>>(lhs, rhs, output, m, n, k); + } + + assert(cudaGetLastError() == cudaSuccess); + } + }; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index ef5e11537..fad342eab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -104,19 +104,41 @@ struct GpuDevice { EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } - /*EIGEN_DEVICE_FUNC*/ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { +#ifndef __CUDA_ARCH__ void* result; - cudaMalloc(&result, num_bytes); + assert(cudaMalloc(&result, num_bytes) == cudaSuccess); + assert(result != NULL); return result; +#else + assert(false && "The default device should be used instead to generate kernel code"); + return NULL; +#endif } - /*EIGEN_DEVICE_FUNC */EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - cudaFree(buffer); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { +#ifndef __CUDA_ARCH__ + assert(buffer != NULL); + assert(cudaFree(buffer) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - cudaMemsetAsync(buffer, c, n, *stream_); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + assert(cudaMemsetAsync(buffer, c, n, *stream_) == cudaSuccess); +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif } EIGEN_STRONG_INLINE size_t numThreads() const { From 152f3218ac9b6941cf6dbc960c2d4a6d1099eb06 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:33:44 -0700 Subject: [PATCH 065/214] Improved contraction test --- unsupported/test/cxx11_tensor_contraction.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp 
index a37fcd967..2b599d30d 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -201,6 +201,37 @@ static void test_full_redux() } +static void test_contraction_of_contraction() +{ + Tensor t1(2, 2); + Tensor t2(2, 2); + Tensor t3(2, 2); + Tensor t4(2, 2); + t1.setRandom(); + t2.setRandom(); + t3.setRandom(); + t4.setRandom(); + + Eigen::array dims({{DimPair(1, 0)}}); + auto contract1 = t1.contract(t2, dims); + auto diff = t3 - contract1; + auto contract2 = t1.contract(t4, dims); + Tensor result = contract2.contract(diff, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 2); + + Eigen::Map m1(t1.data(), 2, 2); + Eigen::Map m2(t2.data(), 2, 2); + Eigen::Map m3(t3.data(), 2, 2); + Eigen::Map m4(t4.data(), 2, 2); + Eigen::MatrixXf expected = (m1 * m4) * (m3 - m1 * m2); + VERIFY_IS_APPROX(result(0, 0), expected(0, 0)); + VERIFY_IS_APPROX(result(0, 1), expected(0, 1)); + VERIFY_IS_APPROX(result(1, 0), expected(1, 0)); + VERIFY_IS_APPROX(result(1, 1), expected(1, 1)); +} + + static void test_expr() { Tensor mat1(2, 3); @@ -328,6 +359,7 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_multidims()); CALL_SUBTEST(test_holes()); CALL_SUBTEST(test_full_redux()); + CALL_SUBTEST(test_contraction_of_contraction()); CALL_SUBTEST(test_expr()); CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); From 95a430a2ca8489a85d0a12ffa66d260011c11745 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:45:19 -0700 Subject: [PATCH 066/214] Vector primitives for CUDA --- Eigen/Core | 5 + Eigen/src/Core/arch/CUDA/MathFunctions.h | 75 +++++++ Eigen/src/Core/arch/CUDA/PacketMath.h | 260 +++++++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 Eigen/src/Core/arch/CUDA/MathFunctions.h create mode 100644 Eigen/src/Core/arch/CUDA/PacketMath.h diff --git a/Eigen/Core b/Eigen/Core index 776b7faf3..537ac16b2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -294,6 +294,11 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" +#endif + #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/BinaryFunctors.h" diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 000000000..e7305c01e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,75 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
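+// Each specialization below maps the scalar CUDA math function over the
+// packet components, e.g. plog on a float4 {a, b, c, d} yields
+// {logf(a), logf(b), logf(c), logf(d)}. A minimal usage sketch
+// (hypothetical device-side caller, assuming the GPU float4 packet path):
+//   float4 v = internal::pset1<float4>(1.0f);
+//   float4 l = internal::plog<float4>(v);  // every component == 0.0f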
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_STRONG_INLINE +float4 plog(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 plog(const double2& a) +{ + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 pexp(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 pexp(const double2& a) +{ + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 psqrt(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 psqrt(const double2& a) +{ + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_STRONG_INLINE +float4 prsqrt(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_STRONG_INLINE +double2 prsqrt(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..5b0abe2e6 --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
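+// A quick sketch of how generic Eigen code picks these definitions up
+// (hypothetical device-side caller):
+//   typedef internal::packet_traits<float>::type Packet;  // float4 here
+//   Packet a = internal::pload<Packet>(from);    // 128-bit aligned load
+//   internal::pstore(to, internal::padd(a, a));  // componentwise a + a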
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +#ifdef __CUDA_ARCH__ +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return __ldg((const float4*)from); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return __ldg((const double2*)from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { + return make_double2(__ldg(from+0), __ldg(from+1)); +} +#endif + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, int stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, int stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, int stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + 
+ tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +template<> EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H From bbce6fa65d8a196f05e0428d014e0e3865e202f3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 19:55:35 -0700 Subject: [PATCH 067/214] define EIGEN_VECTORIZE_CUDA when compiling with nvcc --- Eigen/Core | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index 537ac16b2..acdeca5f4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -178,6 +178,11 @@ #endif #endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif From 6c047d398daba5784da35d3b502360a5a7a83f33 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 13:29:36 -0700 Subject: [PATCH 068/214] Fixed a comment --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index d42167da9..4d7f9e1fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -18,7 +18,7 @@ namespace Eigen { * \brief The fixed sized version of the tensor class. * * The fixes sized equivalent of - * Eigen::Tensor t(3, 5, 7); + * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; */ From 0a07ac574ead83d314d518127d8d69595f6212b2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 13:32:41 -0700 Subject: [PATCH 069/214] Added support for the *= and /* operators to TensorBase --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 2f7c9ecda..90a9cc2c4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -307,11 +307,18 @@ class TensorBase : public TensorBase, const Derived, const OtherDerived>(derived(), other.derived()); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const OtherDerived& other) { return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp From 44beee9d68e13dc299c6e2ea321aedc74c23d039 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 8 Oct 2014 14:14:20 -0700 Subject: [PATCH 070/214] Removed dead code --- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 417717b90..04849dd9f 100644 --- 
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -12,9 +12,6 @@
 
 namespace Eigen {
 
-template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime> class Stride;
-
-
 /** \class TensorMap
  * \ingroup CXX11_Tensor_Module
  *

From 767424af18a55604496f38dd4593542db97240a1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 9 Oct 2014 15:36:23 -0700
Subject: [PATCH 071/214] Improved the functors defined for standard reductions
 Added a functor to encapsulate the generation of random numbers on cpu and
 gpu.

---
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   | 72 +++++++++++++++++--
 unsupported/test/cxx11_tensor_reduction.cpp   | 33 +++++++++
 2 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 92984336c..e9aa22183 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,12 +25,12 @@ template <typename T> struct SumReducer
   }
 
  private:
-  T m_sum;
+  typename internal::remove_all<T>::type m_sum;
 };
 
 template <typename T> struct MaxReducer
 {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max((std::numeric_limits<T>::min)()) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max(-(std::numeric_limits<T>::max)()) { }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) {
     if (t > m_max) { m_max = t; }
   }
@@ -39,7 +39,7 @@ template <typename T> struct MaxReducer
   }
 
  private:
-  T m_max;
+  typename internal::remove_all<T>::type m_max;
 };
 
 template <typename T> struct MinReducer
@@ -53,9 +53,73 @@ template <typename T> struct MinReducer
   }
 
  private:
-  T m_min;
+  typename internal::remove_all<T>::type m_min;
 };
 
+
+#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
+// We're not compiling a cuda kernel
+template <typename T> struct UniformRandomGenerator {
+  template<typename Index>
+  T operator()(Index, Index = 0) const {
+    return random<T>();
+  }
+  template<typename Index>
+  typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const {
+    const int packetSize = internal::packet_traits<T>::size;
+    EIGEN_ALIGN_DEFAULT T values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = random<T>();
+    }
+    return internal::pload<typename internal::packet_traits<T>::type>(values);
+  }
+};
+
+#else
+
+// We're compiling a cuda kernel
+template <typename T> struct UniformRandomGenerator;
+
+template <> struct UniformRandomGenerator<float> {
+  UniformRandomGenerator() {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    curand_init(0, tid, 0, &m_state);
+  }
+
+  template<typename Index>
+  float operator()(Index, Index = 0) const {
+    return curand_uniform(&m_state);
+  }
+  template<typename Index>
+  float4 packetOp(Index, Index = 0) const {
+    return curand_uniform4(&m_state);
+  }
+
+ private:
+  mutable curandStatePhilox4_32_10_t m_state;
+};
+
+template <> struct UniformRandomGenerator<double> {
+  UniformRandomGenerator() {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    curand_init(0, tid, 0, &m_state);
+  }
+  template<typename Index>
+  double operator()(Index, Index = 0) const {
+    return curand_uniform_double(&m_state);
+  }
+  template<typename Index>
+  double2 packetOp(Index, Index = 0) const {
+    return curand_uniform2_double(&m_state);
+  }
+
+ private:
+  mutable curandStatePhilox4_32_10_t m_state;
+};
+
+#endif
+
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 27135b982..da9885166 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -139,9 +139,42 @@ static void test_user_defined_reductions()
 }
 
 
+static void test_tensor_maps()
+{
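+  // The same raw buffer is viewed through a mutable, a const-element, and a
+  // fully const TensorMap; all three must produce identical reduction results.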
+ int inputs[2*3*5*7]; + TensorMap > tensor_map(inputs, 2,3,5,7); + TensorMap > tensor_map_const(inputs, 2,3,5,7); + const TensorMap > tensor_map_const_const(inputs, 2,3,5,7); + + tensor_map.setRandom(); + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + Tensor result = tensor_map.sum(reduction_axis); + Tensor result2 = tensor_map_const.sum(reduction_axis); + Tensor result3 = tensor_map_const_const.sum(reduction_axis); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + int sum = 0; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor_map(i, k, j, l); + } + } + VERIFY_IS_EQUAL(result(i, j), sum); + VERIFY_IS_EQUAL(result2(i, j), sum); + VERIFY_IS_EQUAL(result3(i, j), sum); + } + } +} + + void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_simple_reductions()); CALL_SUBTEST(test_full_reductions()); CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_tensor_maps()); } From 498b7eed25bdb3b90f2fc45dd822c96aa08db2f8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Oct 2014 15:39:13 -0700 Subject: [PATCH 072/214] Rewrote the TensorBase::random method to support the generation of random number on gpu. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 90a9cc2c4..d4b7846a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -39,9 +39,14 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(derived()); + return TensorCwiseNullaryOp, const Derived>(derived()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random() const { + return TensorCwiseNullaryOp(derived()); } // Coefficient-wise unary operators From a991f94c0e5c51555875564ce58681a82d07cd69 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 15:20:37 -0700 Subject: [PATCH 073/214] Fixed the thread pool test --- test/main.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/main.h b/test/main.h index b504970f3..9cb41c828 100644 --- a/test/main.h +++ b/test/main.h @@ -47,8 +47,8 @@ // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
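+// NOTE: the C++11 tensor tests pull in standard headers such as <future>
+// after this point, and those headers do not survive min/max being
+// macroized, so the two defines are disabled below.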
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses +//#define min(A,B) please_protect_your_min_with_parentheses +//#define max(A,B) please_protect_your_max_with_parentheses #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index faf965df8..84768ca09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -131,7 +131,7 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - vector > results; + std::vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 75423f516..1c4d0838a 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,5 +122,5 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") -# ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") + ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2e67b2064..e02d8e4be 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -17,9 +17,9 @@ using Eigen::Tensor; void test_cxx11_tensor_thread_pool() { - Eigen::Tensor in1(Eigen::array(2,3,7)); - Eigen::Tensor in2(Eigen::array(2,3,7)); - Eigen::Tensor out(Eigen::array(2,3,7)); + Eigen::Tensor in1(2,3,7); + Eigen::Tensor in2(2,3,7); + Eigen::Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); @@ -30,7 +30,7 @@ void test_cxx11_tensor_thread_pool() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); } } } From 4b36c3591f247d4be38e5a12dbed7ac0d1ad2bff Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 15:43:21 -0700 Subject: [PATCH 074/214] Fixed the tensor shuffling test --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 133 +++++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 8 +- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 2 +- unsupported/test/cxx11_tensor_shuffling.cpp | 9 +- 5 files changed, 141 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 11590b474..732c6b344 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -37,8 +37,7 @@ template struct IndexPair { Index second; }; - -// Boiler plate code +// Boilerplate code namespace internal { template struct dget { @@ -110,6 +109,11 @@ struct Sizes : internal::numeric_list { } }; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +} + #else template @@ -136,9 +140,21 @@ template 
Sizes(DenseIndex... indices) { } explicit Sizes(std::initializer_list l) { // todo: add assertion } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + } + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + } #endif template Sizes& operator = (const T& other) { @@ -156,9 +172,14 @@ template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { + return Sizes::total_size; +}; + #endif -// Boiler plate +// Boilerplate namespace internal { template struct tensor_index_linearization_helper @@ -243,6 +264,112 @@ struct DSizes : array { }; + + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_vsize_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const&) + { + return array_get(indices); + } +}; +} // end namespace internal + +template +struct VSizes : std::vector { + typedef std::vector Base; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { + return internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC VSizes() { } + EIGEN_DEVICE_FUNC explicit VSizes(const std::vector& a) : Base(a) { } + + template + EIGEN_DEVICE_FUNC explicit VSizes(const array& a) { + this->resize(NumDims); + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { + this->resize(1); + (*this)[0] = i0; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { + this->resize(2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + this->resize(3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + this->resize(4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + this->resize(5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } + + VSizes& operator = (const std::vector& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { + return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { + return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + + +// Boilerplate +namespace internal { +template 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes& sizes) { + DenseIndex total_size = 1; + for (int i = 0; i < sizes.size(); ++i) { + total_size *= sizes[i]; + } + return total_size; +} +} + namespace internal { template struct array_size > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 84768ca09..10f5a5ee7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -39,7 +39,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); } @@ -60,7 +60,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; @@ -122,7 +122,7 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = Vectorizable ? unpacket_traits::size : 1; @@ -176,7 +176,7 @@ class TensorExecutor const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); const int block_size = maxCudaThreadsPerBlock(); - const Index size = evaluator.dimensions().TotalSize(); + const Index size = array_prod(evaluator.dimensions()); EigenMetaKernel > <<>>(evaluator, size); assert(cudaGetLastError() == cudaSuccess); } diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 1c4d0838a..ac2ccaf27 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -119,7 +119,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") -# ei_add_test(cxx11_tensor_shuffling "-std=c++0x") + ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index d270486f2..b0501aaa3 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -179,7 +179,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(mat3(array(i,j,k)), powf(val, 3.5f)); + VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f)); val += 1.0; } } diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 5ab8b6821..39c623499 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::array; static void test_simple_shuffling() { @@ -80,10 +81,10 @@ static void test_expr_shuffling() Tensor result(5,7,3,2); - array src_slice_dim(Eigen::array(2,3,1,7)); - array 
src_slice_start(Eigen::array(0,0,0,0)); - array dst_slice_dim(Eigen::array(1,7,3,2)); - array dst_slice_start(Eigen::array(0,0,0,0)); + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{1,7,3,2}}; + array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From 2ed1838aeb6d3c70c35dbd8d545fba1e7e1c68dc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 16:11:27 -0700 Subject: [PATCH 075/214] Added support for tensor chips --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 232 +++++++++++++++++ .../src/Tensor/TensorForwardDeclarations.h | 3 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_chipping.cpp | 244 ++++++++++++++++++ 6 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h create mode 100644 unsupported/test/cxx11_tensor_chipping.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index b1bd2f676..5a6246a03 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -47,6 +47,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index d4b7846a0..cadeb3b19 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -254,6 +254,11 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorPaddingOp pad(const PaddingDimensions& padding) const { @@ -327,7 +332,7 @@ class TensorBase : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp - reshape(const NewDimensions& newDimensions) { + reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -335,6 +340,11 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 000000000..9ecea9108 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorKChippingReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. + * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorChippingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorChippingOp type; +}; + +} // end namespace internal + + + +template +class TensorChippingOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset) + : m_xpr(expr), m_offset(offset) {} + + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + PacketAccess = false, // not yet implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device) + { + // We could also support the case where NumInputDims==1 if needed. 
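+    // A chip removes exactly one dimension, so the input must have at least
+    // two of them and the chipped dimension (DimId) must actually exist.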
+    EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(NumInputDims > DimId, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    int j = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (i != DimId) {
+        m_dimensions[j] = input_dims[i];
+        ++j;
+      }
+    }
+
+    m_stride = 1;
+    m_inputStride = 1;
+    for (int i = 0; i < DimId; ++i) {
+      m_stride *= input_dims[i];
+      m_inputStride *= input_dims[i];
+    }
+    m_inputStride *= input_dims[DimId];
+    m_inputOffset = m_stride * op.offset();
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  /* to be done
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+
+  }*/
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+  {
+    Index inputIndex;
+    if (DimId == 0) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(m_stride == 1);
+      inputIndex = index * m_inputStride + m_inputOffset;
+    } else if (DimId == NumInputDims-1) {
+      // m_stride is always greater than index, so let's avoid the integer division.
+ eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const Device& m_device; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + /* to be done + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + } */ +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index bc67586a4..86ddd1ae8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,11 +21,12 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; -template class TensorBroadcastingOp; template class TensorReductionOp; template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; +template class TensorBroadcastingOp; +template class TensorChippingOp; template class TensorReshapingOp; template class TensorSlicingOp; template class TensorPaddingOp; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index ac2ccaf27..48435eb9c 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -115,6 +115,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_lvalue "-std=c++0x") ei_add_test(cxx11_tensor_map "-std=c++0x") ei_add_test(cxx11_tensor_broadcasting "-std=c++0x") + ei_add_test(cxx11_tensor_chipping "-std=c++0x") ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp new file mode 100644 index 000000000..8c8a0cec2 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -0,0 +1,244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
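+//
+// Covers TensorChippingOp: reading chips through chip<Dim>(offset), using
+// chips inside larger expressions, and assigning to chips of an lvalue
+// tensor.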
+ +#include "main.h" + +#include + +using Eigen::Tensor; + + +static void test_simple_chip() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + + Tensor chip1; + chip1 = tensor.chip<0>(1); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor chip2 = tensor.chip<1>(1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor chip3 = tensor.chip<2>(2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor chip4(tensor.chip<3>(5)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor chip5(tensor.chip<4>(7)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + + +static void test_chip_in_expr() { + Tensor input1(2,3,5,7,11); + input1.setRandom(); + Tensor input2(3,5,7,11); + input2.setRandom(); + + Tensor result = input1.chip<0>(0) + input2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + float expected = input1(0,i,j,k,l) + input2(i,j,k,l); + VERIFY_IS_EQUAL(result(i,j,k,l), expected); + } + } + } + } + + Tensor input3(3,7,11); + input3.setRandom(); + Tensor result2 = input1.chip<0>(0).chip<1>(2) + input3; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 11; ++k) { + float expected = input1(0,i,2,j,k) + input3(i,j,k); + VERIFY_IS_EQUAL(result2(i,j,k), expected); + } + } + } +} + + +static void test_chip_as_lvalue() +{ + Tensor input1(2,3,5,7,11); + input1.setRandom(); + + Tensor input2(3,5,7,11); + input2.setRandom(); + Tensor tensor = input1; + tensor.chip<0>(1) = input2; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + Tensor input3(2,5,7,11); + input3.setRandom(); + tensor = input1; + tensor.chip<1>(1) = 
input3; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + Tensor input4(2,3,7,11); + input4.setRandom(); + tensor = input1; + tensor.chip<2>(3) = input4; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); + } + } + } + } + } + } + + Tensor input5(2,3,5,11); + input5.setRandom(); + tensor = input1; + tensor.chip<3>(4) = input5; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + + Tensor input6(2,3,5,7); + input6.setRandom(); + tensor = input1; + tensor.chip<4>(5) = input6; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } +} + + +void test_cxx11_tensor_chipping() +{ + CALL_SUBTEST(test_simple_chip()); + CALL_SUBTEST(test_chip_in_expr()); + CALL_SUBTEST(test_chip_as_lvalue()); +} From 0219f8aed44279858330b1c07402c066f5b75459 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Oct 2014 16:17:26 -0700 Subject: [PATCH 076/214] Added ability to print a tensor using an iostream. --- unsupported/Eigen/CXX11/Tensor | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 44 ++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_io.cpp | 70 +++++++++++++++++++ 4 files changed, 117 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIO.h create mode 100644 unsupported/test/cxx11_tensor_io.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 5a6246a03..79510fd96 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -64,6 +64,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + #include "Eigen/src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 000000000..959b5db73 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
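+//
+// Minimal usage sketch (mirroring the cxx11_tensor_io test added below):
+// once this header is included, any tensor expression can be streamed, e.g.
+//   Eigen::Tensor<float, 2> t(5, 3);
+//   t.setRandom();
+//   std::cout << t << std::endl;      // printed as a 5x3 matrix
+//   std::cout << t + t << std::endl;  // expressions are force-evaluated first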
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +template +std::ostream& operator << (std::ostream& os, const TensorBase& expr) { + // Evaluate the expression if needed + TensorForcedEvalOp eval = expr.eval(); + TensorEvaluator, DefaultDevice> tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + typedef typename T::Scalar Scalar; + typedef typename T::Index Index; + typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; + const Index total_size = internal::array_prod(tensor.dimensions()); + + // Print the tensor as a 1d vector or a 2d matrix. + if (internal::array_size::value == 1) { + Map > array(tensor.data(), total_size); + os << array; + } else { + const Index first_dim = tensor.dimensions()[0]; + Map > matrix(tensor.data(), first_dim, total_size/first_dim); + os << matrix; + } + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 48435eb9c..99593b562 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -124,4 +124,5 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding "-std=c++0x") # ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") + ei_add_test(cxx11_tensor_io "-std=c++0x") endif() diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp new file mode 100644 index 000000000..b73c024f5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -0,0 +1,70 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
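+//
+// Checks the exact text produced for a 1-d tensor, a 2-d tensor, and an
+// unevaluated expression (which operator<< must evaluate before printing).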
+ +#include "main.h" +#include +#include +#include + + +static void test_output_1d() +{ + Tensor tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + std::stringstream os; + os << tensor; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +static void test_output_2d() +{ + Tensor tensor(5, 3); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 3; ++j) { + tensor(i, j) = i*j; + } + } + + std::stringstream os; + os << tensor; + + std::string expected("0 0 0\n0 1 2\n0 2 4\n0 3 6\n0 4 8"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +static void test_output_expr() +{ + Tensor tensor1(5); + Tensor tensor2(5); + for (int i = 0; i < 5; ++i) { + tensor1(i) = i; + tensor2(i) = 7; + } + + std::stringstream os; + os << tensor1 + tensor2; + + std::string expected(" 7\n 8\n 9\n10\n11"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +void test_cxx11_tensor_io() +{ + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_expr()); +} From 4c70b0a7627d45286ecbb3c73d2d774412168205 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 10:04:04 -0700 Subject: [PATCH 077/214] Added support for patch extraction --- unsupported/Eigen/CXX11/Tensor | 7 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 6 + .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorPatch.h | 212 ++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_patch.cpp | 103 +++++++++ 6 files changed, 330 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h create mode 100644 unsupported/test/cxx11_tensor_patch.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 79510fd96..0dac95e45 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -27,6 +28,11 @@ #include #include +#include + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#include +#endif #include "Eigen/Core" @@ -46,6 +52,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index cadeb3b19..27c10f64f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -243,6 +243,12 @@ class TensorBase return TensorConcatenationOp(derived(), other.derived(), axis); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + // Morphing operators. 
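+//
+// Minimal usage sketch (mirroring the cxx11_tensor_patch test below): every
+// possible patch of the requested size is extracted, and one extra trailing
+// dimension indexes the patches, e.g.
+//   Eigen::Tensor<float, 4> input(2, 3, 5, 7);
+//   Eigen::array<ptrdiff_t, 4> patch_dims;
+//   patch_dims[0] = 1; patch_dims[1] = 2; patch_dims[2] = 2; patch_dims[3] = 1;
+//   Eigen::Tensor<float, 5> patches = input.extract_patches(patch_dims);
+//   // patches.dimension(4) == 2*2*4*7 overlapping 1x2x2x1 patches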
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 86ddd1ae8..67f478822 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,6 +25,7 @@ template class TensorReductionOp; template class TensorConcatenationOp; template class TensorContractionOp; template class TensorConvolutionOp; +template class TensorPatchOp; template class TensorBroadcastingOp; template class TensorChippingOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 000000000..01f2daf52 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPatchOp type; +}; + +} // end namespace internal + + + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + 
{ + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[NumDims - 1]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1]; + + Index inputIndex = 0; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index indices[2] = {index, index + packetSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1], + indices[1] / m_outputStrides[NumDims - 1]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1], + indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]}; + + Index inputIndices[2] = {0, 0}; + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + 
values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 99593b562..d6c435947 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -119,6 +119,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_concatenation "-std=c++0x") ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") + ei_add_test(cxx11_tensor_patch "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp new file mode 100644 index 000000000..e2ba5bfd8 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + Tensor no_patch; + no_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + Tensor twod_patch; + twod_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + Tensor threed_patch; + threed_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 
3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } + } + } + } + } + } + } +} + + +void test_cxx11_tensor_patch() +{ + CALL_SUBTEST(test_simple_patch()); + // CALL_SUBTEST(test_expr_shuffling()); +} From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 13 Oct 2014 17:02:09 -0700 Subject: [PATCH 078/214] Misc improvements and cleanups --- Eigen/src/Core/GenericPacketMath.h | 15 +- unsupported/Eigen/CXX11/Tensor | 4 + .../CXX11/src/Core/util/CXX11Workarounds.h | 5 + .../CXX11/src/Core/util/EmulateCXX11Meta.h | 101 +++++++- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 8 +- .../CXX11/src/Tensor/TensorBroadcasting.h | 8 +- .../CXX11/src/Tensor/TensorConvolution.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 35 +++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 73 +++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 20 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 36 ++- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 24 +- .../Eigen/CXX11/src/Tensor/TensorMap.h | 22 +- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 61 +++-- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 32 +-- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_assign.cpp | 35 ++- unsupported/test/cxx11_tensor_convolution.cpp | 70 ++++++ unsupported/test/cxx11_tensor_device.cpp | 27 ++ unsupported/test/cxx11_tensor_morphing.cpp | 5 +- unsupported/test/cxx11_tensor_of_complex.cpp | 64 +++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 232 +++++++++++++++++- 29 files changed, 779 insertions(+), 140 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_of_complex.cpp diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e6fea5bba..3ef3475c7 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -359,7 +359,7 @@ pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */ template -inline Packet ploadt(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits::type* from) { if(LoadMode == Aligned) return pload(from); @@ -370,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits::type* from) /** \internal copy the packet \a from to \a *to. * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */ template -inline void pstoret(Scalar* to, const Packet& from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) { if(LoadMode == Aligned) pstore(to, from); @@ -378,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from) pstoreu(to, from); } +/** \internal \returns a packet version of \a *from. + * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the + * hardware if available to speedup the loading of data that won't be modified + * by the current computation. 
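+ * The generic implementation below simply forwards to ploadt; targets with a
+ * dedicated read-only cache path (such as the CUDA __ldg-based
+ * specializations) provide their own overloads.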
+ */ +template +inline Packet ploadt_ro(const typename unpacket_traits::type* from) +{ + return ploadt(from); +} + /** \internal default implementation of palign() allowing partial specialization */ template struct palign_impl diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 0dac95e45..2137f4276 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -30,6 +30,10 @@ #include #include +#ifdef EIGEN_USE_THREADS +#include +#endif + #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) #include #endif diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 227522ecb..e30eb6ad8 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,11 @@ template constexpr inline T& array_ template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } +template constexpr inline T& array_get(std::vector& a) { return a[I]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } + + #undef STD_GET_ARR_HACK template struct array_size; diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 4c6b95773..e45d0a3b1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -48,7 +48,8 @@ template class array { values[2] = v3; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -56,7 +57,8 @@ template class array { values[3] = v4; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) { + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v1; values[1] = v2; @@ -64,6 +66,43 @@ template class array { values[3] = v4; values[4] = v5; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { @@ -93,9 +132,11 @@ template struct 
type_list { struct null_type { }; -template +template struct make_type_list { - typedef typename make_type_list::type tailresult; + typedef typename make_type_list::type tailresult; typedef type_list type; }; @@ -150,6 +191,23 @@ template struct gen_numeric_list_repeated { typedef typename make_type_list, type2val, type2val, type2val, type2val >::type type; }; +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val >::type type; +}; + +template struct gen_numeric_list_repeated { + typedef typename make_type_list, type2val, type2val, + type2val, type2val, type2val, + type2val, type2val >::type type; +}; + template struct get; @@ -174,6 +232,7 @@ template <> struct arg_prod { static const int value = 1; }; + template array repeat(t v) { array array; @@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l return get >::value; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) { + return arg_prod::value; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; @@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; @@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { return a[I]; } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector& a) { + return a[I]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector& a) { + return a[I]; +} +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; template struct array_size; template struct array_size > { static const size_t value = N; }; - +template struct array_size; +template struct array_size& > { + static const size_t value = N; +}; struct sum_op { template static inline bool run(A a, B b) { return a + b; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 3bfe80c9e..e973c00d3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -131,8 +131,8 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 27c10f64f..6018ecc66 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -30,6 +30,12 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + // Dimensions + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -187,7 +193,7 @@ class TensorBase } // Contractions. - typedef std::pair DimensionPair; + typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorContractionOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 3b2a9c8b9..0e55d4de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -48,7 +48,7 @@ struct nested, 1, typename eval -class TensorBroadcastingOp : public TensorBase, WriteAccessors> +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -91,7 +91,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess, }; -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -141,7 +141,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -161,7 +161,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const D if (innermostLoc + packetSize <= m_impl.dimensions()[0]) { return m_impl.template packet(inputIndex); } else { - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { values[i] = coeff(originalIndex+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 4a5fd9c79..34bdd5309 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -872,11 +872,19 @@ struct TensorEvaluator + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + private: // No assignment (copies are needed by the kernels) 
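 // (The declaration just below is the classic pre-C++11 idiom for
 // "= delete": a private, never-defined operator= keeps the evaluator
 // copyable, as the kernels require, while making assignment a
 // compile- or link-time error.)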
TensorEvaluator& operator = (const TensorEvaluator&); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 75519c9f5..649bdb308 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -38,6 +38,18 @@ template class TensorDevice { return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const DeviceType& m_device; ExpressionType& m_expression; @@ -58,6 +70,18 @@ template class TensorDevice + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const ThreadPoolDevice& m_device; ExpressionType& m_expression; @@ -79,6 +103,17 @@ template class TensorDevice return *this; } + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + protected: const GpuDevice& m_device; ExpressionType m_expression; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index fad342eab..5a6ff70e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -37,23 +37,41 @@ struct DefaultDevice { // Multiple cpu cores // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS + +typedef std::future Future; + struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } - size_t numThreads() const { return num_threads_; } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); } + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } + EIGEN_STRONG_INLINE size_t numThreads() const { + return num_threads_; + } + + template + EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const { + return std::async(std::launch::async, f, args...); + } + template + EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const { + std::async(std::launch::async, f, args...); + } + private: // todo: NUMA, ... 
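+  // A usage sketch for the enqueue API above (illustrative only; the
+  // task function and its argument are hypothetical):
+  //   Eigen::ThreadPoolDevice device(4);
+  //   Future f = device.enqueue(&my_task, some_arg);  // runs via std::async
+  //   f.wait();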
size_t num_threads_; @@ -63,40 +81,33 @@ struct ThreadPoolDevice { // GPU offloading #ifdef EIGEN_USE_GPU -static int m_numMultiProcessors = 0; -static int m_maxThreadsPerBlock = 0; -static int m_maxThreadsPerMultiProcessor = 0; +static cudaDeviceProp m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess); + m_devicePropInitialized = true; + } +} static inline int getNumCudaMultiProcessors() { - if (m_numMultiProcessors == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_numMultiProcessors = deviceProp.multiProcessorCount; - } - return m_numMultiProcessors; + initializeDeviceProp(); + return m_deviceProperties.multiProcessorCount; } static inline int maxCudaThreadsPerBlock() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - } - return m_maxThreadsPerBlock; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerBlock; } static inline int maxCudaThreadsPerMultiProcessor() { - if (m_maxThreadsPerBlock == 0) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - m_numMultiProcessors = deviceProp.multiProcessorCount; - m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock; - m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor; - } - return m_maxThreadsPerMultiProcessor; + initializeDeviceProp(); + return m_deviceProperties.maxThreadsPerMultiProcessor; } +static inline int sharedMemPerBlock() { + initializeDeviceProp(); + return m_deviceProperties.sharedMemPerBlock; +} + struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. @@ -141,8 +152,8 @@ struct GpuDevice { #endif } - EIGEN_STRONG_INLINE size_t numThreads() const { - // Fixme: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME return 32; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 732c6b344..2dd8e274b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,7 +29,7 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pairs on cuda devices +// Can't use std::pair on cuda devices template struct IndexPair { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 587cbd5ca..ce9d73578 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } - EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 0f969036c..e324ba8d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -65,13 +65,13 @@ struct TensorEvaluator return m_data[index]; } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -113,13 +113,17 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else return m_data[index]; +#endif } - template EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_data + index); + return internal::ploadt_ro(m_data + index); } const Scalar* data() const { return m_data; } @@ -166,7 +170,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -219,7 +223,7 @@ struct TensorEvaluator, Device> } template - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -278,7 +282,7 @@ struct TensorEvaluator - EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -340,7 +344,7 @@ struct TensorEvaluator return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 10f5a5ee7..01fa04c64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorExecutor @@ -62,7 +58,7 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; + const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); @@ -131,10 +127,10 @@ class TensorExecutor const Index numblocks = size / blocksize; Index i = 0; - std::vector > results; + std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); + results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -154,11 +150,31 @@ class TensorExecutor // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template -__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) { +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, unsigned int size) { + const int first_index = blockIdx.x * blockDim.x + threadIdx.x; const int step_size = blockDim.x * gridDim.x; - for (int i = first_index; i < size; i += step_size) { - eval.evalScalar(i); + + if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { + // Use the scalar path + for (int i = first_index; i < size; i += step_size) { + eval.evalScalar(i); + } + } + else { + // Use the vector path + const int PacketSize = unpacket_traits::size; + const int vectorized_step_size = step_size * PacketSize; + const int vectorized_size = (size / PacketSize) * PacketSize; + int i = first_index * PacketSize; + for ( ; i < vectorized_size; i += vectorized_step_size) { + eval.evalPacket(i); + } + for ( ; i < size; i += step_size) { + eval.evalScalar(i); + } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 4d7f9e1fd..a753c5a48 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -17,7 +17,7 @@ namespace Eigen { * * \brief The fixed sized version of the tensor class. 
* - * The fixes sized equivalent of + * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is * Eigen::TensorFixedSize> t; @@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase::size > 1), }; typedef Dimensions_ Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index cf97031be..2714117ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -31,30 +31,34 @@ namespace internal { template struct TensorIntDivisor { public: - TensorIntDivisor() { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { multiplier = 0; shift1 = 0; shift2 = 0; } // Must have 1 <= divider <= 2^31-1 - TensorIntDivisor(const T divider) { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = 32; eigen_assert(divider > 0); eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 +#ifndef __CUDA_ARCH__ const int leading_zeros = __builtin_clz(divider); - const int l = N - (leading_zeros+1); +#else + const int leading_zeros = __clz(divider); +#endif + const int log_div = N - (leading_zeros+1); - multiplier = (static_cast(1) << (N+l)) / divider - (static_cast(1) << N) + 1; - shift1 = (std::min)(1, l); - shift2 = (std::max)(0, l-1); + multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; } // Must have 0 <= numerator <= 2^32-1 - T divide(const T numerator) const { - static const int N = 32; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + const int N = 32; eigen_assert(numerator >= 0); eigen_assert(numerator <= (1ull< -static T operator / (const T& numerator, const TensorIntDivisor& divisor) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 04849dd9f..2c0d2cd0f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -42,26 +42,25 @@ template class TensorMap : public Tensor static const int Options = Options_; - static const std::size_t NumIndices = PlainObjectType::NumIndices; + static const Index NumIndices = PlainObjectType::NumIndices; typedef typename PlainObjectType::Dimensions Dimensions; - enum { - IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned), - PacketAccess = true, + IsAligned = ((int(Options_)&Aligned)==Aligned), + PacketAccess = (internal::packet_traits::size > 1), }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array({{firstDimension, otherDimensions...}})) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
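    // (A construction sketch, illustrative rather than part of the patch:
    // given a raw buffer of 2*3*5 floats,
    //   float data[30];
    //   Eigen::TensorMap<Eigen::Tensor<float, 3> > m(data, 2, 3, 5);
    // matches this constructor; the relaxed assertion below additionally
    // accepts maps whose NumIndices is Dynamic.)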
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array(firstDimension)) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif @@ -176,12 +175,13 @@ template class TensorMap : public Tensor template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) { - static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + const std::size_t NumDims = sizeof...(otherIndices) + 1; if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, otherIndices...}}); return m_data[index]; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7da89458f..8da6e0f26 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -144,7 +144,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -206,7 +206,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { - static const int packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index f7e7fc107..7e0063626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), }; @@ -194,7 +194,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; enum { - IsAligned = true, + IsAligned = false, PacketAccess = 
(internal::packet_traits::size > 1), }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0c4f8a3d6..aaec39756 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -30,11 +30,11 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template +template class TensorStorage { private: @@ -62,7 +62,7 @@ class TensorStorage // pure-dynamic, but without specification of all dimensions explicitly -template +template class TensorStorage : public TensorStorage::type> { @@ -79,7 +79,7 @@ class TensorStorage }; // pure dynamic -template +template class TensorStorage::type> { T *m_data; @@ -140,6 +140,7 @@ class TensorStorage, 1, typename eval -class TensorStridingOp : public TensorBase, WriteAccessors> +class TensorStridingOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -97,7 +97,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = /*TensorEvaluator::PacketAccess*/false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,28 +109,23 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } else { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - } - } - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] *= op.strides()[i]; + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } - // typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -150,16 +145,44 @@ struct TensorEvaluator, Device> return m_impl.coeff(inputIndex); } - /* template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_impl.template packet(index); - }*/ + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + 
inputIndices[1] += indices[1] * m_inputStrides[0]; + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } Scalar* data() const { return NULL; } protected: - // Strides m_strides; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 40f805741..5940a8cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -70,14 +70,18 @@ struct traits > }; -template -struct traits > +template +struct traits > : public traits { typedef traits BaseTraits; typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + enum { + Options = Options_, + Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), + }; }; @@ -105,16 +109,16 @@ struct eval, Eigen::Dense> typedef const TensorFixedSize& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; template @@ -141,16 +145,16 @@ struct nested, 1, typename e typedef const TensorFixedSize& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef const TensorMap& type; + typedef const TensorMap& type; }; } // end namespace internal diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d6c435947..a7ef2b402 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -110,6 +110,7 @@ if(EIGEN_TEST_CXX11) # ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") + ei_add_test(cxx11_tensor_of_complex "-std=c++0x") ei_add_test(cxx11_tensor_of_strings "-std=c++0x") ei_add_test(cxx11_tensor_intdiv "-std=c++0x") ei_add_test(cxx11_tensor_lvalue "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index f2b126413..0ac3f9bf9 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -253,6 +253,39 @@ static void test_auto_resize() } +static void test_compound_assign() +{ + Tensor start_tensor(10); + Tensor offset_tensor(10); + start_tensor.setRandom(); + offset_tensor.setRandom(); + + Tensor tensor = start_tensor; + tensor += offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i)); + } + + tensor = start_tensor; + tensor -= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i)); + } + + tensor 
= start_tensor; + tensor *= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i)); + } + + tensor = start_tensor; + tensor /= offset_tensor; + for (int i = 0; i < 10; ++i) { + VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i)); + } +} + + void test_cxx11_tensor_assign() { CALL_SUBTEST(test_1d()); @@ -260,5 +293,5 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_same_type()); CALL_SUBTEST(test_auto_resize()); - + CALL_SUBTEST(test_compound_assign()); } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index bafe73edd..4672db463 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -64,8 +64,78 @@ static void test_expr() } +static void test_modes() { + Tensor input(3); + Tensor kernel(3); + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + const Eigen::array dims{{0}}; + Eigen::array, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(0, 0); + Tensor valid(1); + valid = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor same(3); + same = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
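+  // (Shape check, assuming the numpy definitions linked above: FULL pads
+  // the input with K-1 = 2 zeros on each side, so a length-3 input
+  // convolved with a length-3 kernel yields 3 + 3 - 1 = 5 outputs.)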
+ padding[0] = std::make_pair(2, 2); + Tensor full(5); + full = input.pad(padding).convolve(kernel, dims); + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); +} + + +static void test_strides() { + Tensor input(13); + Tensor kernel(3); + input.setRandom(); + kernel.setRandom(); + + const Eigen::array dims{{0}}; + const Eigen::array stride_of_3{{3}}; + const Eigen::array stride_of_2{{2}}; + + Tensor result; + result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + + + + void test_cxx11_tensor_convolution() { CALL_SUBTEST(test_evals()); CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_modes()); + CALL_SUBTEST(test_strides()); } diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index f331cb481..26465ee11 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -123,6 +123,14 @@ static void test_forced_contextual_eval(Context* context) context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } +template +static void test_compound_assignment(Context* context) +{ + context->out().device(context->device()) = context->in1().constant(2.718f); + context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; +} + + template static void test_contraction(Context* context) { @@ -197,6 +205,15 @@ static void test_cpu() { } } + test_compound_assignment(&context); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { @@ -299,6 +316,16 @@ static void test_gpu() { } } + test_compound_assignment(&context); + assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + for (int i = 0; i < 40; ++i) { + for (int j = 0; j < 50; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + } + } + } + test_contraction(&context); assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); for (int i = 0; i < 40; ++i) { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 2a6a97856..fd1b1fa32 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,6 +12,7 @@ #include using Eigen::Tensor; +using Eigen::IndexPair; static void test_simple_reshape() { @@ -52,7 +53,7 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -125,7 +126,7 @@ static void 
test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array::DimensionPair, 1> contract_along{{std::make_pair(1, 0)}}; + array, 1> contract_along{{IndexPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp new file mode 100644 index 000000000..b5044b962 --- /dev/null +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -0,0 +1,64 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::TensorMap; + + + +static void test_additions() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + for (int i = 0; i < 3; ++i) { + data1(i) = std::complex(i, -i); + data2(i) = std::complex(i, 7 * i); + } + + Tensor, 1> sum = data1 + data2; + for (int i = 0; i < 3; ++i) { + VERIFY_IS_EQUAL(sum(i), std::complex(2*i, 6*i)); + } +} + + +static void test_contractions() +{ + Tensor, 4> t_left(30, 50, 8, 31); + Tensor, 5> t_right(8, 31, 7, 20, 10); + Tensor, 5> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map, Dynamic, Dynamic>> MapXcf; + MapXcf m_left(t_left.data(), 1500, 248); + MapXcf m_right(t_right.data(), 248, 1400); + Matrix, Dynamic, Dynamic> m_result(1500, 1400); + + // This contraction should be equivalent to a regular matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + +void test_cxx11_tensor_of_complex() +{ + CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_contractions()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e02d8e4be..f0de61f8b 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,22 +9,23 @@ #define EIGEN_USE_THREADS - +#include #include "main.h" #include + using Eigen::Tensor; -void test_cxx11_tensor_thread_pool() +static void test_multithread_elementwise() { - Eigen::Tensor in1(2,3,7); - Eigen::Tensor in2(2,3,7); - Eigen::Tensor out(2,3,7); + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); in1.setRandom(); in2.setRandom(); - Eigen::ThreadPoolDevice thread_pool_device(3); + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); out.device(thread_pool_device) = in1 + in2 * 3.14f; for (int i = 0; i < 2; ++i) { @@ -35,3 +36,222 @@ void test_cxx11_tensor_thread_pool() } } } + + +static void test_multithread_compound_assignment() +{ + Tensor in1(2,3,7); + Tensor in2(2,3,7); + Tensor out(2,3,7); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(3, 11)); + out.device(thread_pool_device) = in1; + out.device(thread_pool_device) += in2 * 3.14f; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + } + } + } +} + + +static void 
test_multithread_contraction() +{ + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor t_result(30, 50, 70, 2, 10); + + t_left.setRandom(); + t_right.setRandom(); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + + + typedef Map MapXf; + MapXf m_left(t_left.data(), 1500, 1147); + MapXf m_right(t_right.data(), 1147, 1400); + MatrixXf m_result(1500, 1400); + + Eigen::ThreadPoolDevice thread_pool_device(4); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_contraction_corner_cases() +{ + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); + + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result = t_result.constant(NAN); + + // this contraction should be equivalent to a single matrix multiplication + typedef Tensor::DimensionPair DimPair; + Eigen::array dims{{DimPair(0, 0)}}; + + typedef Map MapXf; + MapXf m_left(t_left.data(), 32, 500); + MapXf m_right(t_right.data(), 32, 28*28); + MatrixXf m_result(500, 28*28); + + Eigen::ThreadPoolDevice thread_pool_device(12); + + // compute results by separate methods + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + m_result = m_left.transpose() * m_right; + + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_result.resize (1, 28*28); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 500); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (500, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 500); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } + + t_left.resize(32, 1); + t_right.resize(32, 4); + t_left = (t_left.constant(-0.5f) + 
t_left.random()) * 2.0f; + t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; + t_result.resize (1, 4); + t_result = t_result.constant(NAN); + t_result.device(thread_pool_device) = t_left.contract(t_right, dims); + new(&m_left) MapXf(t_left.data(), 32, 1); + new(&m_right) MapXf(t_right.data(), 32, 4); + m_result = m_left.transpose() * m_right; + for (ptrdiff_t i = 0; i < t_result.size(); i++) { + assert(!isnan(t_result.data()[i])); + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; + assert(false); + } + } +} + + +static void test_multithread_contraction_agrees_with_singlethread() { + int contract_size = internal::random(1, 5000); + + Tensor left(internal::random(1, 80), + contract_size, + internal::random(1, 100)); + + Tensor right(internal::random(1, 25), + internal::random(1, 37), + contract_size, + internal::random(1, 51)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(1, 2)}}); + + Eigen::ThreadPoolDevice thread_pool_device(internal::random(2, 11)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result(st_result.dimensions()); + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (fabs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + + +static void test_memcpy() { + + for (int i = 0; i < 5; ++i) { + const int num_threads = internal::random(3, 11); + Eigen::ThreadPoolDevice thread_pool_device(num_threads); + + const int size = internal::random(13, 7632); + Tensor t1(size); + t1.setRandom(); + std::vector result(size); + thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float)); + for (int i = 0; i < size; i++) { + VERIFY_IS_EQUAL(t1(i), result[i]); + } + } +} + + +void test_cxx11_tensor_thread_pool() +{ + CALL_SUBTEST(test_multithread_elementwise()); + CALL_SUBTEST(test_multithread_compound_assignment()); + + CALL_SUBTEST(test_multithread_contraction()); + + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + + // Exercise various cases that have been problematic in the past. 
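+  // (Those cases include the degenerate 32x1 operands and the result
+  // buffers resized and refilled with NaNs between runs, as set up in
+  // test_contraction_corner_cases above.)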
+ CALL_SUBTEST(test_contraction_corner_cases()); + + CALL_SUBTEST(test_memcpy()); +} From dba55041ab62961e549ea58778dffa3eaa0cbdb5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 11:20:36 -0700 Subject: [PATCH 079/214] Added support for promises Started to improve multithreaded contractions --- unsupported/Eigen/CXX11/Tensor | 1 + .../src/Tensor/TensorContractionThreadPool.h | 351 ++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 11 + 3 files changed, 363 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 2137f4276..7ec60044e 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,6 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 000000000..dc0513305 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
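+
+// A sketch of the promise-based handshake the scheduler below relies on
+// (illustrative only, using the Promise/Future typedefs this patch adds
+// to TensorDeviceType.h):
+//   Promise p;                       // std::promise<void>
+//   Future ready = p.get_future();
+//   // worker:   ...; p.set_value();   // publish "this block is done"
+//   // consumer: ready.wait();         // block until it is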
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { +namespace internal { + +template +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template +struct packRhsAndKernelArg { + const std::vector* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + std::vector* kernel_promises; + const std::vector* lhs_futures; + const bool need_to_pack; +}; + +} // end namespace internal + + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs LhsPacker; + typedef internal::gemm_pack_rhs RhsPacker; + + // TODO: replace false, false with conjugate values? 
+ typedef internal::gebp_kernel GebpKernel; + + typedef internal::packLhsArg packLArg; + typedef internal::packRhsAndKernelArg packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + LhsPacker pack_lhs; + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + Index mc = m; + Index nc = n; + Index kc = k; + internal::computeProductBlockingSizes(kc, mc, nc/*, num_threads*/); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const int sizeA = mc * kc; + const int sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = (std::min)(num_threads, m_blocks); + std::vector blockAs; + blockAs.reserve(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. 
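+    // (A rough cost figure for the option chosen here: with sizeB = kc * nc
+    // as computed above, the rhs scratch space totals
+    // n_blocks * kc * nc * sizeof(RhsScalar) bytes.)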
con: overhead + std::vector blockBs; + blockBs.reserve(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_futures starts with all null futures + std::vector lhs_futures(num_threads); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_promises = num_threads * n_blocks; + Promise p; + p.set_value(); + std::vector kernel_promises(num_kernel_promises, p); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = (std::min)(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = (std::min)(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + for (int i = 0; i < n_blocks; ++i) { + int future_id = (blockAId * n_blocks + i); + wait_until_ready(&kernel_promises[future_id]); + kernel_promises[future_id] = Promise(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + lhs_futures[blockAId] = + this->m_device.enqueue(&Self::packLhs, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = (std::min)(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (int i = num_blocks; i < num_threads; ++i) { + int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + int future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(&kernel_promises[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_promises, // kernel_promises + &lhs_futures, // lhs_futures + need_to_pack, // need_to_pack + }; + + typedef decltype(Self::packRhsAndKernel) Func; + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + } + } + } + + // collect the last frame of kernel futures + for (int i = 0; i < kernel_promises.size(); ++i) { + wait_until_ready(&kernel_promises[i]); + } + + // deallocate all of the memory for both A and B's + for (int i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (int i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. 
+ */ + template <typename packLArg, typename LhsPacker> + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template <typename packRKArg, typename RhsPacker, typename GebpKernel> + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + + wait_until_ready(&(*arg.lhs_futures)[blockAId]); + const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + eigen_assert(!(*arg.kernel_promises)[set_idx].ready()); + (*arg.kernel_promises)[set_idx].set_value(); + } + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index 5a6ff70e9..3748879cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -39,6 +39,17 @@ struct DefaultDevice { #ifdef EIGEN_USE_THREADS typedef std::future<void> Future; +typedef std::promise<void> Promise; + +static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { + f->wait(); + // eigen_assert(f->ready()); +} + +static EIGEN_STRONG_INLINE void wait_until_ready(Promise* p) { + p->get_future().wait(); + // eigen_assert(p->get_future().ready()); +} struct ThreadPoolDevice { ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: [PATCH 080/214] Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- .../Core/products/GeneralBlockPanelKernel.h | 122 ++++++++++++------ Eigen/src/Core/products/GeneralMatrixMatrix.h | 18 +-- .../products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- .../Core/products/SelfadjointMatrixMatrix.h | 6 +- .../Core/products/TriangularMatrixMatrix.h | 2 +- .../Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../CXX11/src/Tensor/TensorContraction.h | 2 +- .../src/Tensor/TensorContractionThreadPool.h | 13 +- 13 files changed, 117 insertions(+), 79 deletions(-)
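
[Editor's note] To make the commit above easier to follow: with the new num_threads parameter, computeProductBlockingSizes no longer sizes the blocks for a single core. kc is still derived from the L1 budget, but each thread now gets its own share of the n dimension (bounded by what fits in L2) and its own slice of the shared L3 for its block of the lhs. A minimal standalone sketch of that arithmetic follows; the cache and register-block constants are illustrative, this is not the patch's exact code:

#include <algorithm>
#include <cstddef>
#include <iostream>

// Round x down to a multiple of unit (the same role the mr_mask/nr_mask
// bit tricks play in the patch).
static std::ptrdiff_t round_down(std::ptrdiff_t x, std::ptrdiff_t unit) {
  return (x / unit) * unit;
}

static std::ptrdiff_t ceil_div(std::ptrdiff_t a, std::ptrdiff_t b) {
  return (a + b - 1) / b;
}

int main() {
  const std::ptrdiff_t l1 = 32 * 1024, l2 = 256 * 1024, l3 = 2 * 1024 * 1024;
  const std::ptrdiff_t mr = 8, nr = 4, scalar_size = sizeof(float);
  const int num_threads = 4;
  std::ptrdiff_t m = 2048, n = 2048, k = 2048;

  // kc: an mr x kc sliver of A plus a kc x nr sliver of B should stay in L1.
  std::ptrdiff_t kc = std::min(round_down(l1 / ((mr + nr) * scalar_size), 8), k);

  // nc: a kc x nc panel of B should fit in what L2 leaves over, but there is
  // no point claiming more than this thread's share of the n dimension.
  std::ptrdiff_t n_cache = (l2 - l1) / (nr * scalar_size * kc);
  std::ptrdiff_t n_per_thread = ceil_div(n, num_threads);
  std::ptrdiff_t nc = std::min(round_down(n_cache, nr),
                               round_down(n_per_thread + nr - 1, nr));

  // mc: L3 is shared between cores, so each thread gets a private chunk of it
  // for its mc x kc block of A.
  std::ptrdiff_t m_cache = (l3 - l2) / (scalar_size * kc * num_threads);
  std::ptrdiff_t m_per_thread = ceil_div(m, num_threads);
  std::ptrdiff_t mc = std::min(round_down(m_cache, mr),
                               round_down(m_per_thread + mr - 1, mr));

  std::cout << "kc=" << kc << " mc=" << mc << " nc=" << nc << "\n";
}

diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162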
100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. 
- std::ptrdiff_t l1, l2; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, - manage_caching_sizes(GetAction, &l1, &l2); + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m 0); + } + + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } + + if (l3 > l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) + { + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); + } + else // no l3 blocking { DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); - } - else // full columns - { DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector<Promise> kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 16 Oct 2014 10:10:04 -0700 Subject: [PATCH 081/214] Avoid calling get_future() more than once on a given promise. --- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 ++++++++----- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 5 ----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 8e4c7c11d..cf1352a31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -198,8 +198,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises); + std::vector<Future> kernel_futures(num_kernel_promises); for (int i = 0; i < kernel_promises.size(); ++i) { kernel_promises[i].set_value(); + kernel_futures[i] = kernel_promises[i].get_future(); } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { @@ -218,8 +220,9 @@ struct TensorEvaluatorready()); } -static EIGEN_STRONG_INLINE void wait_until_ready(Promise* p) { - p->get_future().wait(); - // eigen_assert(p->get_future().ready()); -} - struct ThreadPoolDevice { ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { }
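
[Editor's note] The rationale for the fix above: std::promise hands out its shared state exactly once, so a second call to get_future() on the same promise throws std::future_error with code future_already_retrieved. That is why the futures are now cached up front, instead of wait_until_ready() calling p->get_future().wait() on every check. A minimal standalone illustration (not Eigen code):

#include <future>
#include <iostream>

int main() {
  std::promise<void> p;
  std::future<void> f = p.get_future();  // the one and only retrieval
  try {
    p.get_future();                      // second retrieval: throws
  } catch (const std::future_error& e) {
    std::cout << "second get_future(): " << e.code().message() << "\n";
  }
  p.set_value();
  f.wait();  // waiting on the cached future is safe, and repeatable
}

From 94e47798f4e462b857a00b4ca60c954c71d16605 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 10:41:07 -0700 Subject: [PATCH 082/214] Fixed the return types of unary and binary expressions to properly handle the case where it is different from the input type (e.g.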
abs(complex)) --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 16 ++++++++-------- unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 7 ++----- unsupported/test/cxx11_tensor_of_complex.cpp | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index e324ba8d2..131326615 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -155,8 +155,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -203,8 +203,8 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -257,8 +257,8 @@ struct TensorEvaluator::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -317,8 +317,8 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename internal::traits::Packet PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index de66da13f..6e5503de1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -84,9 +84,7 @@ struct traits > typedef typename result_of< UnaryOp(typename XprType::Scalar) >::type Scalar; - typedef typename result_of< - UnaryOp(typename XprType::Packet) - >::type Packet; + typedef typename internal::packet_traits::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; }; @@ -188,8 +186,7 @@ class TensorCwiseBinaryOp : public TensorBase::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index b5044b962..24b2bcb58 100644 --- 
a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -32,6 +32,22 @@ static void test_additions() } +static void test_abs() +{ + Tensor, 1> data1(3); + Tensor, 1> data2(3); + data1.setRandom(); + data2.setRandom(); + + Tensor abs1 = data1.abs(); + Tensor abs2 = data2.abs(); + for (int i = 0; i < 3; ++i) { + VERIFY_IS_APPROX(abs1(i), std::abs(data1(i))); + VERIFY_IS_APPROX(abs2(i), std::abs(data2(i))); + } +} + + static void test_contractions() { Tensor, 4> t_left(30, 50, 8, 31); @@ -60,5 +76,6 @@ static void test_contractions() void test_cxx11_tensor_of_complex() { CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_abs()); CALL_SUBTEST(test_contractions()); } From ae697b471c0d3961ebdb633e30046e5fe31fbe24 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 14:52:50 -0700 Subject: [PATCH 083/214] Silenced a few compilation warnings Generalized a TensorMap constructor --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 10 +++++----- 13 files changed, 24 insertions(+), 22 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 879057f38..ceed09505 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -82,7 +83,7 @@ class Tensor : public TensorBase > static const std::size_t NumIndices = NumIndices_; - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0e55d4de1..2bd158dac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 9ecea9108..3aa3eba24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index b8e43f484..74485b15b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -140,7 +140,7 @@ struct TensorEvaluator m_outputStrides; array m_leftStrides; array m_rightStrides; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Axis m_axis; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cd992daab..0db34adb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -671,10 +671,10 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; - const Device& m_device; - Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 34bdd5309..50cb10a33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -230,7 +230,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 2c0d2cd0f..0a8c10ac7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -64,7 +64,8 @@ template class TensorMap : public Tensor } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + template + inline TensorMap(PointerArgType dataPtr, const Dimensions& 
dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 13109f514..686bf5c24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -130,8 +130,8 @@ struct TensorEvaluator, Device> Scalar* data() const { return m_impl.data(); } protected: - NewDimensions m_dimensions; TensorEvaluator m_impl; + NewDimensions m_dimensions; }; @@ -381,13 +381,13 @@ struct TensorEvaluator, Devi return inputIndex; } - Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; - const StartIndices m_offsets; TensorEvaluator m_impl; const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 8da6e0f26..89c0cff05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -215,11 +215,11 @@ struct TensorEvaluator, Device return rslt; } - PaddingDimensions m_padding; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; + PaddingDimensions m_padding; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 01f2daf52..e2fe32d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eef992106..cbe87394b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -152,7 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -217,8 +217,8 @@ struct TensorEvaluator, Device> array m_preservedStrides; array m_reducedStrides; array m_reducedDims; - Op m_reducer; TensorEvaluator m_impl; + Op m_reducer; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 7e0063626..831a9f005 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -131,7 +131,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp 
b/unsupported/test/cxx11_tensor_fixed_size.cpp index b0501aaa3..99ffc7f07 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,10 +32,10 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, 6); + TensorMap > > vec3(data3, Sizes<6>()); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, 6); + TensorMap, RowMajor> > vec4(data4, Sizes<6>()); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); @@ -68,9 +68,9 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1,2,3); + TensorMap >> mat1(data1, Sizes<2, 3>()); float data2[6]; - TensorMap, RowMajor>> mat2(data2,2,3); + TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); VERIFY_IS_EQUAL((mat1.size()), 2*3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); @@ -166,7 +166,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array(i,j,k)) = val; + mat1(array{{i,j,k}}) = val; val += 1.0; } } From 65af852b54afca3c76c978c1bfd27d8a1451cab6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 15:02:30 -0700 Subject: [PATCH 084/214] Silenced one last warning --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0db34adb1..c530b27a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int i) { } + EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { From 7acd38d19e2f9559825c78b4be8644f3b10496fb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 17 Oct 2014 09:49:03 -0700 Subject: [PATCH 085/214] Created some benchmarks for the tensor code --- bench/btl/CMakeLists.txt | 1 + bench/btl/libs/tensors/CMakeLists.txt | 44 +++++++++ bench/btl/libs/tensors/main_linear.cpp | 23 +++++ bench/btl/libs/tensors/main_matmat.cpp | 21 +++++ bench/btl/libs/tensors/main_vecmat.cpp | 21 +++++ bench/btl/libs/tensors/tensor_interface.hh | 105 +++++++++++++++++++++ unsupported/Eigen/CXX11/Core | 2 + 7 files changed, 217 insertions(+) create mode 100644 bench/btl/libs/tensors/CMakeLists.txt create mode 100644 bench/btl/libs/tensors/main_linear.cpp create mode 100644 bench/btl/libs/tensors/main_matmat.cpp create mode 100644 bench/btl/libs/tensors/main_vecmat.cpp create mode 100644 bench/btl/libs/tensors/tensor_interface.hh diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index b299d9899..9444b450c 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -97,6 +97,7 @@ ENABLE_TESTING() add_subdirectory(libs/eigen3) add_subdirectory(libs/eigen2) +add_subdirectory(libs/tensors) add_subdirectory(libs/BLAS) add_subdirectory(libs/ublas) add_subdirectory(libs/gmm) diff --git a/bench/btl/libs/tensors/CMakeLists.txt b/bench/btl/libs/tensors/CMakeLists.txt new file mode 100644 index 000000000..09d6d8e43 --- /dev/null +++ b/bench/btl/libs/tensors/CMakeLists.txt @@ -0,0 +1,44 @@ + + +if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR) + # unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version + set(TENSOR_INCLUDE_DIR 
${Eigen_SOURCE_DIR}) + set(TENSOR_FOUND TRUE) +else() + find_package(Tensor) +endif() + +if (TENSOR_FOUND) + + include_directories(${TENSOR_INCLUDE_DIR}) + btl_add_bench(btl_tensor_linear main_linear.cpp) + btl_add_bench(btl_tensor_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor") + + option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF) + if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC) + btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp) + btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp) + btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp) + + btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec") + endif() + + + if(NOT BTL_NOVEC) + btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF) + btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF) + btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF) + btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec") + + endif(NOT BTL_NOVEC) + +endif (TENSOR_FOUND) diff --git a/bench/btl/libs/tensors/main_linear.cpp b/bench/btl/libs/tensors/main_linear.cpp new file mode 100644 index 000000000..e257f1e72 --- /dev/null +++ b/bench/btl/libs/tensors/main_linear.cpp @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + bench > >(MIN_AXPY,MAX_AXPY,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_matmat.cpp b/bench/btl/libs/tensors/main_matmat.cpp new file mode 100644 index 000000000..675fcfc6d --- /dev/null +++ b/bench/btl/libs/tensors/main_matmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MM,MAX_MM,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/main_vecmat.cpp b/bench/btl/libs/tensors/main_vecmat.cpp new file mode 100644 index 000000000..1af00c81b --- /dev/null +++ b/bench/btl/libs/tensors/main_vecmat.cpp @@ -0,0 +1,21 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#include "utilities.h" +#include "tensor_interface.hh" +#include "bench.hh" +#include "basic_actions.hh" + +BTL_MAIN; + +int main() +{ + bench > >(MIN_MV,MAX_MV,NB_POINT); + + return 0; +} diff --git a/bench/btl/libs/tensors/tensor_interface.hh b/bench/btl/libs/tensors/tensor_interface.hh new file mode 100644 index 000000000..97b8e0f0b --- /dev/null +++ b/bench/btl/libs/tensors/tensor_interface.hh @@ -0,0 +1,105 @@ +//===================================================== +// Copyright (C) 2014 Benoit Steiner +//===================================================== +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +#ifndef TENSOR_INTERFACE_HH +#define TENSOR_INTERFACE_HH + +#include +#include +#include "btl.hh" + +using namespace Eigen; + +template +class tensor_interface +{ +public : + typedef real real_type; + typedef typename Eigen::Tensor::Index Index; + + typedef std::vector stl_vector; + typedef std::vector stl_matrix; + + typedef Eigen::Tensor gene_matrix; + typedef Eigen::Tensor gene_vector; + + + static inline std::string name( void ) + { + return EIGEN_MAKESTRING(BTL_PREFIX); + } + + static void free_matrix(gene_matrix & /*A*/, int /*N*/) {} + + static void free_vector(gene_vector & /*B*/) {} + + static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){ + A.resize(Eigen::array(A_stl[0].size(), A_stl.size())); + + for (unsigned int j=0; j(i,j)) = A_stl[j][i]; + } + } + } + + static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){ + B.resize(B_stl.size()); + + for (unsigned int i=0; i(i,j)); + } + } + } + + static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){ + typedef typename Eigen::Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + X/*.noalias()*/ = A.contract(B, dims); + } + + static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){ + Y += X.constant(coef) * X; + } + + static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){ + Y = X.constant(a)*X + Y.constant(b)*Y; + } + + static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){ + cible = source; + } + + static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){ 
+ cible = source; + } +}; + +#endif diff --git a/unsupported/Eigen/CXX11/Core b/unsupported/Eigen/CXX11/Core index f6c3b49bb..292f09564 100644 --- a/unsupported/Eigen/CXX11/Core +++ b/unsupported/Eigen/CXX11/Core @@ -30,6 +30,8 @@ * \endcode */ +#include + // Emulate the cxx11 functionality that we need if the compiler doesn't support it. #if __cplusplus <= 199711L #include "src/Core/util/EmulateCXX11Meta.h" From f786897e4b96737767effc85bedb78f06dc46dc5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 17 Oct 2014 15:33:27 -0700 Subject: [PATCH 086/214] Added access to the underlying raw data of a tensor slice/chip whenever possible --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 9 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 21 +++++- unsupported/test/cxx11_tensor_chipping.cpp | 37 +++++++++++ unsupported/test/cxx11_tensor_morphing.cpp | 64 ++++++++++++++++++- 4 files changed, 126 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 3aa3eba24..b862a8fd3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -157,7 +157,14 @@ struct TensorEvaluator, Device> }*/ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (DimId == NumDims && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 686bf5c24..3447592eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -366,7 +366,26 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); + if (result) { + Index offset = 0; + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + return result + offset; + } + return NULL; + } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 8c8a0cec2..0027b2888 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -236,9 +236,46 @@ static void test_chip_as_lvalue() } +static void test_chip_raw_data() +{ + Tensor<float, 5> tensor(2,3,5,7,11); + tensor.setRandom(); + + typedef TensorEvaluator<decltype(tensor.chip<4>(3)), DefaultDevice> Evaluator4; + auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice()); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + int chip_index = i + 2 * (j + 3 * (k + 5 * l)); + VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); + } + } + } + } + + typedef TensorEvaluator<decltype(tensor.chip<0>(0)), DefaultDevice> Evaluator0; + auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
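+
+  // [Editor's note, not part of the original patch] chip<4> above fixes the
+  // outermost dimension of this column-major tensor, so the chip maps to a
+  // contiguous region of the underlying buffer and data() can return a raw
+  // pointer into it. chip<0> above and chip<1>..chip<3> below each fix an
+  // inner dimension, which yields a strided, non-contiguous view, so their
+  // data() is expected to be NULL.
+  typedef TensorEvaluator<decltype(tensor.chip<1>(0)),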
DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast(0)); +} + + void test_cxx11_tensor_chipping() { CALL_SUBTEST(test_simple_chip()); CALL_SUBTEST(test_chip_in_expr()); CALL_SUBTEST(test_chip_as_lvalue()); + CALL_SUBTEST(test_chip_raw_data()); } diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index fd1b1fa32..78b0dade0 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -12,7 +12,6 @@ #include using Eigen::Tensor; -using Eigen::IndexPair; static void test_simple_reshape() { @@ -53,7 +52,8 @@ static void test_reshape_in_expr() { TensorMap> tensor2(m2.data(), 3,5,7,11,13); Tensor::Dimensions newDims1{{2,3*5*7*11}}; Tensor::Dimensions newDims2{{3*5*7*11,13}}; - Eigen::array, 1> contract_along{{IndexPair(1, 0)}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; Tensor tensor3(2,13); tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along); @@ -126,7 +126,8 @@ static void test_slice_in_expr() { TensorMap> tensor1(m1.data(), 7, 7); TensorMap> tensor2(m2.data(), 3, 3); Tensor tensor3(3,1); - array, 1> contract_along{{IndexPair(1, 0)}}; + typedef Tensor::DimensionPair DimPair; + array contract_along{{DimPair(1, 0)}}; Eigen::DSizes indices1(1,2); Eigen::DSizes sizes1(3,3); @@ -190,6 +191,62 @@ static void test_slice_as_lvalue() } +static void test_slice_raw_data() +{ + Tensor tensor(3,5,7,11); + tensor.setRandom(); + + Eigen::DSizes offsets(1,2,3,4); + Eigen::DSizes extents(1,1,1,1); + typedef TensorEvaluator SliceEvaluator; + auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); + VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); + + extents = Eigen::DSizes(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + + extents = Eigen::DSizes(1,2,1,1); + auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice3.data(), static_cast(0)); + + offsets = Eigen::DSizes(0,2,3,4); + extents = Eigen::DSizes(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + + offsets = Eigen::DSizes(0,0,0,4); + extents = Eigen::DSizes(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } + } + 
} + } + + offsets = Eigen::DSizes(0,0,0,0); + extents = Eigen::DSizes(3,5,7,11); + auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3ul*5*7*11); + VERIFY_IS_EQUAL(slice6.data(), tensor.data()); +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); @@ -199,4 +256,5 @@ void test_cxx11_tensor_morphing() CALL_SUBTEST(test_simple_slice()); CALL_SUBTEST(test_slice_in_expr()); CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_raw_data()); } From debc97821c775518afd54e05e19dec9eb0c3bde1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 28 Oct 2014 23:10:13 -0700 Subject: [PATCH 087/214] Added support for tensor references --- unsupported/Eigen/CXX11/Tensor | 2 + .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorRef.h | 360 ++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorTraits.h | 40 ++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_ref.cpp | 192 ++++++++++ 6 files changed, 596 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorRef.h create mode 100644 unsupported/test/cxx11_tensor_ref.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 47447f446..c36db96ec 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -76,6 +76,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 67f478822..a72e11215 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,6 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; +template class TensorRef; template::value> class TensorBase; template class TensorCwiseNullaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 000000000..db2027a5f --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,360 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + virtual const Dimensions& dimensions() const = 0; + virtual const Scalar* data() const = 0; + + virtual const Scalar coeff(DenseIndex index) const = 0; + virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assigment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + +static char dummy[8]; + +template +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator::Scalar> { + public: + // typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Scalar Scalar; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + virtual const Dimensions& dimensions() const { + return m_dims; + } + virtual const Scalar* data() const { + return m_impl.data(); + } + + virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + virtual Scalar& coeffRef(DenseIndex index) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return *reinterpret_cast(dummy); + }; + + protected: + TensorEvaluator m_impl; + Dimensions m_dims; +}; + +template +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly { + public: + typedef TensorLazyEvaluatorReadOnly Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template +class TensorLazyEvaluator : public internal::conditional::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly >::type { + public: + typedef typename internal::conditional::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). 
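+ *
+ * [Editor's note: a usage sketch, adapted from the tests this patch adds in
+ * unsupported/test/cxx11_tensor_ref.cpp]
+ * \code
+ * Tensor<float, 3> input1(3, 5, 7), input2(3, 5, 7);
+ * input1.setRandom();
+ * input2.setRandom();
+ * TensorRef<Tensor<float, 3>> ref = input1 + input2;  // nothing is evaluated here
+ * float sum = ref(1, 2, 3);  // coefficients are computed on demand
+ * \endcode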
+ * + */ +template class TensorRef : public TensorBase > +{ + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + }; + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... 
+    template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
+    {
+      const Dimensions& dims = this->dimensions();
+      Index index = 0;
+      if (PlainObjectType::Options&RowMajor) {
+        index += indices[0];
+        for (int i = 1; i < NumIndices; ++i) {
+          index = index * dims[i] + indices[i];
+        }
+      } else {
+        index += indices[NumIndices-1];
+        for (int i = NumIndices-2; i >= 0; --i) {
+          index = index * dims[i] + indices[i];
+        }
+      }
+      return m_evaluator->coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+    {
+      return m_evaluator->coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+    {
+      return m_evaluator->coeffRef(index);
+    }
+
+  private:
+    EIGEN_STRONG_INLINE void unrefEvaluator() {
+      if (m_evaluator) {
+        m_evaluator->decrRefCount();
+        if (m_evaluator->refCount() == 0) {
+          delete m_evaluator;
+        }
+      }
+    }
+
+    internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
+};
+
+
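+// Evaluator for a TensorRef used as an rvalue. Reads and writes both
+// delegate to the wrapped ref; writing through a ref that is bound to an
+// rvalue expression trips the eigen_assert in
+// TensorLazyEvaluatorReadOnly::coeffRef above.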
+// evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const TensorRef<Derived>, Device>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Packet Packet;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename Derived::Packet PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+      : m_ref(m)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_ref.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_ref.coeffRef(index);
+  }
+
+  Scalar* data() const { return m_ref.data(); }
+
+ protected:
+  TensorRef<Derived> m_ref;
+};
+
+
+// evaluator for lvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
+{
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Packet Packet;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename Derived::Packet PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+  { }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return this->m_ref.coeffRef(index);
+  }
+};
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 5940a8cf1..5c0f78489 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -84,6 +84,20 @@ struct traits<TensorMap<PlainObjectType, Options_> >
   };
 };
 
+template<typename PlainObjectType>
+struct traits<TensorRef<PlainObjectType> >
+  : public traits<PlainObjectType>
+{
+  typedef traits<PlainObjectType> BaseTraits;
+  typedef typename BaseTraits::Scalar Scalar;
+  typedef typename BaseTraits::StorageKind StorageKind;
+  typedef typename BaseTraits::Index Index;
+  enum {
+    Options = BaseTraits::Options,
+    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+  };
+};
+
 
 template<typename _Scalar, int NumIndices_, int Options_>
 struct eval<Tensor<_Scalar, NumIndices_, Options_>, Eigen::Dense>
@@ -121,6 +135,19 @@ struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
   typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
+template<typename PlainObjectType>
+struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+template<typename PlainObjectType>
+struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+
 template <typename Scalar_, std::size_t NumIndices_, int Options_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_>, 1, typename eval<Tensor<Scalar_, NumIndices_, Options_> >::type>
 {
@@ -145,6 +172,7 @@ struct nested<const TensorFixedSize<Scalar_, Dimensions, Options>, 1, typename eval<const TensorFixedSize<Scalar_, Dimensions, Options> >::type>
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
+
 template <typename PlainObjectType, int Options>
 struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
 {
@@ -157,6 +185,18 @@ struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<const TensorMap<PlainObjectType, Options> >::type>
   typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
+template <typename PlainObjectType>
+struct nested<const TensorRef<PlainObjectType>, 1, typename eval<TensorRef<PlainObjectType> >::type>
+{
+  typedef const TensorRef<PlainObjectType>& type;
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index a7ef2b402..2b5395013 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -126,5 +126,6 @@ if(EIGEN_TEST_CXX11)
   ei_add_test(cxx11_tensor_striding "-std=c++0x")
 # ei_add_test(cxx11_tensor_device "-std=c++0x")
   ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
+  ei_add_test(cxx11_tensor_ref "-std=c++0x")
   ei_add_test(cxx11_tensor_io "-std=c++0x")
 endif()
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
new file mode 100644
index 000000000..4ff94a059
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
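+//
+// The tests below exercise both flavors of TensorRef: lvalue refs, which
+// alias the storage of the tensor they reference, and rvalue refs, which
+// wrap an unevaluated expression and evaluate coefficients on demand.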
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_simple_lvalue_ref()
+{
+  Tensor<int, 1> input(6);
+  input.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input);
+  TensorRef<Tensor<int, 1>> ref4 = input;
+
+  VERIFY_IS_EQUAL(ref3.data(), input.data());
+  VERIFY_IS_EQUAL(ref4.data(), input.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input(i));
+    VERIFY_IS_EQUAL(ref4(i), input(i));
+  }
+
+  for (int i = 0; i < 6; ++i) {
+    ref3.coeffRef(i) = i;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), i);
+  }
+  for (int i = 0; i < 6; ++i) {
+    ref4.coeffRef(i) = -i * 2;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), -i*2);
+  }
+}
+
+
+static void test_simple_rvalue_ref()
+{
+  Tensor<int, 1> input1(6);
+  input1.setRandom();
+  Tensor<int, 1> input2(6);
+  input2.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input1 + input2);
+  TensorRef<Tensor<int, 1>> ref4 = input1 + input2;
+
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i));
+    VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i));
+  }
+}
+
+
+static void test_multiple_dims()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  VERIFY_IS_EQUAL(ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_slice()
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0);
+  Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1);
+  slice = tensor.slice(indices3, sizes3);
+  VERIFY_IS_EQUAL(slice.data(), tensor.data());
+}
+
+
+static void test_ref_of_ref()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  TensorRef<Tensor<float, 3>> ref_of_ref(ref);
+  TensorRef<Tensor<float, 3>> ref_of_ref2;
+  ref_of_ref2 = ref;
+
+  VERIFY_IS_EQUAL(ref_of_ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k));
+        VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k));
+      }
+    }
+  }
+}
+
+
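+// Note: assigning an expression to an existing TensorRef rebinds the ref to
+// a fresh lazy evaluator; it does not write through to the tensor the ref
+// previously pointed at. This is why `result` below still differs from
+// input + bias until it is explicitly assigned from result_ref.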
+static void test_ref_in_expr()
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+  TensorRef<Tensor<float, 3>> input_ref(input);
+
+  Tensor<float, 3> result(3,5,7);
+  result.setRandom();
+  TensorRef<Tensor<float, 3>> result_ref(result);
+
+  Tensor<float, 3> bias(3,5,7);
+  bias.setRandom();
+
+  result_ref = input_ref + bias;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k));
+        VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+
+  result = result_ref;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_ref()
+{
+  CALL_SUBTEST(test_simple_lvalue_ref());
+  CALL_SUBTEST(test_simple_rvalue_ref());
+  CALL_SUBTEST(test_multiple_dims());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_ref_of_ref());
+  CALL_SUBTEST(test_ref_in_expr());
+}

From 5e62427e22002019d1a3ef05daeb75c6db7c6405 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 30 Oct 2014 17:49:39 -0700
Subject: [PATCH 088/214] Use the proper index type

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 01fa04c64..4fa8e83ef 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -149,26 +149,26 @@ class TensorExecutor
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <typename Evaluator>
+template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, unsigned int size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
     // Use the scalar path
-    for (int i = first_index; i < size; i += step_size) {
+    for (Index i = first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }
   else {
     // Use the vector path
-    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const int vectorized_step_size = step_size * PacketSize;
-    const int vectorized_size = (size / PacketSize) * PacketSize;
-    int i = first_index * PacketSize;
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_step_size = step_size * PacketSize;
+    const Index vectorized_size = (size / PacketSize) * PacketSize;
+    Index i = first_index * PacketSize;
     for ( ; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
@@ -193,7 +193,7 @@ class TensorExecutor
     const int block_size = maxCudaThreadsPerBlock();
     const Index size = array_prod(evaluator.dimensions());
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index><<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
     assert(cudaGetLastError() == cudaSuccess);
   }
   evaluator.cleanup();

From 1946cc44784c9d0b024a2f1d7d7664010735411f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Thu, 30 Oct 2014 17:52:32 -0700
Subject: [PATCH 089/214] Added missing packet primitives for CUDA.
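
These specializations live in Eigen::internal. A minimal sketch of their
semantics (illustrative only, assuming CUDA's built-in float4 type):

    float4 p = make_float4(-1.f, 2.f, -3.f, 4.f);
    float head = internal::pfirst<float4>(p);  // first packet element: -1.0f
    float4 mag = internal::pabs<float4>(p);    // element-wise |x|: (1, 2, 3, 4)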
--- Eigen/src/Core/arch/CUDA/PacketMath.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 5b0abe2e6..7b481d512 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -216,6 +216,21 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, c to[stride*1] = from.y; } +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(abs(a.x), abs(a.y)); +} + + template<> EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { double tmp = kernel.packet[0].y; From bc99c5f7db8d4d7e41e5e4358170e99a1bf9d364 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:09:53 -0700 Subject: [PATCH 090/214] fixed some potential alignment issues. --- Eigen/src/Core/util/Macros.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8fdd7d898..001907a0b 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -297,7 +297,9 @@ namespace Eigen { * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) +#if (defined __CUDACC__) +#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3447592eb..33849ed3e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -459,7 +459,7 @@ struct TensorEvaluator, Device> this->m_impl.template writePacket(inputIndices[0], x); } else { - CoeffReturnType values[packetSize]; + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; From d62bfe73a92878c878a6b46674a2ea4cec130ac8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 18:15:05 -0700 Subject: [PATCH 091/214] Use the proper index type in the padding code --- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 89c0cff05..d6347b054 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -98,7 +98,6 @@ struct TensorEvaluator, Device for (int i = 0; i < NumDims; ++i) { m_dimensions[i] += m_padding[i].first + m_padding[i].second; } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); m_inputStrides[0] = 1; 
m_outputStrides[0] = 1; @@ -125,6 +124,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; @@ -151,11 +151,11 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { - const int first = index; - const int last = index + packetSize - 1; - const int lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const int firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const int lastPaddedRight = m_outputStrides[i+1]; + const Index first = index; + const Index last = index + packetSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. @@ -179,9 +179,9 @@ struct TensorEvaluator, Device const Index last = index + packetSize - 1; const Index first = index; - const int lastPaddedLeft = m_padding[0].first; - const int firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const int lastPaddedRight = m_outputStrides[1]; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; if (last < lastPaddedLeft) { // all the coefficient are in the padding zone. From fcecafde3aac795a50c32dc5c91a0ed59b4819ed Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 30 Oct 2014 21:58:14 -0700 Subject: [PATCH 092/214] Fixed a compilation error with clang --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a753c5a48..1af2d7bcd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -187,13 +187,6 @@ class TensorFixedSize : public TensorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) From 85c3389b2845c5bece37dfb155053aef22ea4138 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 00:04:13 -0700 Subject: [PATCH 093/214] Fixed a test --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 3 +++ unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 2 +- unsupported/test/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 2dd8e274b..c5965065e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -384,6 +384,9 @@ static const size_t value = Sizes::count; }; template struct array_size > { static const size_t value = Sizes::count; +}; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { + return get::Base>::value; }; #else template struct array_size > { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index db2027a5f..d43fb286e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -64,7 +64,7 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator(dummy); }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 2b5395013..49a8013ea 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -100,7 +100,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") -# ei_add_test(cxx11_tensor_assign "-std=c++0x") + ei_add_test(cxx11_tensor_assign "-std=c++0x") # ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") From 7f2c6ed2fa35d7f83f0da83c8564b7bd5b01d232 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 11:45:21 -0700 Subject: [PATCH 094/214] Fixed a compilation warning --- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index c5965065e..3d646c455 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -387,7 +387,7 @@ static const size_t value = Sizes::count; }; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { return get::Base>::value; -}; +} #else template struct array_size > { static const size_t value = Sizes::count; From 2dde63499c4ef836a0d9dfd443494d863ad62b16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 31 Oct 2014 16:33:51 -0700 Subject: [PATCH 095/214] Generalized the matrix vector product code. 
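
The essence of the change: general_matrix_vector_product::run() no longer
takes raw (pointer, stride) pairs; each operand is wrapped in a blas data
mapper that encapsulates the stride and alignment queries. A sketch of the
new call shape, echoing the gemv_selector hunk below:

    typedef internal::const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
    typedef internal::const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
    internal::general_matrix_vector_product
        <Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs,
         RhsScalar, RhsMapper, ConjugateRhs>::run(
            actualLhs.rows(), actualLhs.cols(),
            LhsMapper(actualLhs.data(), actualLhs.outerStride()),
            RhsMapper(actualRhs.data(), actualRhs.innerStride()),
            actualDestPtr, 1, compatibleAlpha);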
--- Eigen/src/Core/GeneralProduct.h | 32 ++- Eigen/src/Core/products/GeneralMatrixVector.h | 246 +++++++++--------- .../Core/products/TriangularMatrixVector.h | 46 ++-- .../Core/products/TriangularSolverVector.h | 24 +- Eigen/src/Core/util/BlasUtil.h | 47 +++- 5 files changed, 228 insertions(+), 167 deletions(-) diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 7179eb124..9d3d5562c 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -11,7 +11,7 @@ #ifndef EIGEN_GENERAL_PRODUCT_H #define EIGEN_GENERAL_PRODUCT_H -namespace Eigen { +namespace Eigen { /** \class GeneralProduct * \ingroup Core_Module @@ -257,7 +257,7 @@ class GeneralProduct : public ProductBase, Lhs, Rhs> { template struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; - + public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) @@ -266,7 +266,7 @@ class GeneralProduct EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } - + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; @@ -277,12 +277,12 @@ class GeneralProduct dst.const_cast_derived() += m_scale * src; } }; - + template inline void evalTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, set(), IsRowMajor()); } - + template inline void addTo(Dest& dest) const { internal::outer_product_selector_run(*this, dest, add(), IsRowMajor()); @@ -436,12 +436,12 @@ template<> struct gemv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), evalToDest ? 
dest.data() : static_dest.data()); - + if(!evalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN @@ -457,11 +457,13 @@ template<> struct gemv_selector MappedDest(actualDestPtr, dest.size()) = dest; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhs.data(), actualRhs.innerStride(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), actualDestPtr, 1, compatibleAlpha); @@ -516,11 +518,13 @@ template<> struct gemv_selector Map(actualRhsPtr, actualRhs.size()) = actualRhs; } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; general_matrix_vector_product - ::run( + ::run( actualLhs.rows(), actualLhs.cols(), - actualLhs.data(), actualLhs.outerStride(), - actualRhsPtr, 1, + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), dest.data(), dest.innerStride(), actualAlpha); } @@ -594,7 +598,7 @@ MatrixBase::operator*(const MatrixBase &other) const #ifdef EIGEN_DEBUG_PRODUCT internal::product_type::debug(); #endif - + return Product(derived(), other.derived()); } #else diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 340c51394..7dfa48bfb 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -48,17 +48,17 @@ namespace internal { * // we currently fall back to the NoneAligned case * * The same reasoning apply for the transposed case. - * + * * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow * compared to unaligned loads on a 4 byte boundary. 
* */ -template -struct general_matrix_vector_product +template +struct general_matrix_vector_product { -typedef typename scalar_product_traits::ReturnType ResScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; enum { Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable @@ -78,17 +78,17 @@ typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { @@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper cj; conj_helper pcj; @@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -178,20 +182,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -199,10 +203,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -211,11 +215,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload(&lhs0[j]); - A10 = 
pload(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -254,12 +258,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j(alignedStart)) for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); else for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load(i), ptmp0, pload(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i -struct general_matrix_vector_product +template +struct general_matrix_vector_product { typedef typename scalar_product_traits::ReturnType ResScalar; @@ -346,67 +350,69 @@ typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); - + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } conj_helper cj; conj_helper pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, 
FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (lhsAlignmentOffset < 0) || (rhsAlignmentOffset < 0) ) { alignedSize = 0; alignedStart = 0; @@ -418,7 +424,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -447,8 +453,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_productalignedStart) @@ -481,11 +487,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product(&lhs1[alignedStart-1]); - A02 = pload(&lhs2[alignedStart-2]); - A03 = pload(&lhs3[alignedStart-3]); + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); for (; j(&rhs[j]); - A11 = pload(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = 
pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; jalignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned(alignedStart)) for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); else for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); - + typedef Map, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + for (Index pi=0; pi0) { Index s = IsLower ? 
0 : pi + actualPanelWidth; - general_matrix_vector_product::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product::run( + general_matrix_vector_product::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -184,7 +190,7 @@ struct TriangularProduct template void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); - + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); } }; @@ -211,7 +217,7 @@ struct TriangularProduct namespace internal { // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. - + template<> struct trmv_selector { template @@ -247,7 +253,7 @@ template<> struct trmv_selector bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; - + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), @@ -267,7 +273,7 @@ template<> struct trmv_selector else MappedDest(actualDestPtr, dest.size()) = dest; } - + internal::triangular_matrix_vector_product struct trmv_selector #endif Map(actualRhsPtr, actualRhs.size()) = actualRhs; } - + internal::triangular_matrix_vector_product ::run(size, _lhs, lhsStride, rhs); } }; - + // forward and backward substitution, row-major, rhs is a vector template struct triangular_solve_vector @@ -37,6 +37,10 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typename internal::conditional< Conjugate, const CwiseUnaryOp,LhsMap>, @@ -58,10 +62,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( actualPanelWidth, r, - &lhs.coeffRef(startRow,startCol), lhsStride, - rhs + startCol, 1, + LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride), + RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1)); } @@ -72,7 +76,7 @@ struct triangular_solve_vector0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } @@ -91,6 +95,8 @@ struct triangular_solve_vector, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; typename internal::conditional,LhsMap>, const LhsMap& @@ -122,10 +128,10 @@ struct triangular_solve_vector::run( + general_matrix_vector_product::run( r, actualPanelWidth, - &lhs.coeffRef(endBlock,startBlock), lhsStride, - rhs+startBlock, 1, + LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), + RhsMapper(rhs+startBlock, 1), rhs+endBlock, 1, RhsScalar(-1)); } } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 25a62d528..c4881b8da 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -34,7 +34,9 @@ 
template< int ResStorageOrder> struct general_matrix_matrix_product; -template +template struct general_matrix_vector_product; @@ -118,13 +120,35 @@ template struct get_factor::R }; +template +class BlasVectorMapper { + public: + EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + + EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template + EIGEN_ALWAYS_INLINE Packet load(Index i) const { + return ploadt(m_data + i); + } + + template + bool aligned(Index i) const { + return (size_t(m_data+i)%sizeof(Packet))==0; + } + + protected: + Scalar* m_data; +}; + template -class MatrixLinearMapper { +class BlasLinearMapper { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_ALWAYS_INLINE MatrixLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -157,7 +181,8 @@ class blas_data_mapper { typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - typedef MatrixLinearMapper LinearMapper; + typedef BlasLinearMapper LinearMapper; + typedef BlasVectorMapper VectorMapper; EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} @@ -170,6 +195,11 @@ class blas_data_mapper { return LinearMapper(&operator()(i, j)); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; @@ -193,6 +223,15 @@ class blas_data_mapper { return pgather(&operator()(i, j), m_stride); } + const Index stride() const { return m_stride; } + + Index firstAligned(Index size) const { + if (size_t(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_aligned(m_data, size); + } + protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; From b1789c112b5cf8d478a03786c6c1243320aefd47 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 3 Nov 2014 08:51:33 -0800 Subject: [PATCH 096/214] Improved handling of 1d tensors --- .../CXX11/src/Tensor/TensorContraction.h | 98 +++++++++++++++++-- .../src/Tensor/TensorContractionThreadPool.h | 12 ++- 2 files changed, 99 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c530b27a7..8e898619d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { @@ -142,6 +142,13 @@ class BaseTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + protected: const Tensor m_tensor; const nocontract_t m_nocontract_strides; @@ -202,6 +209,18 @@ class TensorContractionSubMapper { return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } + template + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), 
YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template + bool aligned(Index /*i*/) const { + return false; + } + private: const ParentMapper& m_base_mapper; const Index m_vert_offset; @@ -220,6 +239,7 @@ class TensorContractionInputMapper public: typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -233,6 +253,10 @@ class TensorContractionInputMapper return SubMapper(*this, i, j); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; @@ -306,6 +330,7 @@ class TensorContractionInputMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -319,6 +344,10 @@ class TensorContractionInputMapper::type Packet; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { @@ -592,41 +621,80 @@ struct TensorContractionEvaluatorBase if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } } + template + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + 
buffer, resIncr, alpha); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -707,7 +775,17 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index cf1352a31..f0e9bb616 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,7 +93,17 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; From 9ea09179b5394fdd4af3a8450cdb60d72b232327 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 4 Nov 2014 10:24:42 -0800 Subject: [PATCH 097/214] Fixed the return type of the coefficient-wise tensor operations. --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 131326615..f7c784942 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -45,7 +45,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* dest) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { if (dest) { m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); return false; @@ -108,7 +108,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -161,7 +161,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { return true; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -175,7 +175,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: const NullaryOp m_functor; @@ -228,7 +228,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { 
return NULL; } private: const UnaryOp m_functor; @@ -253,7 +253,9 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -313,7 +315,10 @@ struct TensorEvaluator : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) - { } + { + eigen_assert(internal::dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(internal::dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -327,7 +332,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -356,7 +361,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; From 9a06a716277029ffa152049be8fd53aee1e1bc13 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 5 Nov 2014 07:49:51 -0800 Subject: [PATCH 098/214] Fixed a test --- unsupported/test/CMakeLists.txt | 2 +- unsupported/test/cxx11_tensor_dimension.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 49a8013ea..e83c10dc4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -101,7 +101,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_simple "-std=c++0x") ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") -# ei_add_test(cxx11_tensor_dimension "-std=c++0x") + ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index fc0d29c50..c806b623f 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -16,7 +16,7 @@ using Eigen::Tensor; static void test_dynamic_size() { - Eigen::DSizes dimensions(Eigen::array(2,3,7)); + Eigen::DSizes dimensions(Eigen::array{{2,3,7}}); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); @@ -37,7 +37,7 @@ static void test_fixed_size() static void test_match() { - Eigen::DSizes dyn(Eigen::array(2,3,7)); + Eigen::DSizes dyn(Eigen::array{{2,3,7}}); Eigen::Sizes<2,3,7> stat; VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true); } From cb37f818ca6e8dfc9d81343882401e3671531d1b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 5 Nov 2014 23:25:11 -0800 Subject: [PATCH 099/214] Fixed a compilation error triggered by some operations on fixed sized tensors --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 12 ++++-------- unsupported/test/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 3d646c455..6d9e09318 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -40,10 +40,6 @@ template struct IndexPair { // Boilerplate code namespace internal { -template struct dget { - static const std::size_t value = get::value; -}; - template struct fixed_size_tensor_index_linearization_helper @@ -53,7 +49,7 @@ struct fixed_size_tensor_index_linearization_helper const Dimensions& dimensions) { return array_get(indices) + - dget::value * + get::value * fixed_size_tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -125,7 +121,7 @@ struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes { +template struct Sizes : typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; static const size_t count = Base::count; static const std::size_t total_size = internal::arg_prod::value; @@ -164,11 +160,11 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *this); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e83c10dc4..6b8ed2826 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -107,7 +107,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_convolution "-std=c++0x") ei_add_test(cxx11_tensor_expr "-std=c++0x") ei_add_test(cxx11_tensor_forced_eval "-std=c++0x") -# ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") + ei_add_test(cxx11_tensor_fixed_size "-std=c++0x") ei_add_test(cxx11_tensor_const "-std=c++0x") ei_add_test(cxx11_tensor_of_const_values "-std=c++0x") ei_add_test(cxx11_tensor_of_complex "-std=c++0x") From c2d1074932ae92a001eadb27e9f85eaf2de187b9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Nov 2014 22:25:38 -0800 Subject: [PATCH 100/214] Added support for static list of indices --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 264 ++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_index_list.cpp | 133 +++++++++ 4 files changed, 399 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h create mode 100644 unsupported/test/cxx11_tensor_index_list.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c36db96ec..44d5a4d82 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -43,6 +43,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include 
"unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 000000000..010221e74 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,264 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + +#if __cplusplus > 199711L + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +template +struct type2index { + static const DenseIndex value = n; + constexpr operator DenseIndex() const { return n; } + void set(DenseIndex val) { + eigen_assert(val == n); + } +}; + +namespace internal { +template +void update_value(T& val, DenseIndex new_val) { + val = new_val; +} +template +void update_value(type2index& val, DenseIndex new_val) { + val.set(new_val); +} + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + +template +struct tuple_coeff { + template + static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { + return std::get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + } + template + static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + if (i == Idx) { + update_value(std::get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + return ((i == Idx) & is_compile_time_constant >::type>::value) || + tuple_coeff::value_known_statically(i, t); + } +}; + +template <> +struct tuple_coeff<0> { + template + static constexpr DenseIndex get(const DenseIndex i, const std::tuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return std::get<0>(t) * (i == 0); + } + template + static void set(const DenseIndex i, std::tuple& t, const DenseIndex value) { + eigen_assert (i == 0); + update_value(std::get<0>(t), value); + } + template + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + // eigen_assert (i == 0); // gcc fails to compile 
assertions in constexpr + return is_compile_time_constant >::type>::value & (i == 0); + } +}; +} // namespace internal + + +template +struct IndexList : std::tuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff >::value-1>::set(i, *this, value); + } + + constexpr IndexList(const std::tuple& other) : std::tuple(other) { } + constexpr IndexList() : std::tuple() { } + + constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); + } +}; + + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return std::make_tuple(val1, other_vals...); +} + + +namespace internal { + +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; + +template constexpr DenseIndex array_get(IndexList& a) { + return std::get(a); +} +template constexpr DenseIndex array_get(const IndexList& a) { + return std::get(a); +} + +template +struct index_known_statically { + constexpr bool operator() (DenseIndex) const { + return false; + } +}; + +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically > { + constexpr bool operator() (const DenseIndex i) const { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_statically_eq { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] == value; + } +}; + +template +struct index_statically_eq > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] == value; + } +}; + +template +struct index_statically_ne { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] != value; + } +}; + +template +struct index_statically_ne > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] != value; + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +// No C++11 support +template +struct index_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_eq { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_ne { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git 
a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 6b8ed2826..181f06fc7 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -102,6 +102,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") ei_add_test(cxx11_tensor_dimension "-std=c++0x") + ei_add_test(cxx11_tensor_index_list "-std=c++0x") ei_add_test(cxx11_tensor_comparisons "-std=c++0x") ei_add_test(cxx11_tensor_contraction "-std=c++0x") ei_add_test(cxx11_tensor_convolution "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp new file mode 100644 index 000000000..6a103cab1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -0,0 +1,133 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + + +static void test_static_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + constexpr auto reduction_axis = make_index_list(0, 1, 2); + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 2); + + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + + Tensor result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + + +static void test_dynamic_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + int dim1 = 2; + int dim2 = 1; + int dim3 = 0; + + auto reduction_axis = make_index_list(dim1, dim2, dim3); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 0); + + Tensor result = tensor.sum(reduction_axis); + for (int i = 0; i < result.size(); ++i) { + float expected = 0.0f; + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 5; ++l) { + expected += tensor(j,k,l,i); + } + } + } + VERIFY_IS_APPROX(result(i), expected); + } +} + +static void test_mixed_index_list() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + int dim2 = 1; + int dim4 = 3; + + auto reduction_axis = make_index_list(0, dim2, 2, dim4); + + VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); + VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); + VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); + 
VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); + VERIFY_IS_EQUAL(static_cast(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast(reduction_axis[3]), 3); + + typedef IndexList, int, type2index<2>, int> ReductionIndices; + ReductionIndices reduction_indices; + reduction_indices.set(1, 1); + reduction_indices.set(3, 3); + EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_known_statically()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + + Tensor result1 = tensor.sum(reduction_axis); + Tensor result2 = tensor.sum(reduction_indices); + + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + expected += tensor(i,j,k,l); + } + } + } + } + VERIFY_IS_APPROX(result1(0), expected); + VERIFY_IS_APPROX(result2(0), expected); +} + + +void test_cxx11_tensor_index_list() +{ + CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_dynamic_index_list()); + CALL_SUBTEST(test_mixed_index_list()); +} From eeabf7975e59b47f4e3677c340013ebbfcfbc2bd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 12 Nov 2014 22:35:44 -0800 Subject: [PATCH 101/214] Optimized broadcasting --- .../CXX11/src/Tensor/TensorBroadcasting.h | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 2bd158dac..a77903dca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -24,11 +24,13 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; }; template @@ -85,6 +87,7 @@ struct TensorEvaluator, Device> static const int NumDims = internal::array_size::Dimensions>::value; typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; enum { IsAligned = false, @@ -129,10 +132,19 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } index -= idx * m_outputStrides[i]; } - inputIndex += (index % 
m_impl.dimensions()[0]); + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } return m_impl.coeff(inputIndex); } @@ -150,10 +162,20 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } index -= idx * m_outputStrides[i]; } - const Index innermostLoc = index % m_impl.dimensions()[0]; + Index innermostLoc; + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } inputIndex += innermostLoc; // Todo: this could be extended to the second dimension if we're not From ec785b0180f6cf9355b89d85c53fa18acf83e8a6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 13 Nov 2014 09:28:54 -0800 Subject: [PATCH 102/214] Added support for extraction of patches from images --- unsupported/Eigen/CXX11/Tensor | 1 + .../Eigen/CXX11/src/Tensor/TensorBase.h | 13 + .../src/Tensor/TensorForwardDeclarations.h | 1 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 291 ++++++++++++++++++ unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_image_patch.cpp | 280 +++++++++++++++++ 6 files changed, 587 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h create mode 100644 unsupported/test/cxx11_tensor_image_patch.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 44d5a4d82..aa26e5283 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -59,6 +59,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 6018ecc66..f451a3c99 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -255,6 +255,19 @@ class TensorBase return TensorPatchOp(derived(), patch_dims); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches() const { + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride = 1, const Index col_stride = 1) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride); + } + // Morphing operators. 
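+ // A minimal usage sketch for the two extract_image_patches() overloads
+ // declared above (the tensor name and sizes here are hypothetical, not part
+ // of the original patch). The result has one more dimension than the input,
+ // laid out as (channels, patch_rows, patch_cols, patches, batch):
+ //
+ //   Eigen::Tensor<float, 4> input(3, 128, 128, 16); // depth, rows, cols, batch
+ //   input.setRandom();
+ //   // Patch size fixed at compile time, strides of 1:
+ //   Eigen::Tensor<float, 5> p1 = input.extract_image_patches<11, 11>();
+ //   // Patch size and strides chosen at runtime:
+ //   Eigen::Tensor<float, 5> p2 = input.extract_image_patches(11, 11, 2, 2);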
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a72e11215..85599ccfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -27,6 +27,7 @@ template class Tenso template class TensorContractionOp; template class TensorConvolutionOp; template class TensorPatchOp; +template class TensorImagePatchOp; template class TensorBroadcastingOp; template class TensorChippingOp; template class TensorReshapingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 000000000..ce916fdfd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has at least 3 dimensions ordered as follows: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing) after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions.
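+ * For instance (the sizes below are illustrative, not part of the original
+ * patch): extracting 11x11 patches from a 3x128x128 input yields a result of
+ * dimensions 3 (channels) x 11 x 11 x 128*128 (one entry per patch), with
+ * zero padding applied at the image borders:
+ * \code
+ * Eigen::Tensor<float, 3> input(3, 128, 128);
+ * input.setRandom();
+ * Eigen::Tensor<float, 4> patches = input.extract_image_patches<11, 11>();
+ * \endcode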
+ */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorImagePatchOp type; +}; + +} // end namespace internal + + + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides){} + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = ceilf(static_cast(input_dims[1]) / op.row_strides()) * + ceilf(static_cast(input_dims[2]) / op.col_strides()); + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_rowInputStride = input_dims[0] * op.row_strides(); + m_colInputStride = input_dims[0] * input_dims[1] * op.col_strides(); + m_patchInputStride = input_dims[0] * input_dims[1] * 
input_dims[2]; + + m_rowPaddingTop = op.patch_rows() / 2; + m_colPaddingLeft = op.patch_cols() / 2; + + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInputRows = internal::TensorIntDivisor(m_inputRows); + m_fastDimZero = internal::TensorIntDivisor(m_dimensions[0]); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Find the location of the first element of the patch. + const Index patchIndex = index / m_fastPatchStride; + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + + const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colOffset = patchOffset / m_fastColStride; + + const Index inputCol = colIndex + colOffset - m_colPaddingLeft; + if (inputCol < 0 || inputCol >= m_inputCols) { + return Scalar(0); + } + const Index rowIndex = patch2DIndex - colIndex * m_inputRows; // m_rowStride is always 1 + const Index rowOffset = patchOffset - colOffset * m_colStride; + + const Index inputRow = rowIndex + rowOffset - m_rowPaddingTop; + if (inputRow < 0 || inputRow >= m_inputRows) { + return Scalar(0); + } + + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + + const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const Index packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index indices[2] = {index, index + packetSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero, + (indices[1] - patchIndex * m_patchStride) / m_fastDimZero}; + + const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + const Index inputCols[2] = {colIndex + colOffsets[0] - m_colPaddingLeft, colIndex + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_inputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + const Index inputRows[2] = {rowIndex + rowOffsets[0] - m_rowPaddingTop, rowIndex + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + // all zeros + return internal::pset1(Scalar(0)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const Index depth = index - (index / m_fastDimZero) * m_dimensions[0]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + Scalar* data() const { return NULL; } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputRows; + Index m_inputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastInputRows; + internal::TensorIntDivisor m_fastDimZero; + + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 181f06fc7..89c651804 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,6 +122,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_morphing "-std=c++0x") ei_add_test(cxx11_tensor_padding "-std=c++0x") ei_add_test(cxx11_tensor_patch "-std=c++0x") + ei_add_test(cxx11_tensor_image_patch "-std=c++0x") ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp new file mode 100644 index 000000000..55d35eac0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -0,0 +1,280 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_patch() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + Tensor single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected = tensor(d, r-1+i, c-2+j, b); + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 7); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + for (int b = 0; b < 7; ++b) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { + expected = tensor(d, r-1+i, c-1+j, b); + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + + +static void test_patch_no_extra_dim() +{ + Tensor tensor(2,3,5); + tensor.setRandom(); + + Tensor single_pixel_patch; + single_pixel_patch = tensor.extract_image_patches<1, 1>(); + + VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1); + VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); + } + + Tensor entire_image_patch; + entire_image_patch = tensor.extract_image_patches<3, 5>(); + + VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 3; ++r) { + for (int c = 0; c < 5; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { 
+ expected = tensor(d, r-1+i, c-2+j); + } + VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); + } + } + } + } + } + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + int patchId = i+3*j; + for (int r = 0; r < 2; ++r) { + for (int c = 0; c < 2; ++c) { + for (int d = 0; d < 2; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { + expected = tensor(d, r-1+i, c-1+j); + } + VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); + } + } + } + } + } +} + + +static void test_imagenet_patches() +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + Tensor l_in(3, 128, 128, 128); + l_in.setRandom(); + Tensor l_out = l_in.extract_image_patches(11, 11); + VERIFY_IS_EQUAL(l_out.dimension(0), 3); + VERIFY_IS_EQUAL(l_out.dimension(1), 11); + VERIFY_IS_EQUAL(l_out.dimension(2), 11); + VERIFY_IS_EQUAL(l_out.dimension(3), 128*128); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 128; ++j) { + int patchId = i+128*j; + for (int c = 0; c < 11; ++c) { + for (int r = 0; r < 11; ++r) { + for (int d = 0; d < 3; ++d) { + float expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in(d, r-5+i, c-5+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(64, 64, 64, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(9, 9); + VERIFY_IS_EQUAL(l_out.dimension(0), 64); + VERIFY_IS_EQUAL(l_out.dimension(1), 9); + VERIFY_IS_EQUAL(l_out.dimension(2), 9); + VERIFY_IS_EQUAL(l_out.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 64; ++j) { + int patchId = i+64*j; + for (int c = 0; c < 9; ++c) { + for (int r = 0; r < 9; ++r) { + for (int d = 0; d < 64; ++d) { + float expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in(d, r-4+i, c-4+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(128, 16, 16, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(7, 7); + VERIFY_IS_EQUAL(l_out.dimension(0), 128); + VERIFY_IS_EQUAL(l_out.dimension(1), 7); + VERIFY_IS_EQUAL(l_out.dimension(2), 7); + VERIFY_IS_EQUAL(l_out.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for (int b = 0; b < 128; ++b) { + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 16; ++j) { + int patchId = i+16*j; + for (int c = 0; c < 7; ++c) { + for (int r = 0; r < 7; ++r) { + for (int d = 0; d < 128; ++d) { + float expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in(d, r-3+i, c-3+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } + + l_in.resize(384, 13, 13, 128); + l_in.setRandom(); + l_out = l_in.extract_image_patches(3, 3); + VERIFY_IS_EQUAL(l_out.dimension(0), 384); + VERIFY_IS_EQUAL(l_out.dimension(1), 3); + VERIFY_IS_EQUAL(l_out.dimension(2), 3); + VERIFY_IS_EQUAL(l_out.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out.dimension(4), 128); + for 
(int b = 0; b < 128; ++b) { + for (int i = 0; i < 13; ++i) { + for (int j = 0; j < 13; ++j) { + int patchId = i+13*j; + for (int c = 0; c < 3; ++c) { + for (int r = 0; r < 3; ++r) { + for (int d = 0; d < 384; ++d) { + float expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in(d, r-1+i, c-1+j, b); + } + VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + + +void test_cxx11_tensor_image_patch() +{ + CALL_SUBTEST(test_simple_patch()); + CALL_SUBTEST(test_patch_no_extra_dim()); + CALL_SUBTEST(test_imagenet_patches()); +} From 1d3c8306f87b284c26180be6eac13dc8d4aa1b52 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 13 Nov 2014 19:13:17 -0800 Subject: [PATCH 103/214] Fixed compilation errors with clang. --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 10 +++++----- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a77903dca..8cb41aec8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -30,7 +30,6 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8e898619d..c5ec42cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -848,8 +848,8 @@ struct TensorEvaluator(this->m_device.allocate(sizeA * sizeof(LhsScalar))); RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index ce916fdfd..0dfb6913b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -37,7 +37,6 @@ struct traits > : public traits typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 010221e74..eaf0195ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -110,7 +110,7 @@ struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } @@ -190,7 +190,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -198,7 +198,7 @@ template struct
index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -213,7 +213,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -221,7 +221,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; From b33cf92878a57ec86d5e5715e7cde3a0cd360fd6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 18 Nov 2014 14:32:41 -0800 Subject: [PATCH 104/214] Fixed the evaluation of expressions involving tensors of 2 or 3 elements on CUDA devices. --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 4fa8e83ef..f27f643c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -168,11 +168,10 @@ __launch_bounds__(1024) const Index PacketSize = unpacket_traits::size; const Index vectorized_step_size = step_size * PacketSize; const Index vectorized_size = (size / PacketSize) * PacketSize; - Index i = first_index * PacketSize; - for ( ; i < vectorized_size; i += vectorized_step_size) { + for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for ( ; i < size; i += step_size) { + for (Index i = vectorized_size + first_index; i < size; i += step_size) { eval.evalScalar(i); } } From 509e4ddc02e0d70b8c1ee325f3b18624d4235c1e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 19 Nov 2014 10:34:11 -0800 Subject: [PATCH 105/214] Added reduction packet primitives for CUDA --- Eigen/src/Core/arch/CUDA/PacketMath.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/CUDA/PacketMath.h index 7b481d512..19749c832 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -223,6 +223,27 @@ template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { return a.x; } +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w)); } From 48db34a7b90f07c9abec453f072b4f813a14ea07 Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 4 Dec 2014 01:18:47 -0500 Subject: [PATCH 106/214] Adding missing OPENGL_LIBRARIES for openglsupport test. 
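For context: the predux, predux_max and predux_min overloads added in PATCH 105 above each collapse one packet into a scalar on the device. A minimal sketch, assuming compilation with nvcc; the demo function below is illustrative only and not Eigen code:

    #include <Eigen/Core>

    // Hypothetical device function exercising the new float4 overloads.
    __device__ void packet_reduction_demo() {
      float4 p = make_float4(1.f, 2.f, 3.f, 4.f);
      float sum = Eigen::internal::predux(p);      // 1 + 2 + 3 + 4 = 10
      float mx  = Eigen::internal::predux_max(p);  // pairwise fmaxf -> 4
      float mn  = Eigen::internal::predux_min(p);  // pairwise fminf -> 1
      (void)sum; (void)mx; (void)mn;
    }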
Also adding OpenGL include directories as a better practice even though these are system include directories in most systems. --- unsupported/test/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 97849a25a..94a5cf445 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -76,8 +76,9 @@ if(NOT EIGEN_TEST_NO_OPENGL) find_package(GLUT) find_package(GLEW) if(OPENGL_FOUND AND GLUT_FOUND AND GLEW_FOUND) + include_directories(${OPENGL_INCLUDE_DIR} ${GLUT_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS}) ei_add_property(EIGEN_TESTED_BACKENDS "OpenGL, ") - set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES}) + set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES}) ei_add_test(openglsupport "" "${EIGEN_GL_LIB}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ") From eb3695d2fc75dd97bd0131672b9f160275e5caad Mon Sep 17 00:00:00 2001 From: Abhijit Kundu Date: Thu, 4 Dec 2014 02:57:03 -0500 Subject: [PATCH 107/214] Added cmake uninstall target. This adds a cmake command: make uninstall. Running make uninstall removes the files installed by running make install. --- CMakeLists.txt | 5 +++++ cmake/EigenUninstall.cmake | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 cmake/EigenUninstall.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 05d92babe..f9610a522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -448,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS "make check | Build and run the unit-tests. Read this page:") message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "make uninstall| Removes files installed by make install") message(STATUS "--------------+--------------------------------------------------------------") else() message(STATUS "To build/run the unit tests, read this page:") @@ -483,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake DESTINATION ${EIGEN_CONFIG_CMAKE_PATH} ) + +# Add uninstall target +add_custom_target ( uninstall + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) diff --git a/cmake/EigenUninstall.cmake b/cmake/EigenUninstall.cmake new file mode 100644 index 000000000..4dae8c85c --- /dev/null +++ b/cmake/EigenUninstall.cmake @@ -0,0 +1,40 @@ +################ CMake Uninstall Template ####################### +# CMake Template file for uninstallation of files +# mentioned in 'install_manifest.txt' +# +# Used by uninstall target +################################################################# + +set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt") + +if(EXISTS ${MANIFEST}) + message(STATUS "============== Uninstalling Eigen ===================") + + file(STRINGS ${MANIFEST} files) + foreach(file ${files}) + if(EXISTS ${file}) + message(STATUS "Removing file: '${file}'") + + execute_process( + COMMAND ${CMAKE_COMMAND} -E remove ${file} + OUTPUT_VARIABLE rm_out + RESULT_VARIABLE rm_retval + ) + + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Failed to remove file: '${file}'.") + endif() + else() + message(STATUS "File '${file}' does not exist.") + endif() + endforeach(file) + + message(STATUS "========== Finished Uninstalling Eigen ==============") +else() + message(STATUS "Cannot find install manifest:
'${MANIFEST}'") + message(STATUS "Probably make install has not been performed") + message(STATUS " or install_manifest.txt has been deleted.") +endif() + + + From 80ed5bd90c245655ce0f892f6a679a0278ccbbab Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Dec 2014 12:49:30 +0100 Subject: [PATCH 108/214] Workaround various "returning reference to temporary" warnings. --- Eigen/SparseCore | 7 ------- Eigen/src/Core/CoreEvaluators.h | 4 +++- Eigen/src/Core/util/Constants.h | 3 +++ Eigen/src/SparseCore/CompressedStorage.h | 2 +- Eigen/src/SparseCore/SparseMatrix.h | 4 ++-- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Eigen/SparseCore b/Eigen/SparseCore index b68c8fa8a..d5c0f6271 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -26,13 +26,6 @@ * This module depends on: Core. */ -namespace Eigen { - -/** The type used to identify a general sparse storage. */ -struct Sparse {}; - -} - #include "src/SparseCore/SparseUtil.h" #include "src/SparseCore/SparseMatrixBase.h" #include "src/SparseCore/SparseAssign.h" diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index a0dc72c4d..1c7123b85 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1221,7 +1221,9 @@ struct evaluator > typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; + // FIXME having to check whether ArgType is sparse here i not very nice. + typedef typename internal::conditional::value, + typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const { diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 5c7d70af6..9b40093f0 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -449,6 +449,9 @@ enum Action {GetAction, SetAction}; /** The type used to identify a dense storage. */ struct Dense {}; +/** The type used to identify a general sparse storage. */ +struct Sparse {}; + /** The type used to identify a permutation storage. 
*/ struct PermutationStorage {}; diff --git a/Eigen/src/SparseCore/CompressedStorage.h b/Eigen/src/SparseCore/CompressedStorage.h index 2741f8292..99f741138 100644 --- a/Eigen/src/SparseCore/CompressedStorage.h +++ b/Eigen/src/SparseCore/CompressedStorage.h @@ -143,7 +143,7 @@ class CompressedStorage } /** Like at(), but the search is performed in the range [start,end) */ - inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const + inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const { if (start>=end) return defaultValue; diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 4c79c7dc3..93677c786 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -179,7 +179,7 @@ class SparseMatrix /** \returns the value of the matrix at position \a i, \a j * This function returns Scalar(0) if the element is an explicit \em zero */ - inline const Scalar& coeff(Index row, Index col) const + inline Scalar coeff(Index row, Index col) const { eigen_assert(row>=0 && row=0 && col > operator const SparseMatrixType&() const { return *m_matrix; } typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - CoeffReturnType coeff(Index row, Index col) const + Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); } Scalar& coeffRef(Index row, Index col) From 30c849669d6d29f6e19484478639ec5176c1826a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 14:45:04 +0100 Subject: [PATCH 109/214] Fix dynamic allocation in JacobiSVD (regression) --- Eigen/src/SVD/JacobiSVD.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 0f7e5b8fe..444187ae7 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -628,6 +628,7 @@ template class JacobiSVD internal::qr_preconditioner_impl m_qr_precond_morecols; internal::qr_preconditioner_impl m_qr_precond_morerows; + MatrixType m_scaledMatrix; }; template @@ -674,8 +675,9 @@ void JacobiSVD::allocate(Index rows, Index cols, u : 0); m_workMatrix.resize(m_diagSize, m_diagSize); - if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); - if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this); + if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this); + if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols); } template @@ -698,7 +700,13 @@ JacobiSVD::compute(const MatrixType& matrix, unsig /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ - if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale)) + if(m_rows!=m_cols) + { + m_scaledMatrix = matrix / scale; + m_qr_precond_morecols.run(*this, m_scaledMatrix); + m_qr_precond_morerows.run(*this, m_scaledMatrix); + } + else { m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale; if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows); From 7f7a71206267014f33a175322789977047ced24f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 15:02:25 +0100 Subject: [PATCH 110/214] Optimize Simplicial Cholesky when NaturalOrdering is used.
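For context: the JacobiSVD fix in PATCH 109 above works because m_scaledMatrix = matrix / scale assigns into storage that was sized once in allocate(), so compute() no longer heap-allocates a temporary for the scaled input. A minimal sketch of the pattern, with hypothetical names (SolverSketch is not Eigen code):

    #include <Eigen/Dense>

    // Illustrative only: size the workspace once, then reuse it so that
    // compute() stays allocation-free after allocate() has run.
    class SolverSketch {
      Eigen::MatrixXd m_scaled;  // plays the role of JacobiSVD's m_scaledMatrix
     public:
      void allocate(int rows, int cols) { m_scaled.resize(rows, cols); }
      void compute(const Eigen::MatrixXd& m, double scale) {
        m_scaled = m / scale;  // sizes already match: no fresh heap allocation
        // ... run the decomposition on m_scaled ...
      }
    };

    int main() {
      SolverSketch s;
      s.allocate(4, 3);
      s.compute(Eigen::MatrixXd::Random(4, 3), 2.0);
      return 0;
    }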
--- Eigen/src/SparseCholesky/SimplicialCholesky.h | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 918a34e13..1928670a3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -607,22 +607,35 @@ void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixTy { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); - // Note that amd compute the inverse permutation + // Note that ordering methods compute the inverse permutation + if(!internal::is_same >::value) { - CholMatrixType C; - C = a.template selfadjointView(); + { + CholMatrixType C; + C = a.template selfadjointView(); + + OrderingType ordering; + ordering(C,m_Pinv); + } + + if(m_Pinv.size()>0) m_P = m_Pinv.inverse(); + else m_P.resize(0); - OrderingType ordering; - ordering(C,m_Pinv); + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); } - - if(m_Pinv.size()>0) - m_P = m_Pinv.inverse(); else + { + m_Pinv.resize(0); m_P.resize(0); - - ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + if(UpLo==Lower) + { + ap.resize(size,size); + ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + } + else + ap = a.template triangularView(); + } } } // end namespace Eigen From bea36925dbdd753c861d650623ae2692bb9de812 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:26:53 +0100 Subject: [PATCH 111/214] bug #876: implement a portable log1p function --- Eigen/src/Core/MathFunctions.h | 85 +++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 4c5fc1cae..72d6acfc1 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -14,7 +14,7 @@ namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: // This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too. 
-#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500 +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 long abs(long x) { return (labs(x)); } double abs(double x) { return (fabs(x)); } float abs(float x) { return (fabsf(x)); } @@ -360,50 +360,31 @@ inline NewType cast(const OldType& x) } /**************************************************************************** -* Implementation of atanh2 * +* Implementation of log1p * ****************************************************************************/ template -struct atanh2_impl +struct log1p_impl { - static inline Scalar run(const Scalar& x, const Scalar& r) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if (__cplusplus >= 201103L) && !defined(__CYGWIN__) + // Let's be conservative and enable the default C++11 implementation only if we are sure it exists + #if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC) using std::log1p; - return log1p(2 * x / (r - x)) / 2; + return log1p(x); #else - using std::abs; + typedef typename NumTraits::Real RealScalar; using std::log; - using std::sqrt; - Scalar z = x / r; - if (r == 0 || abs(z) > sqrt(NumTraits::epsilon())) - return log((r + x) / (r - x)) / 2; - else - return z + z*z*z / 3; + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); #endif } }; -template -struct atanh2_impl > -{ - typedef std::complex Scalar; - static inline Scalar run(const Scalar& x, const Scalar& r) - { - using std::log; - using std::norm; - using std::sqrt; - Scalar z = x / r; - if (r == Scalar(0) || norm(z) > NumTraits::epsilon()) - return RealScalar(0.5) * log((r + x) / (r - x)); - else - return z + z*z*z / RealScalar(3); - } -}; - template -struct atanh2_retval +struct log1p_retval { typedef Scalar type; }; @@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& template EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } template @@ -727,6 +708,44 @@ inline int log2(int x) } // end namespace numext + +namespace internal { + +/**************************************************************************** +* Implementation of atanh2 * +****************************************************************************/ + +template +struct atanh2_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& r) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); + } +};
+
+template
+struct atanh2_retval
+{
+  typedef Scalar type;
+};
+
+
+} // end namespace internal
+
+namespace numext {
+
+template
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+{
+  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+}
+
+} // end namespace numext
+
 namespace internal { /****************************************************************************
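[Editor's note: the non-C++11 fallback branch of log1p_impl above uses a classic compensated algorithm: with u = 1+x computed in floating point, x * log(u)/(u-1) cancels the rounding error committed when forming u. A standalone sketch, not part of the patch, contrasting it with the naive expression for a tiny argument:

    #include <cmath>
    #include <cstdio>

    // Same algorithm as the fallback branch above.
    static double log1p_fallback(double x) {
      double u = 1.0 + x;                 // rounding here may lose x completely
      return (u == 1.0) ? x : x * (std::log(u) / (u - 1.0));
    }

    int main() {
      double x = 1e-17;                   // 1 + x rounds to exactly 1.0 in double
      std::printf("naive log(1+x): %.17g\n", std::log(1.0 + x));  // prints 0
      std::printf("fallback      : %.17g\n", log1p_fallback(x));  // ~1e-17, correct
      return 0;
    }
]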
From 7f7a71206267014f33a175322789977047ced24f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:28:06 +0100 Subject: [PATCH 112/214] bug #876, matrix_log_compute_2x2: directly use log1p instead of atanh2 --- .../Eigen/src/MatrixFunctions/MatrixLogarithm.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 42b60b9b1..22bfdc4ac 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -53,15 +53,20 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) result(1,0) = Scalar(0); result(1,1) = logA11; - if (A(0,0) == A(1,1)) { + Scalar y = A(1,1) - A(0,0); + if (y==Scalar(0)) + { result(0,1) = A(0,1) / A(0,0); - } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) { - result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0)); - } else { + } + else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) + { + result(0,1) = A(0,1) * (logA11 - logA00) / y; + } + else + { // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) int unwindingNumber = static_cast(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI))); - Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0); - result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y; + result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*M_PI*unwindingNumber)) / y; } }
From 5fc4ce64492a0c791691204dbb465570d7b4a70d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:44:05 +0100 Subject: [PATCH 113/214] bug #876: remove usage of atanh2 in matrix power --- unsupported/Eigen/src/MatrixFunctions/MatrixPower.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index ee665c18e..1e5a59c55 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -299,7 +299,7 @@ MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar logCurr = log(curr); ComplexScalar logPrev = log(prev); int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI)); - ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber); + ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, M_PI*unwindingNumber); return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev); } @@ -311,7 +311,7 @@ MatrixPowerAtomic::computeSuperDiag(RealScalar curr, RealScalar prev using std::log; using std::sinh; - RealScalar w = numext::atanh2(curr - prev, curr + prev); + RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2); return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev); }
From 437191186165005e0b619c069239ac2913fd2c41 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 16:44:34 +0100 Subject: [PATCH 114/214] Remove useless and non-standard numext::atanh2 function.
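[Editor's note: patches 112-114 all rest on the same identity: atanh2(x, r) = log((r+x)/(r-x))/2, so with x = curr-prev and r = curr+prev it equals log(curr/prev)/2 = log1p((curr-prev)/prev)/2, which is exactly the replacement expression used in MatrixPower above. A standalone numerical check, not Eigen code:

    #include <cmath>
    #include <cstdio>

    int main() {
      double curr = 1.5, prev = 1.2;
      double x = curr - prev, r = curr + prev;
      double via_atanh2 = 0.5 * std::log((r + x) / (r - x));      // removed helper
      double via_log1p  = 0.5 * std::log1p((curr - prev) / prev); // its replacement
      std::printf("%.17g\n%.17g\n", via_atanh2, via_log1p);       // same value
      return 0;
    }

The log1p form is also better conditioned when curr and prev nearly coincide, which is precisely the regime these superdiagonal formulas are evaluated in.]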
--- Eigen/src/Core/MathFunctions.h | 38 ---------------------------------- 1 file changed, 38 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 72d6acfc1..16ad2dc7e 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -708,44 +708,6 @@ inline int log2(int x) } // end namespace numext - -namespace internal { - -/**************************************************************************** -* Implementation of atanh2 * -****************************************************************************/ - -template -struct atanh2_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& r) - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - return numext::log1p(RealScalar(2) * x / (r - x)) / RealScalar(2); - } -}; - -template -struct atanh2_retval -{ - typedef Scalar type; -}; - - -} // end namespace internal - -namespace numext { - -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y) -{ - return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y); -} - -} // end namespace numext - namespace internal { /****************************************************************************
From a910a7466e143502439606572367849cb09ff5bf Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:55:31 +0100 Subject: [PATCH 115/214] Fix inner iterator type --- Eigen/src/SparseCholesky/SimplicialCholesky_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 7aaf702be..b7fd62faa 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -126,7 +126,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& Index top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L - for(typename MatrixType::InnerIterator it(ap,k); it; ++it) + for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it) { Index i = it.index(); if(i <= k)
From 41a20994cce4d7e2c49bbb958a43c9ed69473f7f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 8 Dec 2014 17:56:33 +0100 Subject: [PATCH 116/214] In simplicial cholesky: avoid deep copy of the input matrix when it can be used readily
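[Editor's note: the core of this patch is a small compile-time dispatch, shown in the hunks below: when the user's matrix type already equals the factorization's internal storage type, point at it directly; otherwise convert into a temporary. A reduced standalone model with illustrative names rather than Eigen's (the real code keys the choice on internal::is_same):

    #include <iostream>

    // Primary template: types differ, so convert into the caller-provided
    // temporary and point at that.
    template <typename Input, typename Stored>
    struct grab_input {
      static void run(const Input& in, const Stored*& p, Stored& tmp) {
        tmp = static_cast<Stored>(in);
        p = &tmp;
      }
    };

    // Specialization: same type, alias the input and skip the copy.
    template <typename T>
    struct grab_input<T, T> {
      static void run(const T& in, const T*& p, T& /*tmp*/) { p = &in; }
    };

    int main() {
      int i = 42; double d = 3.14, tmp; const double* p;
      grab_input<double, double>::run(d, p, tmp);
      std::cout << (p == &d) << "\n";    // 1: aliased, no copy
      grab_input<int, double>::run(i, p, tmp);
      std::cout << (p == &tmp) << "\n";  // 1: converted into the temporary
      return 0;
    }
]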
--- Eigen/src/SparseCholesky/SimplicialCholesky.h | 68 +++++++++++++++---- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 1928670a3..22325d7f4 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -17,6 +17,27 @@ enum SimplicialCholeskyMode { SimplicialCholeskyLDLT }; +namespace internal { + template + struct simplicial_cholesky_grab_input { + typedef CholMatrixType const * ConstCholMatrixPtr; + static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp) + { + tmp = input; + pmat = &tmp; + } + }; + + template + struct simplicial_cholesky_grab_input { + typedef MatrixType const * ConstMatrixPtr; + static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/) + { + pmat = &input; + } + }; +} // end namespace internal + /** \ingroup SparseCholesky_Module * \brief A direct sparse Cholesky factorizations * @@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase typedef typename MatrixType::RealScalar RealScalar; typedef typename MatrixType::Index Index; typedef SparseMatrix CholMatrixType; + typedef CholMatrixType const * ConstCholMatrixPtr; typedef Matrix VectorType; public: @@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(matrix.rows()==matrix.cols()); Index size = matrix.cols(); - CholMatrixType ap(size,size); - ordering(matrix, ap); - analyzePattern_preordered(ap, DoLDLT); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(matrix, pmat, tmp); + analyzePattern_preordered(*pmat, DoLDLT); + factorize_preordered(*pmat); } template @@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); - factorize_preordered(ap); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + + if(m_P.size()==0 && (UpLo&Upper)==Upper) + { + // If there is no ordering, try to directly use the input matrix without any copy + internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); + } + else + { + tmp.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + pmat = &tmp; + } + + factorize_preordered(*pmat); } template @@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase { eigen_assert(a.rows()==a.cols()); int size = a.cols(); - CholMatrixType ap(size,size); - ordering(a, ap); - analyzePattern_preordered(ap,doLDLT); + CholMatrixType tmp(size,size); + ConstCholMatrixPtr pmat; + ordering(a, pmat, tmp); + analyzePattern_preordered(*pmat,doLDLT); } void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT); - void ordering(const MatrixType& a, CholMatrixType& ap); + void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap); /** keeps off-diagonal entries; drops diagonal entries */ struct keep_diag { @@ -603,10 +639,11 @@ public: }; template -void SimplicialCholeskyBase::ordering(const MatrixType& a, CholMatrixType& ap) +void SimplicialCholeskyBase::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap) { eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); + pmat = &ap; // Note that ordering methods compute the inverse permutation if(!internal::is_same >::value) { @@ -628,13 +665,14 @@ { m_Pinv.resize(0); m_P.resize(0); - if(UpLo==Lower) + if(UpLo==Lower || MatrixType::IsRowMajor) { + // we have to transpose the lower part to the upper one ap.resize(size,size); - ap.template selfadjointView() = a.template selfadjointView().twistedBy(m_P); + ap.template selfadjointView() = a.template selfadjointView(); } else - ap = a.template triangularView(); + internal::simplicial_cholesky_grab_input::run(a, pmat, ap); } }
From 0efaff9b3b261daaa91baf8935ec7c4f5156a647 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 11 Dec 2014 16:15:20 +0100 Subject: [PATCH 117/214] Fix out-of-bounds write --- Eigen/src/SparseCore/AmbiVector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 5c9c3101e..76ef25f7d 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -69,7 +69,7 @@ class AmbiVector delete[] m_buffer; if (size<1000) { - Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar); + Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar); m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl); m_buffer = new Scalar[allocSize]; } @@ -88,7 +88,7 @@ class AmbiVector Index copyElements = m_allocatedElements; m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size); Index allocSize = m_allocatedElements * sizeof(ListEl); - allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0); + allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer;
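[Editor's note: both hunks in the patch above replace a truncating bytes-to-elements conversion with the usual round-up idiom (bytes + sizeof(Scalar) - 1) / sizeof(Scalar). Truncation can return one slot too few whenever the byte count is not a multiple of sizeof(Scalar), so copying the full ListEl payload can then run past the buffer. A standalone illustration with placeholder sizes, not Eigen code:

    #include <cstdio>

    int main() {
      const int bytes = 10, slot = 4;  // stand-ins for a ListEl payload and sizeof(Scalar)
      int truncated = bytes / slot;               // 2 slots = 8 bytes: too small
      int rounded   = (bytes + slot - 1) / slot;  // 3 slots = 12 bytes: covers all 10
      std::printf("truncated: %d slots (%d bytes)\n", truncated, truncated * slot);
      std::printf("rounded  : %d slots (%d bytes)\n", rounded, rounded * slot);
      return 0;
    }
]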
From 80cae358b000c87bba77414cdb36ddf55eced896 Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Mon, 24 Nov 2014 10:56:30 -0800 Subject: [PATCH 118/214] Adds a modified f2c-generated C implementation for BLAS. This adds an optional implementation for the BLAS library that does not require the use of a FORTRAN compiler. It can be enabled with EIGEN_USE_F2C_BLAS. The C implementation uses the standard gfortran calling convention and does not require the use of -ff2c when compiled with gfortran. --- blas/CMakeLists.txt | 38 +- blas/f2c/chbmv.c | 487 +++++++++++++++++++++++ blas/f2c/chpmv.c | 438 +++++++++++++++++++++ blas/f2c/complexdots.c | 84 ++++ blas/f2c/ctbmv.c | 647 +++++++++++++++++++++++++++++++ blas/f2c/d_cnjg.c | 6 + blas/f2c/datatypes.h | 24 ++ blas/f2c/drotm.c | 215 ++++++++++ blas/f2c/drotmg.c | 293 ++++++++++++++ blas/f2c/dsbmv.c | 366 +++++++++++++++++ blas/f2c/dspmv.c | 316 +++++++++++++++ blas/f2c/dtbmv.c | 428 ++++++++++++++++++++ blas/f2c/lsame.c | 117 ++++++ blas/f2c/r_cnjg.c | 6 + blas/f2c/srotm.c | 216 +++++++++++ blas/f2c/srotmg.c | 295 ++++++++++++++ blas/f2c/ssbmv.c | 368 ++++++++++++++++++ blas/f2c/sspmv.c | 316 +++++++++++++++ blas/f2c/stbmv.c | 428 ++++++++++++++++++++ blas/f2c/zhbmv.c | 488 +++++++++++++++++++++++ blas/f2c/zhpmv.c | 438 +++++++++++++++++++++ blas/f2c/ztbmv.c | 647 +++++++++++++++++++++++++++++++ blas/{ => fortran}/chbmv.f | 0 blas/{ => fortran}/chpmv.f | 0 blas/{ => fortran}/complexdots.f | 0 blas/{ => fortran}/ctbmv.f | 0 blas/{ => fortran}/drotm.f | 0 blas/{ => fortran}/drotmg.f | 0 blas/{ => fortran}/dsbmv.f | 0 blas/{ => fortran}/dspmv.f | 0 blas/{ => fortran}/dtbmv.f | 0 blas/{ => fortran}/lsame.f | 0 blas/{ => fortran}/srotm.f | 0 blas/{ => fortran}/srotmg.f | 0 blas/{ => fortran}/ssbmv.f | 0 blas/{ => fortran}/sspmv.f | 0 blas/{ => fortran}/stbmv.f | 0 blas/{ => fortran}/zhbmv.f | 0 blas/{ => fortran}/zhpmv.f | 0 blas/{ => fortran}/ztbmv.f | 0 40 files changed, 6647 insertions(+), 14 deletions(-) create mode 100644 blas/f2c/chbmv.c create mode 100644 blas/f2c/chpmv.c create mode 100644 blas/f2c/complexdots.c create mode 100644 blas/f2c/ctbmv.c create mode 100644 blas/f2c/d_cnjg.c create mode 100644 blas/f2c/datatypes.h create mode 100644 blas/f2c/drotm.c create mode 100644 blas/f2c/drotmg.c create mode 100644 blas/f2c/dsbmv.c create mode 100644 blas/f2c/dspmv.c create mode 100644 blas/f2c/dtbmv.c create mode 100644 blas/f2c/lsame.c create mode 100644 blas/f2c/r_cnjg.c create mode 100644 blas/f2c/srotm.c create mode 100644 blas/f2c/srotmg.c create mode 100644 blas/f2c/ssbmv.c create mode 100644 blas/f2c/sspmv.c create mode 100644 blas/f2c/stbmv.c create mode 100644 blas/f2c/zhbmv.c create mode 100644 blas/f2c/zhpmv.c create mode 100644 blas/f2c/ztbmv.c rename blas/{ => fortran}/chbmv.f (100%) rename blas/{ => fortran}/chpmv.f (100%) rename blas/{ => fortran}/complexdots.f
(100%) rename blas/{ => fortran}/ctbmv.f (100%) rename blas/{ => fortran}/drotm.f (100%) rename blas/{ => fortran}/drotmg.f (100%) rename blas/{ => fortran}/dsbmv.f (100%) rename blas/{ => fortran}/dspmv.f (100%) rename blas/{ => fortran}/dtbmv.f (100%) rename blas/{ => fortran}/lsame.f (100%) rename blas/{ => fortran}/srotm.f (100%) rename blas/{ => fortran}/srotmg.f (100%) rename blas/{ => fortran}/ssbmv.f (100%) rename blas/{ => fortran}/sspmv.f (100%) rename blas/{ => fortran}/stbmv.f (100%) rename blas/{ => fortran}/zhbmv.f (100%) rename blas/{ => fortran}/zhpmv.f (100%) rename blas/{ => fortran}/ztbmv.f (100%) diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index a9bc05137..2bc956a64 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -16,21 +16,31 @@ add_custom_target(blas) set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp) -if(EIGEN_Fortran_COMPILER_WORKS) - -set(EigenBlas_SRCS ${EigenBlas_SRCS} - complexdots.f - srotm.f srotmg.f drotm.f drotmg.f - lsame.f dspmv.f ssbmv.f - chbmv.f sspmv.f - zhbmv.f chpmv.f dsbmv.f - zhpmv.f - dtbmv.f stbmv.f ctbmv.f ztbmv.f -) +if(EIGEN_USE_F2C_BLAS) + set(EigenBlas_SRCS ${EigenBlas_SRCS} + f2c/complexdots.c + f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c + f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c + f2c/chbmv.c f2c/sspmv.c + f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c + f2c/zhpmv.c + f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c f2c/ztbmv.c + f2c/d_cnjg.c f2c/r_cnjg.c + ) else() - -message(WARNING " No fortran compiler has been detected, the blas build will be incomplete.") - + if (EIGEN_Fortran_COMPILER_WORKS) + set(EigenBlas_SRCS ${EigenBlas_SRCS} + fortran/complexdots.f + fortran/srotm.f fortran/srotmg.f fortran/drotm.f fortran/drotmg.f + fortran/lsame.f fortran/dspmv.f fortran/ssbmv.f + fortran/chbmv.f fortran/sspmv.f + fortran/zhbmv.f fortran/chpmv.f fortran/dsbmv.f + fortran/zhpmv.f + fortran/dtbmv.f fortran/stbmv.f fortran/ctbmv.f fortran/ztbmv.f + ) + else() + message(WARNING " No Fortran compiler has been detected, the blas build will be incomplete. Define EIGEN_USE_F2C_BLAS to build BLAS without Fortran") + endif() endif() add_library(eigen_blas_static ${EigenBlas_SRCS}) diff --git a/blas/f2c/chbmv.c b/blas/f2c/chbmv.c new file mode 100644 index 000000000..f218fe3f5 --- /dev/null +++ b/blas/f2c/chbmv.c @@ -0,0 +1,487 @@ +/* chbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex * + alpha, complex *a, integer *lda, complex *x, integer *incx, complex * + beta, complex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. 
Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). 
*/ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("CHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i = + q__3.r * x[i__2].i + q__3.i * x[i__2].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + r__1 = a[i__3].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + r__1 = a[i__2].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + r__1 = a[i__2].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i; + y[i__4].r = q__1.r, y[i__4].i = q__1.i; + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i = + q__3.r * x[i__4].i + q__3.i * x[i__4].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of CHBMV . */ + +} /* chbmv_ */ + diff --git a/blas/f2c/chpmv.c b/blas/f2c/chpmv.c new file mode 100644 index 000000000..65bab1c7f --- /dev/null +++ b/blas/f2c/chpmv.c @@ -0,0 +1,438 @@ +/* chpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex * + ap, complex *x, integer *incx, complex *beta, complex *y, integer * + incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + complex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. 
On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && + beta->i == 0.f))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1.f || beta->i != 0.f) { + if (*incy == 1) { + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0.f, y[i__2].i = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0.f && beta->i == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0.f, y[i__2].i = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + q__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0.f && alpha->i == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + r__1 = ap[i__4].r; + q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i; + q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i; + q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = j; + i__3 = j; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = i__; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = q__1.r, temp1.i = q__1.i; + temp2.r = 0.f, temp2.i = 0.f; + i__2 = jy; + i__3 = jy; + i__4 = kk; + r__1 = ap[i__4].r; + q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i; + y[i__3].r = q__1.r, y[i__3].i = q__1.i; + r_cnjg(&q__3, &ap[k]); + i__3 = ix; + q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i = + q__3.r * x[i__3].i + q__3.i * x[i__3].r; + q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i; + temp2.r = q__1.r, temp2.i = q__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i; + y[i__2].r = q__1.r, y[i__2].i = q__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of CHPMV . */ + +} /* chpmv_ */ + diff --git a/blas/f2c/complexdots.c b/blas/f2c/complexdots.c new file mode 100644 index 000000000..a856a231c --- /dev/null +++ b/blas/f2c/complexdots.c @@ -0,0 +1,84 @@ +/* This file has been modified to use the standard gfortran calling + convention, rather than the f2c calling convention. + + It does not require -ff2c when compiled with gfortran. +*/ + +/* complexdots.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +complex cdotc_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotc_ */ + +complex cdotu_(integer *n, complex *cx, integer + *incx, complex *cy, integer *incy) +{ + complex res; + extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *, + complex *, integer *, complex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + cdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* cdotu_ */ + +doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotcw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotc_ */ + +doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx, + doublecomplex *cy, integer *incy) +{ + doublecomplex res; + extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *); + + /* Parameter adjustments */ + --cy; + --cx; + + /* Function Body */ + zdotuw_(n, &cx[1], incx, &cy[1], incy, &res); + return res; +} /* zdotu_ */ + diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c new file mode 100644 index 000000000..790fd581f --- /dev/null +++ b/blas/f2c/ctbmv.c @@ -0,0 +1,647 @@ +/* ctbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, complex *a, integer *lda, complex *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + complex q__1, q__2, q__3; + + /* Builtin functions */ + void r_cnjg(complex *, complex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + complex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* CTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer an upper */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer a lower */ +/* triangular band matrix from conventional full matrix storage */ +/* to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that when DIAG = 'U' or 'u' the elements of the array A */ +/* corresponding to the diagonal elements of the matrix are not */ +/* referenced, but are assumed to be unity. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. 
*/ + +/* X - COMPLEX array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. On exit, X is overwritten with the */ +/* tranformed vector x. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("CTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1); + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. 
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0.f || x[i__2].i != 0.f) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__2].r = q__1.r, x[i__2].i = q__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, q__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + q__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, q__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = q__1.r, x[i__3].i = q__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0.f || x[i__1].i != 0.f) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i + + q__2.i; + x[i__1].r = q__1.r, x[i__1].i = q__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, q__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0.f || x[i__4].i != 0.f) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + q__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i + + q__2.i; + x[i__4].r = q__1.r, x[i__4].i = q__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, q__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = q__1.r, x[i__3].i = 
q__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L90: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + q__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, q__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, + q__2.i = q__3.r * x[i__4].i + q__3.i * x[ + i__4].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; 
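+/* (TRANS = 'T' branch: band entries are used as stored; the TRANS = 'C'
+   branch below conjugates each entry with r_cnjg before accumulating) */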
++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L150: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + q__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, q__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + r_cnjg(&q__2, &a[j * a_dim1 + 1]); + q__1.r = temp.r * q__2.r - temp.i * q__2.i, + q__1.i = temp.r * q__2.i + temp.i * + q__2.r; + temp.r = q__1.r, temp.i = q__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i, + q__2.i = q__3.r * x[i__1].i + q__3.i * x[ + i__1].r; + q__1.r = temp.r + q__2.r, q__1.i = temp.i + + q__2.i; + temp.r = q__1.r, temp.i = q__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of CTBMV . */ + +} /* ctbmv_ */ + diff --git a/blas/f2c/d_cnjg.c b/blas/f2c/d_cnjg.c new file mode 100644 index 000000000..623090c6b --- /dev/null +++ b/blas/f2c/d_cnjg.c @@ -0,0 +1,6 @@ +#include "datatypes.h" + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + r->r = z->r; + r->i = -(z->i); +} diff --git a/blas/f2c/datatypes.h b/blas/f2c/datatypes.h new file mode 100644 index 000000000..63232b246 --- /dev/null +++ b/blas/f2c/datatypes.h @@ -0,0 +1,24 @@ +/* This contains a limited subset of the typedefs exposed by f2c + for use by the Eigen BLAS C-only implementation. +*/ + +#ifndef __EIGEN_DATATYPES_H__ +#define __EIGEN_DATATYPES_H__ + +typedef int integer; +typedef unsigned int uinteger; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef int ftnlen; +typedef int logical; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) + +#endif diff --git a/blas/f2c/drotm.c b/blas/f2c/drotm.c new file mode 100644 index 000000000..17a779b74 --- /dev/null +++ b/blas/f2c/drotm.c @@ -0,0 +1,215 @@ +/* drotm.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx, + doublereal *dy, integer *incy, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal two = 2.; + + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__; + doublereal w, z__; + integer kx, ky; + doublereal dh11, dh12, dh21, dh22, dflag; + integer nsteps; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ + +/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */ +/* (DY**T) */ + +/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ +/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */ + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* DX (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of DX */ + +/* DY (input/output) DOUBLE PRECISION array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of DY */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --dparam; + --dy; + --dx; + + /* Function Body */ +/* .. */ + + dflag = dparam[1]; + if (*n <= 0 || dflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? 
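+/* (f2c renders "DO I = 1,NSTEPS,INCX" as this conditional test so the
+   loop is correct for either sign of the stride) */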
i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* drotm_ */ + diff --git a/blas/f2c/drotmg.c b/blas/f2c/drotmg.c new file mode 100644 index 000000000..a63eb1083 --- /dev/null +++ b/blas/f2c/drotmg.c @@ -0,0 +1,293 @@ +/* drotmg.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal * + dx1, doublereal *dy1, doublereal *dparam) +{ + /* Initialized data */ + + static doublereal zero = 0.; + static doublereal one = 1.; + static doublereal two = 2.; + static doublereal gam = 4096.; + static doublereal gamsq = 16777216.; + static doublereal rgamsq = 5.9604645e-8; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + doublereal d__1; + + /* Local variables */ + doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22; + integer igo; + doublereal dflag, dtemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */ +/* DY2)**T. */ +/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ + +/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ +/* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */ +/* RESPECTIVELY. 
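+   THAT IS, DH11 = DPARAM(2), DH21 = DPARAM(3), DH12 = DPARAM(4) AND
+   DH22 = DPARAM(5), AS LISTED UNDER ARGUMENTS BELOW.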
(VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */ +/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + +/* DD1 (input/output) DOUBLE PRECISION */ + +/* DD2 (input/output) DOUBLE PRECISION */ + +/* DX1 (input/output) DOUBLE PRECISION */ + +/* DY1 (input) DOUBLE PRECISION */ + +/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */ +/* DPARAM(1)=DFLAG */ +/* DPARAM(2)=DH11 */ +/* DPARAM(3)=DH21 */ +/* DPARAM(4)=DH12 */ +/* DPARAM(5)=DH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --dparam; + + /* Function Body */ +/* .. */ + if (! (*dd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L10: +/* CASE-DD1-NONNEGATIVE */ + dp2 = *dd2 * *dy1; + if (! (dp2 == zero)) { + goto L20; + } + dflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + dp1 = *dd1 * *dx1; + dq2 = dp2 * *dy1; + dq1 = dp1 * *dx1; + + if (! (abs(dq1) > abs(dq2))) { + goto L40; + } + dh21 = -(*dy1) / *dx1; + dh12 = dp2 / dp1; + + du = one - dh12 * dh21; + + if (! (du <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L30: + dflag = zero; + *dd1 /= du; + *dd2 /= du; + *dx1 *= du; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (dq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-DX1.. */ + goto L60; +L50: + dflag = one; + dh11 = dp1 / dp2; + dh22 = *dx1 / *dy1; + du = one + dh11 * dh22; + dtemp = *dd2 / du; + *dd2 = *dd1 / du; + *dd1 = dtemp; + *dx1 = *dy1 * du; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-DX1.. */ +L60: + dflag = -one; + dh11 = zero; + dh12 = zero; + dh21 = zero; + dh22 = zero; + + *dd1 = zero; + *dd2 = zero; + *dx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (dflag >= zero)) { + goto L90; + } + + if (! (dflag == zero)) { + goto L80; + } + dh11 = one; + dh22 = one; + dflag = -one; + goto L90; +L80: + dh21 = -one; + dh12 = one; + dflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*dd1 <= rgamsq)) { + goto L130; + } + if (*dd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + d__1 = gam; + *dd1 *= d__1 * d__1; + *dx1 /= gam; + dh11 /= gam; + dh12 /= gam; + goto L110; +L130: +L140: + if (! (*dd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + d__1 = gam; + *dd1 /= d__1 * d__1; + *dx1 *= gam; + dh11 *= gam; + dh12 *= gam; + goto L140; +L160: +L170: + if (! (abs(*dd2) <= rgamsq)) { + goto L190; + } + if (*dd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + d__1 = gam; + *dd2 *= d__1 * d__1; + dh21 /= gam; + dh22 /= gam; + goto L170; +L190: +L200: + if (! (abs(*dd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + d__1 = gam; + *dd2 /= d__1 * d__1; + dh21 *= gam; + dh22 *= gam; + goto L200; +L220: + if (dflag < 0.) 
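+/* Store back only the H entries that the DFLAG encoding leaves free:
+   all four for DFLAG = -1.D0, DH21 and DH12 for DFLAG = 0.D0, and
+   DH11 and DH22 for DFLAG = 1.D0. */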
{ + goto L250; + } else if (dflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + dparam[3] = dh21; + dparam[4] = dh12; + goto L260; +L240: + dparam[2] = dh11; + dparam[5] = dh22; + goto L260; +L250: + dparam[2] = dh11; + dparam[3] = dh21; + dparam[4] = dh12; + dparam[5] = dh22; +L260: + dparam[1] = dflag; + return 0; +} /* drotmg_ */ + diff --git a/blas/f2c/dsbmv.c b/blas/f2c/dsbmv.c new file mode 100644 index 000000000..c6b4b21d6 --- /dev/null +++ b/blas/f2c/dsbmv.c @@ -0,0 +1,366 @@ +/* dsbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal * + alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, + doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
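+           Equivalently, element a(i,j) of the upper triangle, with
+           max(1,j-k) .le. i .le. j, is stored in A(k+1+i-j,j).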
*/ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("DSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. 
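+       A negative increment walks a vector backwards from its last
+       element, so its first element lives at position 1 - (n-1)*incx;
+       kx and ky below encode that convention.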
*/ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of DSBMV . */ + +} /* dsbmv_ */ + diff --git a/blas/f2c/dspmv.c b/blas/f2c/dspmv.c new file mode 100644 index 000000000..0b4e92d5c --- /dev/null +++ b/blas/f2c/dspmv.c @@ -0,0 +1,316 @@ +/* dspmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha, + doublereal *ap, doublereal *x, integer *incx, doublereal *beta, + doublereal *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublereal temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - DOUBLE PRECISION. */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - DOUBLE PRECISION array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - DOUBLE PRECISION. */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - DOUBLE PRECISION array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. 
*/ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0. && *beta == 1.)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.) { + if (*incy == 1) { + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
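+           Here kk always indexes the diagonal entry a(j,j) within AP:
+           column j of the lower triangle occupies AP(kk)..AP(kk+n-j),
+           after which kk advances by n-j+1.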
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of DSPMV . */ + +} /* dspmv_ */ + diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c new file mode 100644 index 000000000..fdf73ebb5 --- /dev/null +++ b/blas/f2c/dtbmv.c @@ -0,0 +1,428 @@ +/* dtbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx, + ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublereal temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* DTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := A'*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. 
*/
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy 0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! 
lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("DTBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
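+          For the transposed product each x(j) becomes A(j,j)*x(j) plus
+          the dot product of column j's off-diagonal band with the old x,
+          so the columns are visited in the order (descending for 'U',
+          ascending for 'L') that lets x be overwritten in place.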
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of DTBMV . */ + +} /* dtbmv_ */ + diff --git a/blas/f2c/lsame.c b/blas/f2c/lsame.c new file mode 100644 index 000000000..46324d916 --- /dev/null +++ b/blas/f2c/lsame.c @@ -0,0 +1,117 @@ +/* lsame.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len) +{ + /* System generated locals */ + logical ret_val; + + /* Local variables */ + integer inta, intb, zcode; + + +/* -- LAPACK auxiliary routine (version 3.1) -- */ +/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ +/* November 2006 */ + +/* .. Scalar Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ +/* case. */ + +/* Arguments */ +/* ========= */ + +/* CA (input) CHARACTER*1 */ + +/* CB (input) CHARACTER*1 */ +/* CA and CB specify the single characters to be compared. */ + +/* ===================================================================== */ + +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ + +/* Test if the characters are equal */ + + ret_val = *(unsigned char *)ca == *(unsigned char *)cb; + if (ret_val) { + return ret_val; + } + +/* Now test for equivalence if both characters are alphabetic. */ + + zcode = 'Z'; + +/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ +/* machines, on which ICHAR returns a value with bit 8 set. 
*/
+/*     ICHAR('A') on Prime machines returns 193 which is the same as */
+/*     ICHAR('A') on an EBCDIC machine. */
+
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+
+/*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
+/*        upper case 'Z'. */
+
+	if (inta >= 97 && inta <= 122) {
+	    inta += -32;
+	}
+	if (intb >= 97 && intb <= 122) {
+	    intb += -32;
+	}
+
+    } else if (zcode == 233 || zcode == 169) {
+
+/*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
+/*        upper case 'Z'. */
+
+	if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
+		(inta >= 162 && inta <= 169)) {
+	    inta += 64;
+	}
+	if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
+		(intb >= 162 && intb <= 169)) {
+	    intb += 64;
+	}
+
+    } else if (zcode == 218 || zcode == 250) {
+
+/*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
+/*        plus 128 of either lower or upper case 'Z'. */
+
+	if (inta >= 225 && inta <= 250) {
+	    inta += -32;
+	}
+	if (intb >= 225 && intb <= 250) {
+	    intb += -32;
+	}
+    }
+    ret_val = inta == intb;
+
+/*     RETURN */
+
+/*     End of LSAME */
+
+    return ret_val;
+} /* lsame_ */
+
diff --git a/blas/f2c/r_cnjg.c b/blas/f2c/r_cnjg.c
new file mode 100644
index 000000000..c08182f88
--- /dev/null
+++ b/blas/f2c/r_cnjg.c
@@ -0,0 +1,6 @@
+#include "datatypes.h"
+
+void r_cnjg(complex *r, complex *z) {
+    r->r = z->r;
+    r->i = -(z->i);
+}
diff --git a/blas/f2c/srotm.c b/blas/f2c/srotm.c
new file mode 100644
index 000000000..bd5944a99
--- /dev/null
+++ b/blas/f2c/srotm.c
@@ -0,0 +1,216 @@
+/* srotm.f -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "datatypes.h"
+
+/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy,
+	integer *incy, real *sparam)
+{
+    /* Initialized data */
+
+    static real zero = 0.f;
+    static real two = 2.f;
+
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__;
+    real w, z__;
+    integer kx, ky;
+    real sh11, sh12, sh21, sh22, sflag;
+    integer nsteps;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
+
+/*     (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
+/*     (SY**T) */
+
+/*     SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
+/*     LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
+/*     WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
+
+/*     SFLAG=-1.E0     SFLAG=0.E0        SFLAG=1.E0     SFLAG=-2.E0 */
+
+/*       (SH11  SH12)    (1.E0  SH12)    (SH11  1.E0)    (1.E0  0.E0) */
+/*     H=(          )    (          )    (          )    (          ) */
+/*       (SH21  SH22),   (SH21  1.E0),   (-1.E0 SH22),   (0.E0  1.E0). */
+/*     SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. 
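+     IN EFFECT, FOR EACH PAIR (W,Z) = (SX(I),SY(I)) THE UPDATE IS
+         SX(I) = SH11*W + SH12*Z
+         SY(I) = SH21*W + SH22*Z
+     WITH THE IMPLIED UNIT AND MINUS-UNIT ENTRIES OF H SUBSTITUTED
+     ACCORDING TO SFLAG AS ABOVE.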
*/ + + +/* Arguments */ +/* ========= */ + +/* N (input) INTEGER */ +/* number of elements in input vector(s) */ + +/* SX (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCX (input) INTEGER */ +/* storage spacing between elements of SX */ + +/* SY (input/output) REAL array, dimension N */ +/* double precision vector with N elements */ + +/* INCY (input) INTEGER */ +/* storage spacing between elements of SY */ + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Data statements .. */ + /* Parameter adjustments */ + --sparam; + --sy; + --sx; + + /* Function Body */ +/* .. */ + + sflag = sparam[1]; + if (*n <= 0 || sflag + two == zero) { + goto L140; + } + if (! (*incx == *incy && *incx > 0)) { + goto L70; + } + + nsteps = *n * *incx; + if (sflag < 0.f) { + goto L50; + } else if (sflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w + z__ * sh12; + sy[i__] = w * sh21 + z__; +/* L20: */ + } + goto L140; +L30: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = nsteps; + i__1 = *incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__; + sy[i__] = -w + sh22 * z__; +/* L40: */ + } + goto L140; +L50: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__1 = nsteps; + i__2 = *incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = sx[i__]; + z__ = sy[i__]; + sx[i__] = w * sh11 + z__ * sh12; + sy[i__] = w * sh21 + z__ * sh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (*incx < 0) { + kx = (1 - *n) * *incx + 1; + } + if (*incy < 0) { + ky = (1 - *n) * *incy + 1; + } + + if (sflag < 0.f) { + goto L120; + } else if (sflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + sh12 = sparam[4]; + sh21 = sparam[3]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w + z__ * sh12; + sy[ky] = w * sh21 + z__; + kx += *incx; + ky += *incy; +/* L90: */ + } + goto L140; +L100: + sh11 = sparam[2]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__; + sy[ky] = -w + sh22 * z__; + kx += *incx; + ky += *incy; +/* L110: */ + } + goto L140; +L120: + sh11 = sparam[2]; + sh12 = sparam[4]; + sh21 = sparam[3]; + sh22 = sparam[5]; + i__2 = *n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = sx[kx]; + z__ = sy[ky]; + sx[kx] = w * sh11 + z__ * sh12; + sy[ky] = w * sh21 + z__ * sh22; + kx += *incx; + ky += *incy; +/* L130: */ + } +L140: + return 0; +} /* srotm_ */ + diff --git a/blas/f2c/srotmg.c b/blas/f2c/srotmg.c new file mode 100644 index 000000000..75f789fe2 --- /dev/null +++ b/blas/f2c/srotmg.c @@ -0,0 +1,295 @@ +/* srotmg.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real + *sparam) +{ + /* Initialized data */ + + static real zero = 0.f; + static real one = 1.f; + static real two = 2.f; + static real gam = 4096.f; + static real gamsq = 16777200.f; + static real rgamsq = 5.96046e-8f; + + /* Format strings */ + static char fmt_120[] = ""; + static char fmt_150[] = ""; + static char fmt_180[] = ""; + static char fmt_210[] = ""; + + /* System generated locals */ + real r__1; + + /* Local variables */ + real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22; + integer igo; + real sflag, stemp; + + /* Assigned format variables */ + static char *igo_fmt; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ +/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */ +/* SY2)**T. */ +/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ + +/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ + +/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ +/* H=( ) ( ) ( ) ( ) */ +/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ +/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */ +/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */ +/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */ + +/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ +/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ +/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ + + +/* Arguments */ +/* ========= */ + + +/* SD1 (input/output) REAL */ + +/* SD2 (input/output) REAL */ + +/* SX1 (input/output) REAL */ + +/* SY1 (input) REAL */ + + +/* SPARAM (input/output) REAL array, dimension 5 */ +/* SPARAM(1)=SFLAG */ +/* SPARAM(2)=SH11 */ +/* SPARAM(3)=SH21 */ +/* SPARAM(4)=SH12 */ +/* SPARAM(5)=SH22 */ + +/* ===================================================================== */ + +/* .. Local Scalars .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ +/* .. Data statements .. */ + + /* Parameter adjustments */ + --sparam; + + /* Function Body */ +/* .. */ + if (! (*sd1 < zero)) { + goto L10; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L10: +/* CASE-SD1-NONNEGATIVE */ + sp2 = *sd2 * *sy1; + if (! (sp2 == zero)) { + goto L20; + } + sflag = -two; + goto L260; +/* REGULAR-CASE.. */ +L20: + sp1 = *sd1 * *sx1; + sq2 = sp2 * *sy1; + sq1 = sp1 * *sx1; + + if (! (dabs(sq1) > dabs(sq2))) { + goto L40; + } + sh21 = -(*sy1) / *sx1; + sh12 = sp2 / sp1; + + su = one - sh12 * sh21; + + if (! (su <= zero)) { + goto L30; + } +/* GO ZERO-H-D-AND-SX1.. */ + goto L60; +L30: + sflag = zero; + *sd1 /= su; + *sd2 /= su; + *sx1 *= su; +/* GO SCALE-CHECK.. */ + goto L100; +L40: + if (! (sq2 < zero)) { + goto L50; + } +/* GO ZERO-H-D-AND-SX1.. 
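+   (A NEGATIVE SQ2 = SD2*SY1**2 MEANS THE WEIGHT SD2 IS NEGATIVE, SO NO
+   VALID TRANSFORMATION EXISTS AND H, D AND SX1 ARE FLUSHED TO ZERO)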
*/ + goto L60; +L50: + sflag = one; + sh11 = sp1 / sp2; + sh22 = *sx1 / *sy1; + su = one + sh11 * sh22; + stemp = *sd2 / su; + *sd2 = *sd1 / su; + *sd1 = stemp; + *sx1 = *sy1 * su; +/* GO SCALE-CHECK */ + goto L100; +/* PROCEDURE..ZERO-H-D-AND-SX1.. */ +L60: + sflag = -one; + sh11 = zero; + sh12 = zero; + sh21 = zero; + sh22 = zero; + + *sd1 = zero; + *sd2 = zero; + *sx1 = zero; +/* RETURN.. */ + goto L220; +/* PROCEDURE..FIX-H.. */ +L70: + if (! (sflag >= zero)) { + goto L90; + } + + if (! (sflag == zero)) { + goto L80; + } + sh11 = one; + sh22 = one; + sflag = -one; + goto L90; +L80: + sh21 = -one; + sh12 = one; + sflag = -one; +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } +/* PROCEDURE..SCALE-CHECK */ +L100: +L110: + if (! (*sd1 <= rgamsq)) { + goto L130; + } + if (*sd1 == zero) { + goto L160; + } + igo = 0; + igo_fmt = fmt_120; +/* FIX-H.. */ + goto L70; +L120: +/* Computing 2nd power */ + r__1 = gam; + *sd1 *= r__1 * r__1; + *sx1 /= gam; + sh11 /= gam; + sh12 /= gam; + goto L110; +L130: +L140: + if (! (*sd1 >= gamsq)) { + goto L160; + } + igo = 1; + igo_fmt = fmt_150; +/* FIX-H.. */ + goto L70; +L150: +/* Computing 2nd power */ + r__1 = gam; + *sd1 /= r__1 * r__1; + *sx1 *= gam; + sh11 *= gam; + sh12 *= gam; + goto L140; +L160: +L170: + if (! (dabs(*sd2) <= rgamsq)) { + goto L190; + } + if (*sd2 == zero) { + goto L220; + } + igo = 2; + igo_fmt = fmt_180; +/* FIX-H.. */ + goto L70; +L180: +/* Computing 2nd power */ + r__1 = gam; + *sd2 *= r__1 * r__1; + sh21 /= gam; + sh22 /= gam; + goto L170; +L190: +L200: + if (! (dabs(*sd2) >= gamsq)) { + goto L220; + } + igo = 3; + igo_fmt = fmt_210; +/* FIX-H.. */ + goto L70; +L210: +/* Computing 2nd power */ + r__1 = gam; + *sd2 /= r__1 * r__1; + sh21 *= gam; + sh22 *= gam; + goto L200; +L220: + if (sflag < 0.f) { + goto L250; + } else if (sflag == 0) { + goto L230; + } else { + goto L240; + } +L230: + sparam[3] = sh21; + sparam[4] = sh12; + goto L260; +L240: + sparam[2] = sh11; + sparam[5] = sh22; + goto L260; +L250: + sparam[2] = sh11; + sparam[3] = sh21; + sparam[4] = sh12; + sparam[5] = sh22; +L260: + sparam[1] = sflag; + return 0; +} /* srotmg_ */ + diff --git a/blas/f2c/ssbmv.c b/blas/f2c/ssbmv.c new file mode 100644 index 000000000..8599325f2 --- /dev/null +++ b/blas/f2c/ssbmv.c @@ -0,0 +1,368 @@ +/* ssbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha, + real *a, integer *lda, real *x, integer *incx, real *beta, real *y, + integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. 
*/ + +/* Purpose */ +/* ======= */ + +/* SSBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - REAL array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the symmetric matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a symmetric band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - REAL array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. 
*/ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("SSBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. */ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L50: */ + } + y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * + temp2; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
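+
+   (Sketch of the access pattern below, in 1-based notation and for the
+   unit-increment branch: with lower band storage the diagonal a(j,j)
+   sits in A(1,j) and a(i,j), j < i <= min(n,j+k), sits in A(1+i-j,j),
+   so each column sweep amounts to
+
+       temp1 = alpha*x(j)
+       y(j)  = y(j) + temp1*A(1,j)
+       for i = j+1 .. min(n,j+k):
+           y(i)  = y(i)  + temp1*A(1+i-j,j)
+           temp2 = temp2 + A(1+i-j,j)*x(i)
+       y(j)  = y(j) + alpha*temp2
+
+   where the temp2 term supplies a(j,i) = a(i,j) by symmetry; the
+   strided branch does the same with ix/iy bookkeeping.)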
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + y[i__] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + y[j] += *alpha * temp2; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * a[j * a_dim1 + 1]; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * a[l + i__ + j * a_dim1]; + temp2 += a[l + i__ + j * a_dim1] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of SSBMV . */ + +} /* ssbmv_ */ + diff --git a/blas/f2c/sspmv.c b/blas/f2c/sspmv.c new file mode 100644 index 000000000..47858ec6c --- /dev/null +++ b/blas/f2c/sspmv.c @@ -0,0 +1,316 @@ +/* sspmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap, + real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer i__1, i__2; + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + real temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* SSPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n symmetric matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - REAL . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - REAL array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. 
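+
+   (For example, with n = 3 the upper triangle is packed as
+
+       AP = ( a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) ),
+
+   so column j starts at AP( j*(j-1)/2 + 1 ).)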
*/ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the symmetric matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Unchanged on exit. */ + +/* X - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - REAL . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - REAL array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("SSPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (*beta != 1.f) { + if (*incy == 1) { + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = 0.f; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[i__] = *beta * y[i__]; +/* L20: */ + } + } + } else { + iy = ky; + if (*beta == 0.f) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = 0.f; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + y[iy] = *beta * y[iy]; + iy += *incy; +/* L40: */ + } + } + } + } + if (*alpha == 0.f) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
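+
+   (Sketch of the indexing below: kk points at the first element of
+   column j inside AP, so ap(kk) .. ap(kk+j-2) hold a(1,j) .. a(j-1,j),
+   the diagonal a(j,j) is ap(kk+j-1), and kk advances by j once the
+   column has been consumed.)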
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L50: */ + } + y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; + ix += *incx; + iy += *incy; +/* L70: */ + } + y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. */ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[j]; + temp2 = 0.f; + y[j] += temp1 * ap[kk]; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + y[i__] += temp1 * ap[k]; + temp2 += ap[k] * x[i__]; + ++k; +/* L90: */ + } + y[j] += *alpha * temp2; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + temp1 = *alpha * x[jx]; + temp2 = 0.f; + y[jy] += temp1 * ap[kk]; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + y[iy] += temp1 * ap[k]; + temp2 += ap[k] * x[ix]; +/* L110: */ + } + y[jy] += *alpha * temp2; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of SSPMV . */ + +} /* sspmv_ */ + diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c new file mode 100644 index 000000000..fcf9ce336 --- /dev/null +++ b/blas/f2c/stbmv.c @@ -0,0 +1,428 @@ +/* stbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen + uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + real temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* STBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. 
*/
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry with UPLO = 'U' or 'u', K specifies the number of */
+/*           super-diagonals of the matrix A. */
+/*           On entry with UPLO = 'L' or 'l', K specifies the number of */
+/*           sub-diagonals of the matrix A. */
+/*           K must satisfy  0 .le. K. */
+/*           Unchanged on exit. */
+
+/*  A      - REAL             array of DIMENSION ( LDA, n ). */
+/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the upper triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row */
+/*           ( k + 1 ) of the array, the first super-diagonal starting at */
+/*           position 2 in row k, and so on. The top left k by k triangle */
+/*           of the array A is not referenced. */
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - REAL             array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. 
*/ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, + "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, ( + ftnlen)1)) { + info = 2; + } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag, + "N", (ftnlen)1, (ftnlen)1)) { + info = 3; + } else if (*n < 0) { + info = 4; + } else if (*k < 0) { + info = 5; + } else if (*lda < *k + 1) { + info = 7; + } else if (*incx == 0) { + info = 9; + } + if (info != 0) { + xerbla_("STBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } + + nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1); + +/* Set up the start point in X if the increment is not unity. This */ +/* will be ( N - 1 )*INCX too small for descending loops. */ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[j] != 0.f) { + temp = x[j]; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L10: */ + } + if (nounit) { + x[j] *= a[kplus1 + j * a_dim1]; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix += *incx; +/* L30: */ + } + if (nounit) { + x[jx] *= a[kplus1 + j * a_dim1]; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + if (x[j] != 0.f) { + temp = x[j]; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + x[i__] += temp * a[l + i__ + j * a_dim1]; +/* L50: */ + } + if (nounit) { + x[j] *= a[j * a_dim1 + 1]; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + if (x[jx] != 0.f) { + temp = x[jx]; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + x[ix] += temp * a[l + i__ + j * a_dim1]; + ix -= *incx; +/* L70: */ + } + if (nounit) { + x[jx] *= a[j * a_dim1 + 1]; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x. 
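+
+   (Sketch of this transpose branch, paraphrasing the code: for the
+   upper triangle the columns are processed with descending j, because
+   the new x(j) = a(j,j)*x(j) + sum of a(i,j)*x(i), max(1,j-k) <= i < j,
+   only needs entries x(i) with i < j, which are still untouched:
+
+       for j = n .. 1:
+           temp = x(j)
+           if non-unit diagonal:  temp = temp*A(k+1,j)
+           for i = j-1 .. max(1,j-k):
+               temp = temp + A(k+1+i-j,j)*x(i)
+           x(j) = temp
+
+   in this real routine the 'C' option coincides with 'T'.)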
*/ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + temp = x[j]; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L90: */ + } + x[j] = temp; +/* L100: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + temp = x[jx]; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (nounit) { + temp *= a[kplus1 + j * a_dim1]; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix -= *incx; +/* L110: */ + } + x[jx] = temp; + jx -= *incx; +/* L120: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[j]; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[i__]; +/* L130: */ + } + x[j] = temp; +/* L140: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + temp = x[jx]; + kx += *incx; + ix = kx; + l = 1 - j; + if (nounit) { + temp *= a[j * a_dim1 + 1]; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + temp += a[l + i__ + j * a_dim1] * x[ix]; + ix += *incx; +/* L150: */ + } + x[jx] = temp; + jx += *incx; +/* L160: */ + } + } + } + } + + return 0; + +/* End of STBMV . */ + +} /* stbmv_ */ + diff --git a/blas/f2c/zhbmv.c b/blas/f2c/zhbmv.c new file mode 100644 index 000000000..42da13dbb --- /dev/null +++ b/blas/f2c/zhbmv.c @@ -0,0 +1,488 @@ +/* zhbmv.f -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex + *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer * + incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen + uplo_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHBMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian band matrix, with k super-diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. 
*/ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the band matrix A is being supplied as */ +/* follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* being supplied. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* being supplied. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry, K specifies the number of super-diagonals of the */ +/* matrix A. K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. */ +/* The following program segment will transfer the upper */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = K + 1 - J */ +/* DO 10, I = MAX( 1, J - K ), J */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ +/* by n part of the array A must contain the lower triangular */ +/* band part of the hermitian matrix, supplied column by */ +/* column, with the leading diagonal of the matrix in row 1 of */ +/* the array, the first sub-diagonal starting at position 1 in */ +/* row 2, and so on. The bottom right k by k triangle of the */ +/* array A is not referenced. */ +/* The following program segment will transfer the lower */ +/* triangular part of a hermitian band matrix from conventional */ +/* full matrix storage to band storage: */ + +/* DO 20, J = 1, N */ +/* M = 1 - J */ +/* DO 10, I = J, MIN( N, J + K ) */ +/* A( M + I, J ) = matrix( I, J ) */ +/* 10 CONTINUE */ +/* 20 CONTINUE */ + +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* LDA - INTEGER. */ +/* On entry, LDA specifies the first dimension of A as declared */ +/* in the calling (sub) program. LDA must be at least */ +/* ( k + 1 ). */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the */ +/* vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of DIMENSION at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). */ +/* Before entry, the incremented array Y must contain the */ +/* vector y. On exit, Y is overwritten by the updated vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. 
*/ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --x; + --y; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*k < 0) { + info = 3; + } else if (*lda < *k + 1) { + info = 6; + } else if (*incx == 0) { + info = 8; + } else if (*incy == 0) { + info = 11; + } + if (info != 0) { + xerbla_("ZHBMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array A */ +/* are accessed sequentially with one pass through A. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when upper triangle of A is stored. 
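+
+   (Sketch of the hermitian update below: each strictly-upper entry
+   a(i,j), max(1,j-k) <= i < j, is used twice, once directly for y(i)
+   and once conjugated as the mirrored a(j,i) contribution to y(j),
+   while the diagonal enters as dble(a(j,j)) because its imaginary part
+   is assumed zero:
+
+       y(i)  = y(i)  + temp1*a(i,j)
+       temp2 = temp2 + conjg(a(i,j))*x(i)
+       y(j)  = y(j)  + temp1*dble(a(j,j)) + alpha*temp2
+
+   which is what the z__ temporaries and d_cnjg calls spell out.)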
*/ + + kplus1 = *k + 1; + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__2 = i__; + z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i = + z__3.r * x[i__2].i + z__3.i * x[i__2].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L50: */ + } + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + d__1 = a[i__3].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i = + alpha->r * x[i__4].i + alpha->i * x[i__4].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__3 = jy; + i__4 = jy; + i__2 = kplus1 + j * a_dim1; + d__1 = a[i__2].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; + if (j > *k) { + kx += *incx; + ky += *incy; + } +/* L80: */ + } + } + } else { + +/* Form y when lower triangle of A is stored. 
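+
+   (This branch mirrors the one above: the diagonal a(j,j) is read from
+   A(1,j), again as dble(a(j,j)), and each sub-diagonal entry supplies
+   both a(i,j) for y(i) and conjg(a(i,j)) for the y(j) accumulator.)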
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = j; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = j; + i__4 = j; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__2 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L90: */ + } + i__3 = j; + i__4 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__3 = jx; + z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i = + alpha->r * x[i__3].i + alpha->i * x[i__3].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__3 = jy; + i__4 = jy; + i__2 = j * a_dim1 + 1; + d__1 = a[i__2].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + l = 1 - j; + ix = jx; + iy = jy; +/* Computing MIN */ + i__4 = *n, i__2 = j + *k; + i__3 = min(i__4,i__2); + for (i__ = j + 1; i__ <= i__3; ++i__) { + ix += *incx; + iy += *incy; + i__4 = iy; + i__2 = iy; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i, + z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5] + .r; + z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i; + y[i__4].r = z__1.r, y[i__4].i = z__1.i; + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i = + z__3.r * x[i__4].i + z__3.i * x[i__4].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__3 = jy; + i__4 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + jx += *incx; + jy += *incy; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHBMV . */ + +} /* zhbmv_ */ + diff --git a/blas/f2c/zhpmv.c b/blas/f2c/zhpmv.c new file mode 100644 index 000000000..fbe2f42b3 --- /dev/null +++ b/blas/f2c/zhpmv.c @@ -0,0 +1,438 @@ +/* zhpmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int zhpmv_(char *uplo, integer *n, doublecomplex *alpha, + doublecomplex *ap, doublecomplex *x, integer *incx, doublecomplex * + beta, doublecomplex *y, integer *incy, ftnlen uplo_len) +{ + /* System generated locals */ + integer i__1, i__2, i__3, i__4, i__5; + doublereal d__1; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info; + doublecomplex temp1, temp2; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZHPMV performs the matrix-vector operation */ + +/* y := alpha*A*x + beta*y, */ + +/* where alpha and beta are scalars, x and y are n element vectors and */ +/* A is an n by n hermitian matrix, supplied in packed form. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the upper or lower */ +/* triangular part of the matrix A is supplied in the packed */ +/* array AP as follows: */ + +/* UPLO = 'U' or 'u' The upper triangular part of A is */ +/* supplied in AP. */ + +/* UPLO = 'L' or 'l' The lower triangular part of A is */ +/* supplied in AP. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* ALPHA - COMPLEX*16 . */ +/* On entry, ALPHA specifies the scalar alpha. */ +/* Unchanged on exit. */ + +/* AP - COMPLEX*16 array of DIMENSION at least */ +/* ( ( n*( n + 1 ) )/2 ). */ +/* Before entry with UPLO = 'U' or 'u', the array AP must */ +/* contain the upper triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ +/* and a( 2, 2 ) respectively, and so on. */ +/* Before entry with UPLO = 'L' or 'l', the array AP must */ +/* contain the lower triangular part of the hermitian matrix */ +/* packed sequentially, column by column, so that AP( 1 ) */ +/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ +/* and a( 3, 1 ) respectively, and so on. */ +/* Note that the imaginary parts of the diagonal elements need */ +/* not be set and are assumed to be zero. */ +/* Unchanged on exit. */ + +/* X - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCX ) ). */ +/* Before entry, the incremented array X must contain the n */ +/* element vector x. */ +/* Unchanged on exit. */ + +/* INCX - INTEGER. */ +/* On entry, INCX specifies the increment for the elements of */ +/* X. INCX must not be zero. */ +/* Unchanged on exit. */ + +/* BETA - COMPLEX*16 . */ +/* On entry, BETA specifies the scalar beta. When BETA is */ +/* supplied as zero then Y need not be set on input. */ +/* Unchanged on exit. */ + +/* Y - COMPLEX*16 array of dimension at least */ +/* ( 1 + ( n - 1 )*abs( INCY ) ). 
*/ +/* Before entry, the incremented array Y must contain the n */ +/* element vector y. On exit, Y is overwritten by the updated */ +/* vector y. */ + +/* INCY - INTEGER. */ +/* On entry, INCY specifies the increment for the elements of */ +/* Y. INCY must not be zero. */ +/* Unchanged on exit. */ + +/* Further Details */ +/* =============== */ + +/* Level 2 Blas routine. */ + +/* -- Written on 22-October-1986. */ +/* Jack Dongarra, Argonne National Lab. */ +/* Jeremy Du Croz, Nag Central Office. */ +/* Sven Hammarling, Nag Central Office. */ +/* Richard Hanson, Sandia National Labs. */ + +/* ===================================================================== */ + +/* .. Parameters .. */ +/* .. */ +/* .. Local Scalars .. */ +/* .. */ +/* .. External Functions .. */ +/* .. */ +/* .. External Subroutines .. */ +/* .. */ +/* .. Intrinsic Functions .. */ +/* .. */ + +/* Test the input parameters. */ + + /* Parameter adjustments */ + --y; + --x; + --ap; + + /* Function Body */ + info = 0; + if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", ( + ftnlen)1, (ftnlen)1)) { + info = 1; + } else if (*n < 0) { + info = 2; + } else if (*incx == 0) { + info = 6; + } else if (*incy == 0) { + info = 9; + } + if (info != 0) { + xerbla_("ZHPMV ", &info, (ftnlen)6); + return 0; + } + +/* Quick return if possible. */ + + if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && + beta->i == 0.))) { + return 0; + } + +/* Set up the start points in X and Y. */ + + if (*incx > 0) { + kx = 1; + } else { + kx = 1 - (*n - 1) * *incx; + } + if (*incy > 0) { + ky = 1; + } else { + ky = 1 - (*n - 1) * *incy; + } + +/* Start the operations. In this version the elements of the array AP */ +/* are accessed sequentially with one pass through AP. */ + +/* First form y := beta*y. */ + + if (beta->r != 1. || beta->i != 0.) { + if (*incy == 1) { + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + y[i__2].r = 0., y[i__2].i = 0.; +/* L10: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; +/* L20: */ + } + } + } else { + iy = ky; + if (beta->r == 0. && beta->i == 0.) { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + y[i__2].r = 0., y[i__2].i = 0.; + iy += *incy; +/* L30: */ + } + } else { + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = iy; + i__3 = iy; + z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i, + z__1.i = beta->r * y[i__3].i + beta->i * y[i__3] + .r; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + iy += *incy; +/* L40: */ + } + } + } + } + if (alpha->r == 0. && alpha->i == 0.) { + return 0; + } + kk = 1; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + +/* Form y when AP contains the upper triangle. 
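+
+   (Same kk-based packed traversal as in SSPMV, combined with the
+   hermitian handling of ZHBMV: ap(kk) .. ap(kk+j-2) give the strictly
+   upper part of column j, conjugated for the mirrored contribution,
+   and the diagonal enters as dble(ap(kk+j-1)).)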
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + k = kk; + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L50: */ + } + i__2 = j; + i__3 = j; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += j; +/* L60: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + ix = kx; + iy = ky; + i__2 = kk + j - 2; + for (k = kk; k <= i__2; ++k) { + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ix += *incx; + iy += *incy; +/* L70: */ + } + i__2 = jy; + i__3 = jy; + i__4 = kk + j - 1; + d__1 = ap[i__4].r; + z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i; + z__2.r = y[i__3].r + z__3.r, z__2.i = y[i__3].i + z__3.i; + z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += j; +/* L80: */ + } + } + } else { + +/* Form y when AP contains the lower triangle. 
*/ + + if (*incx == 1 && *incy == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = j; + i__3 = j; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + k = kk + 1; + i__2 = *n; + for (i__ = j + 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = i__; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; + ++k; +/* L90: */ + } + i__2 = j; + i__3 = j; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + kk += *n - j + 1; +/* L100: */ + } + } else { + jx = kx; + jy = ky; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = jx; + z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i = + alpha->r * x[i__2].i + alpha->i * x[i__2].r; + temp1.r = z__1.r, temp1.i = z__1.i; + temp2.r = 0., temp2.i = 0.; + i__2 = jy; + i__3 = jy; + i__4 = kk; + d__1 = ap[i__4].r; + z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + ix = jx; + iy = jy; + i__2 = kk + *n - j; + for (k = kk + 1; k <= i__2; ++k) { + ix += *incx; + iy += *incy; + i__3 = iy; + i__4 = iy; + i__5 = k; + z__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i, + z__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5] + .r; + z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i; + y[i__3].r = z__1.r, y[i__3].i = z__1.i; + d_cnjg(&z__3, &ap[k]); + i__3 = ix; + z__2.r = z__3.r * x[i__3].r - z__3.i * x[i__3].i, z__2.i = + z__3.r * x[i__3].i + z__3.i * x[i__3].r; + z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i; + temp2.r = z__1.r, temp2.i = z__1.i; +/* L110: */ + } + i__2 = jy; + i__3 = jy; + z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i = + alpha->r * temp2.i + alpha->i * temp2.r; + z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i; + y[i__2].r = z__1.r, y[i__2].i = z__1.i; + jx += *incx; + jy += *incy; + kk += *n - j + 1; +/* L120: */ + } + } + } + + return 0; + +/* End of ZHPMV . */ + +} /* zhpmv_ */ + diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c new file mode 100644 index 000000000..4cdcd7f88 --- /dev/null +++ b/blas/f2c/ztbmv.c @@ -0,0 +1,647 @@ +/* ztbmv.f -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "datatypes.h" + +/* Subroutine */ int ztbmv_(char *uplo, char *trans, char *diag, integer *n, + integer *k, doublecomplex *a, integer *lda, doublecomplex *x, integer + *incx, ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; + doublecomplex z__1, z__2, z__3; + + /* Builtin functions */ + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + integer i__, j, l, ix, jx, kx, info; + doublecomplex temp; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + integer kplus1; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + logical noconj, nounit; + +/* .. Scalar Arguments .. */ +/* .. */ +/* .. Array Arguments .. */ +/* .. */ + +/* Purpose */ +/* ======= */ + +/* ZTBMV performs one of the matrix-vector operations */ + +/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ + +/* where x is an n element vector and A is an n by n unit, or non-unit, */ +/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ + +/* Arguments */ +/* ========== */ + +/* UPLO - CHARACTER*1. */ +/* On entry, UPLO specifies whether the matrix is an upper or */ +/* lower triangular matrix as follows: */ + +/* UPLO = 'U' or 'u' A is an upper triangular matrix. */ + +/* UPLO = 'L' or 'l' A is a lower triangular matrix. */ + +/* Unchanged on exit. */ + +/* TRANS - CHARACTER*1. */ +/* On entry, TRANS specifies the operation to be performed as */ +/* follows: */ + +/* TRANS = 'N' or 'n' x := A*x. */ + +/* TRANS = 'T' or 't' x := A'*x. */ + +/* TRANS = 'C' or 'c' x := conjg( A' )*x. */ + +/* Unchanged on exit. */ + +/* DIAG - CHARACTER*1. */ +/* On entry, DIAG specifies whether or not A is unit */ +/* triangular as follows: */ + +/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ + +/* DIAG = 'N' or 'n' A is not assumed to be unit */ +/* triangular. */ + +/* Unchanged on exit. */ + +/* N - INTEGER. */ +/* On entry, N specifies the order of the matrix A. */ +/* N must be at least zero. */ +/* Unchanged on exit. */ + +/* K - INTEGER. */ +/* On entry with UPLO = 'U' or 'u', K specifies the number of */ +/* super-diagonals of the matrix A. */ +/* On entry with UPLO = 'L' or 'l', K specifies the number of */ +/* sub-diagonals of the matrix A. */ +/* K must satisfy 0 .le. K. */ +/* Unchanged on exit. */ + +/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ +/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ +/* by n part of the array A must contain the upper triangular */ +/* band part of the matrix of coefficients, supplied column by */ +/* column, with the leading diagonal of the matrix in row */ +/* ( k + 1 ) of the array, the first super-diagonal starting at */ +/* position 2 in row k, and so on. The top left k by k triangle */ +/* of the array A is not referenced. 
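+
+   (Concretely, for n = 4 and k = 1 the upper band layout is
+
+       A(1,.) =    *     a(1,2)  a(2,3)  a(3,4)
+       A(2,.) = a(1,1)  a(2,2)  a(3,3)  a(4,4)
+
+   where * marks the unreferenced top left triangle.)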
*/
+/*           The following program segment will transfer an upper */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = K + 1 - J */
+/*                    DO 10, I = MAX( 1, J - K ), J */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
+/*           by n part of the array A must contain the lower triangular */
+/*           band part of the matrix of coefficients, supplied column by */
+/*           column, with the leading diagonal of the matrix in row 1 of */
+/*           the array, the first sub-diagonal starting at position 1 in */
+/*           row 2, and so on. The bottom right k by k triangle of the */
+/*           array A is not referenced. */
+/*           The following program segment will transfer a lower */
+/*           triangular band matrix from conventional full matrix storage */
+/*           to band storage: */
+
+/*                 DO 20, J = 1, N */
+/*                    M = 1 - J */
+/*                    DO 10, I = J, MIN( N, J + K ) */
+/*                       A( M + I, J ) = matrix( I, J ) */
+/*              10    CONTINUE */
+/*              20 CONTINUE */
+
+/*           Note that when DIAG = 'U' or 'u' the elements of the array A */
+/*           corresponding to the diagonal elements of the matrix are not */
+/*           referenced, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           ( k + 1 ). */
+/*           Unchanged on exit. */
+
+/*  X      - COMPLEX*16       array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           transformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
+	    ftnlen)1, (ftnlen)1)) {
+	info = 1;
+    } else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
+	    ftnlen)1)) {
+	info = 2;
+    } else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
+	    "N", (ftnlen)1, (ftnlen)1)) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < *k + 1) {
+	info = 7;
+    } else if (*incx == 0) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("ZTBMV ", &info, (ftnlen)6);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be ( N - 1 )*INCX too small for descending loops. 
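+
+   (For example, with n = 3 and incx = -2 this gives
+   kx = 1 - 2*(-2) = 5, i.e. x(1) is stored in X(5), x(2) in X(3) and
+   x(3) in X(1); the descending branches below first execute
+   kx += (n-1)*incx to land on the storage position of x(n).)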
*/ + + if (*incx <= 0) { + kx = 1 - (*n - 1) * *incx; + } else if (*incx != 1) { + kx = 1; + } + +/* Start the operations. In this version the elements of A are */ +/* accessed sequentially with one pass through A. */ + + if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) { + +/* Form x := A*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + if (x[i__2].r != 0. || x[i__2].i != 0.) { + i__2 = j; + temp.r = x[i__2].r, temp.i = x[i__2].i; + l = kplus1 - j; +/* Computing MAX */ + i__2 = 1, i__3 = j - *k; + i__4 = j - 1; + for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) { + i__2 = i__; + i__3 = i__; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__2].r = z__1.r, x[i__2].i = z__1.i; +/* L10: */ + } + if (nounit) { + i__4 = j; + i__2 = j; + i__3 = kplus1 + j * a_dim1; + z__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[ + i__3].i, z__1.i = x[i__2].r * a[i__3].i + + x[i__2].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L20: */ + } + } else { + jx = kx; + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = kplus1 - j; +/* Computing MAX */ + i__4 = 1, i__2 = j - *k; + i__3 = j - 1; + for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) { + i__4 = ix; + i__2 = ix; + i__5 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i, + z__2.i = temp.r * a[i__5].i + temp.i * a[ + i__5].r; + z__1.r = x[i__2].r + z__2.r, z__1.i = x[i__2].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix += *incx; +/* L30: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__2 = kplus1 + j * a_dim1; + z__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[ + i__2].i, z__1.i = x[i__4].r * a[i__2].i + + x[i__4].i * a[i__2].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx += *incx; + if (j > *k) { + kx += *incx; + } +/* L40: */ + } + } + } else { + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__1 = j; + if (x[i__1].r != 0. || x[i__1].i != 0.) { + i__1 = j; + temp.r = x[i__1].r, temp.i = x[i__1].i; + l = 1 - j; +/* Computing MIN */ + i__1 = *n, i__3 = j + *k; + i__4 = j + 1; + for (i__ = min(i__1,i__3); i__ >= i__4; --i__) { + i__1 = i__; + i__3 = i__; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__3].r + z__2.r, z__1.i = x[i__3].i + + z__2.i; + x[i__1].r = z__1.r, x[i__1].i = z__1.i; +/* L50: */ + } + if (nounit) { + i__4 = j; + i__1 = j; + i__3 = j * a_dim1 + 1; + z__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[ + i__3].i, z__1.i = x[i__1].r * a[i__3].i + + x[i__1].i * a[i__3].r; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + } + } +/* L60: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__4 = jx; + if (x[i__4].r != 0. || x[i__4].i != 0.) 
{ + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + ix = kx; + l = 1 - j; +/* Computing MIN */ + i__4 = *n, i__1 = j + *k; + i__3 = j + 1; + for (i__ = min(i__4,i__1); i__ >= i__3; --i__) { + i__4 = ix; + i__1 = ix; + i__2 = l + i__ + j * a_dim1; + z__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i, + z__2.i = temp.r * a[i__2].i + temp.i * a[ + i__2].r; + z__1.r = x[i__1].r + z__2.r, z__1.i = x[i__1].i + + z__2.i; + x[i__4].r = z__1.r, x[i__4].i = z__1.i; + ix -= *incx; +/* L70: */ + } + if (nounit) { + i__3 = jx; + i__4 = jx; + i__1 = j * a_dim1 + 1; + z__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[ + i__1].i, z__1.i = x[i__4].r * a[i__1].i + + x[i__4].i * a[i__1].r; + x[i__3].r = z__1.r, x[i__3].i = z__1.i; + } + } + jx -= *incx; + if (*n - j >= *k) { + kx -= *incx; + } +/* L80: */ + } + } + } + } else { + +/* Form x := A'*x or x := conjg( A' )*x. */ + + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + kplus1 = *k + 1; + if (*incx == 1) { + for (j = *n; j >= 1; --j) { + i__3 = j; + temp.r = x[i__3].r, temp.i = x[i__3].i; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = i__; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L90: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = i__; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L100: */ + } + } + i__3 = j; + x[i__3].r = temp.r, x[i__3].i = temp.i; +/* L110: */ + } + } else { + kx += (*n - 1) * *incx; + jx = kx; + for (j = *n; j >= 1; --j) { + i__3 = jx; + temp.r = x[i__3].r, temp.i = x[i__3].i; + kx -= *incx; + ix = kx; + l = kplus1 - j; + if (noconj) { + if (nounit) { + i__3 = kplus1 + j * a_dim1; + z__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i, + z__1.i = temp.r * a[i__3].i + temp.i * a[ + i__3].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + i__4 = l + i__ + j * a_dim1; + i__1 = ix; + z__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[ + i__1].i, z__2.i = a[i__4].r * x[i__1].i + + a[i__4].i * x[i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L120: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MAX */ + i__4 = 1, i__1 = j - *k; + i__3 = max(i__4,i__1); + for (i__ = j - 1; i__ >= i__3; --i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__4 = ix; + z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, + z__2.i = 
z__3.r * x[i__4].i + z__3.i * x[ + i__4].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix -= *incx; +/* L130: */ + } + } + i__3 = jx; + x[i__3].r = temp.r, x[i__3].i = temp.i; + jx -= *incx; +/* L140: */ + } + } + } else { + if (*incx == 1) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = j; + temp.r = x[i__4].r, temp.i = x[i__4].i; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = i__; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L150: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = i__; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; +/* L160: */ + } + } + i__4 = j; + x[i__4].r = temp.r, x[i__4].i = temp.i; +/* L170: */ + } + } else { + jx = kx; + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + i__4 = jx; + temp.r = x[i__4].r, temp.i = x[i__4].i; + kx += *incx; + ix = kx; + l = 1 - j; + if (noconj) { + if (nounit) { + i__4 = j * a_dim1 + 1; + z__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i, + z__1.i = temp.r * a[i__4].i + temp.i * a[ + i__4].r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + i__1 = l + i__ + j * a_dim1; + i__2 = ix; + z__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[ + i__2].i, z__2.i = a[i__1].r * x[i__2].i + + a[i__1].i * x[i__2].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L180: */ + } + } else { + if (nounit) { + d_cnjg(&z__2, &a[j * a_dim1 + 1]); + z__1.r = temp.r * z__2.r - temp.i * z__2.i, + z__1.i = temp.r * z__2.i + temp.i * + z__2.r; + temp.r = z__1.r, temp.i = z__1.i; + } +/* Computing MIN */ + i__1 = *n, i__2 = j + *k; + i__4 = min(i__1,i__2); + for (i__ = j + 1; i__ <= i__4; ++i__) { + d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); + i__1 = ix; + z__2.r = z__3.r * x[i__1].r - z__3.i * x[i__1].i, + z__2.i = z__3.r * x[i__1].i + z__3.i * x[ + i__1].r; + z__1.r = temp.r + z__2.r, z__1.i = temp.i + + z__2.i; + temp.r = z__1.r, temp.i = z__1.i; + ix += *incx; +/* L190: */ + } + } + i__4 = jx; + x[i__4].r = temp.r, x[i__4].i = temp.i; + jx += *incx; +/* L200: */ + } + } + } + } + + return 0; + +/* End of ZTBMV . 
*/ + +} /* ztbmv_ */ + diff --git a/blas/chbmv.f b/blas/fortran/chbmv.f similarity index 100% rename from blas/chbmv.f rename to blas/fortran/chbmv.f diff --git a/blas/chpmv.f b/blas/fortran/chpmv.f similarity index 100% rename from blas/chpmv.f rename to blas/fortran/chpmv.f diff --git a/blas/complexdots.f b/blas/fortran/complexdots.f similarity index 100% rename from blas/complexdots.f rename to blas/fortran/complexdots.f diff --git a/blas/ctbmv.f b/blas/fortran/ctbmv.f similarity index 100% rename from blas/ctbmv.f rename to blas/fortran/ctbmv.f diff --git a/blas/drotm.f b/blas/fortran/drotm.f similarity index 100% rename from blas/drotm.f rename to blas/fortran/drotm.f diff --git a/blas/drotmg.f b/blas/fortran/drotmg.f similarity index 100% rename from blas/drotmg.f rename to blas/fortran/drotmg.f diff --git a/blas/dsbmv.f b/blas/fortran/dsbmv.f similarity index 100% rename from blas/dsbmv.f rename to blas/fortran/dsbmv.f diff --git a/blas/dspmv.f b/blas/fortran/dspmv.f similarity index 100% rename from blas/dspmv.f rename to blas/fortran/dspmv.f diff --git a/blas/dtbmv.f b/blas/fortran/dtbmv.f similarity index 100% rename from blas/dtbmv.f rename to blas/fortran/dtbmv.f diff --git a/blas/lsame.f b/blas/fortran/lsame.f similarity index 100% rename from blas/lsame.f rename to blas/fortran/lsame.f diff --git a/blas/srotm.f b/blas/fortran/srotm.f similarity index 100% rename from blas/srotm.f rename to blas/fortran/srotm.f diff --git a/blas/srotmg.f b/blas/fortran/srotmg.f similarity index 100% rename from blas/srotmg.f rename to blas/fortran/srotmg.f diff --git a/blas/ssbmv.f b/blas/fortran/ssbmv.f similarity index 100% rename from blas/ssbmv.f rename to blas/fortran/ssbmv.f diff --git a/blas/sspmv.f b/blas/fortran/sspmv.f similarity index 100% rename from blas/sspmv.f rename to blas/fortran/sspmv.f diff --git a/blas/stbmv.f b/blas/fortran/stbmv.f similarity index 100% rename from blas/stbmv.f rename to blas/fortran/stbmv.f diff --git a/blas/zhbmv.f b/blas/fortran/zhbmv.f similarity index 100% rename from blas/zhbmv.f rename to blas/fortran/zhbmv.f diff --git a/blas/zhpmv.f b/blas/fortran/zhpmv.f similarity index 100% rename from blas/zhpmv.f rename to blas/fortran/zhpmv.f diff --git a/blas/ztbmv.f b/blas/fortran/ztbmv.f similarity index 100% rename from blas/ztbmv.f rename to blas/fortran/ztbmv.f From 56ca44ad1a12fda701139ba9da715f04388e04ca Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 11 Dec 2014 17:03:41 +0100 Subject: [PATCH 119/214] Use f2c generated code instead of the original fortran code, except for dotc/dotu. 
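
The f2c-generated sources keep the BLAS semantics but expose the C
calling convention produced by f2c: every argument is passed by
pointer, and each CHARACTER*1 argument gains a trailing ftnlen length
parameter. As a minimal sketch, this is how the generated ztbmv_ can
be called directly, assuming the usual f2c typedefs (integer,
doublecomplex, ftnlen) supplied by datatypes.h; the 3x3 upper
triangular band matrix with one super-diagonal is hypothetical and is
laid out column by column exactly as described in the routine header:

    #include "datatypes.h"

    extern int ztbmv_(char *uplo, char *trans, char *diag, integer *n,
                      integer *k, doublecomplex *a, integer *lda,
                      doublecomplex *x, integer *incx,
                      ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len);

    int main(void)
    {
        /* A = [ 1 2 0 ; 0 1 3 ; 0 0 1 ] with k = 1 super-diagonal.
         * Band storage, LDA = k + 1 = 2: the diagonal sits in row
         * k + 1 of each column, the first super-diagonal in row k
         * starting at column 2; a[0] is never referenced. */
        integer n = 3, k = 1, lda = 2, incx = 1;
        doublecomplex a[6] = { {0., 0.}, {1., 0.},     /* col 1:  -   a11 */
                               {2., 0.}, {1., 0.},     /* col 2: a12  a22 */
                               {3., 0.}, {1., 0.} };   /* col 3: a23  a33 */
        doublecomplex x[3] = { {1., 0.}, {1., 0.}, {1., 0.} };

        /* x := A*x; the trailing (ftnlen)1 arguments carry the length
         * of each character argument. */
        ztbmv_("U", "N", "N", &n, &k, a, &lda, x, &incx,
               (ftnlen)1, (ftnlen)1, (ftnlen)1);

        /* x now holds (3, 4, 1). */
        return 0;
    }

Callers going through the standard BLAS symbols are unaffected; the
extra length arguments only matter when the f2c translation units are
invoked directly from C as above.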
--- blas/CMakeLists.txt | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 2bc956a64..d0efb4188 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -14,33 +14,18 @@ endif() add_custom_target(blas) -set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp) +set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp + f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c + f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c + f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c + f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c + f2c/ztbmv.c f2c/d_cnjg.c f2c/r_cnjg.c + ) -if(EIGEN_USE_F2C_BLAS) - set(EigenBlas_SRCS ${EigenBlas_SRCS} - f2c/complexdots.c - f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c - f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c - f2c/chbmv.c f2c/sspmv.c - f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c - f2c/zhpmv.c - f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c f2c/ztbmv.c - f2c/d_cnjg.c f2c/r_cnjg.c - ) +if (EIGEN_Fortran_COMPILER_WORKS) + set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f) else() - if (EIGEN_Fortran_COMPILER_WORKS) - set(EigenBlas_SRCS ${EigenBlas_SRCS} - fortran/complexdots.f - fortran/srotm.f fortran/srotmg.f fortran/drotm.f fortran/drotmg.f - fortran/lsame.f fortran/dspmv.f fortran/ssbmv.f - fortran/chbmv.f fortran/sspmv.f - fortran/zhbmv.f fortran/chpmv.f fortran/dsbmv.f - fortran/zhpmv.f - fortran/dtbmv.f fortran/stbmv.f fortran/ctbmv.f fortran/ztbmv.f - ) - else() - message(WARNING " No Fortran compiler has been detected, the blas build will be incomplete. Define EIGEN_USE_F2C_BLAS to build BLAS without Fortran") - endif() + set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() add_library(eigen_blas_static ${EigenBlas_SRCS}) From 57ec399ec9e1e0c468079a09a9f6531d2be244a2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 13 Dec 2014 21:41:25 +0100 Subject: [PATCH 120/214] Remove unused fortran files --- blas/fortran/chbmv.f | 310 ----------------------------------- blas/fortran/chpmv.f | 272 ------------------------------- blas/fortran/ctbmv.f | 366 ------------------------------------------ blas/fortran/drotm.f | 147 ----------------- blas/fortran/drotmg.f | 206 ------------------------ blas/fortran/dsbmv.f | 304 ----------------------------------- blas/fortran/dspmv.f | 265 ------------------------------ blas/fortran/dtbmv.f | 335 -------------------------------------- blas/fortran/lsame.f | 85 ---------- blas/fortran/srotm.f | 148 ----------------- blas/fortran/srotmg.f | 208 ------------------------ blas/fortran/ssbmv.f | 306 ----------------------------------- blas/fortran/sspmv.f | 265 ------------------------------ blas/fortran/stbmv.f | 335 -------------------------------------- blas/fortran/zhbmv.f | 310 ----------------------------------- blas/fortran/zhpmv.f | 272 ------------------------------- blas/fortran/ztbmv.f | 366 ------------------------------------------ 17 files changed, 4500 deletions(-) delete mode 100644 blas/fortran/chbmv.f delete mode 100644 blas/fortran/chpmv.f delete mode 100644 blas/fortran/ctbmv.f delete mode 100644 blas/fortran/drotm.f delete mode 100644 blas/fortran/drotmg.f delete mode 100644 blas/fortran/dsbmv.f delete mode 100644 blas/fortran/dspmv.f delete mode 100644 blas/fortran/dtbmv.f delete mode 100644 blas/fortran/lsame.f delete mode 100644 blas/fortran/srotm.f delete mode 100644 blas/fortran/srotmg.f delete mode 100644 blas/fortran/ssbmv.f delete 
mode 100644 blas/fortran/sspmv.f delete mode 100644 blas/fortran/stbmv.f delete mode 100644 blas/fortran/zhbmv.f delete mode 100644 blas/fortran/zhpmv.f delete mode 100644 blas/fortran/ztbmv.f diff --git a/blas/fortran/chbmv.f b/blas/fortran/chbmv.f deleted file mode 100644 index 1b1c330ea..000000000 --- a/blas/fortran/chbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. 
INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHBMV . -* - END diff --git a/blas/fortran/chpmv.f b/blas/fortran/chpmv.f deleted file mode 100644 index 158be5a7b..000000000 --- a/blas/fortran/chpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - COMPLEX AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* CHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. 
-* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ONE - PARAMETER (ONE= (1.0E+0,0.0E+0)) - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,REAL -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*REAL(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*REAL(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + CONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of CHPMV . 
-* - END diff --git a/blas/fortran/ctbmv.f b/blas/fortran/ctbmv.f deleted file mode 100644 index 5a879fa01..000000000 --- a/blas/fortran/ctbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* CTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. 
LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - COMPLEX ZERO - PARAMETER (ZERO= (0.0E+0,0.0E+0)) -* .. -* .. Local Scalars .. - COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC CONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('CTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + CONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of CTBMV . -* - END diff --git a/blas/fortran/drotm.f b/blas/fortran/drotm.f deleted file mode 100644 index 63a3b1134..000000000 --- a/blas/fortran/drotm.f +++ /dev/null @@ -1,147 +0,0 @@ - SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5),DX(*),DY(*) -* .. -* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN -* (DY**T) -* -* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* DX (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of DX -* -* DY (input/output) DOUBLE PRECISION array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of DY -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. 
- DATA ZERO,TWO/0.D0,2.D0/ -* .. -* - DFLAG = DPARAM(1) - IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (DFLAG) 50,10,30 - 10 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W + Z*DH12 - DY(I) = W*DH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z - DY(I) = -W + DH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = DX(I) - Z = DY(I) - DX(I) = W*DH11 + Z*DH12 - DY(I) = W*DH21 + Z*DH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (DFLAG) 120,80,100 - 80 CONTINUE - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DO 90 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W + Z*DH12 - DY(KY) = W*DH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - DH11 = DPARAM(2) - DH22 = DPARAM(5) - DO 110 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z - DY(KY) = -W + DH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - DH11 = DPARAM(2) - DH12 = DPARAM(4) - DH21 = DPARAM(3) - DH22 = DPARAM(5) - DO 130 I = 1,N - W = DX(KX) - Z = DY(KY) - DX(KX) = W*DH11 + Z*DH12 - DY(KY) = W*DH21 + Z*DH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/fortran/drotmg.f b/blas/fortran/drotmg.f deleted file mode 100644 index 3ae647b08..000000000 --- a/blas/fortran/drotmg.f +++ /dev/null @@ -1,206 +0,0 @@ - SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM) -* .. Scalar Arguments .. - DOUBLE PRECISION DD1,DD2,DX1,DY1 -* .. -* .. Array Arguments .. - DOUBLE PRECISION DPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* -* DY2)**T. -* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 -* -* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) -* H=( ) ( ) ( ) ( ) -* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). -* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 -* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE -* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* DD1 (input/output) DOUBLE PRECISION -* -* DD2 (input/output) DOUBLE PRECISION -* -* DX1 (input/output) DOUBLE PRECISION -* -* DY1 (input) DOUBLE PRECISION -* -* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 -* DPARAM(1)=DFLAG -* DPARAM(2)=DH11 -* DPARAM(3)=DH21 -* DPARAM(4)=DH12 -* DPARAM(5)=DH22 -* -* ===================================================================== -* -* .. Local Scalars .. - DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP, - + DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC DABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/ - DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ -* .. - - IF (.NOT.DD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-DX1.. 
- GO TO 60 - 10 CONTINUE -* CASE-DD1-NONNEGATIVE - DP2 = DD2*DY1 - IF (.NOT.DP2.EQ.ZERO) GO TO 20 - DFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. - 20 CONTINUE - DP1 = DD1*DX1 - DQ2 = DP2*DY1 - DQ1 = DP1*DX1 -* - IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40 - DH21 = -DY1/DX1 - DH12 = DP2/DP1 -* - DU = ONE - DH12*DH21 -* - IF (.NOT.DU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 30 CONTINUE - DFLAG = ZERO - DD1 = DD1/DU - DD2 = DD2/DU - DX1 = DX1*DU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.DQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-DX1.. - GO TO 60 - 50 CONTINUE - DFLAG = ONE - DH11 = DP1/DP2 - DH22 = DX1/DY1 - DU = ONE + DH11*DH22 - DTEMP = DD2/DU - DD2 = DD1/DU - DD1 = DTEMP - DX1 = DY1*DU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-DX1.. - 60 CONTINUE - DFLAG = -ONE - DH11 = ZERO - DH12 = ZERO - DH21 = ZERO - DH22 = ZERO -* - DD1 = ZERO - DD2 = ZERO - DX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.DFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.DFLAG.EQ.ZERO) GO TO 80 - DH11 = ONE - DH22 = ONE - DFLAG = -ONE - GO TO 90 - 80 CONTINUE - DH21 = -ONE - DH12 = ONE - DFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.DD1.LE.RGAMSQ) GO TO 130 - IF (DD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - DD1 = DD1*GAM**2 - DX1 = DX1/GAM - DH11 = DH11/GAM - DH12 = DH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.DD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - DD1 = DD1/GAM**2 - DX1 = DX1*GAM - DH11 = DH11*GAM - DH12 = DH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190 - IF (DD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - DD2 = DD2*GAM**2 - DH21 = DH21/GAM - DH22 = DH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. - GO TO 70 - 210 CONTINUE - DD2 = DD2/GAM**2 - DH21 = DH21*GAM - DH22 = DH22*GAM - GO TO 200 - 220 CONTINUE - IF (DFLAG) 250,230,240 - 230 CONTINUE - DPARAM(3) = DH21 - DPARAM(4) = DH12 - GO TO 260 - 240 CONTINUE - DPARAM(2) = DH11 - DPARAM(5) = DH22 - GO TO 260 - 250 CONTINUE - DPARAM(2) = DH11 - DPARAM(3) = DH21 - DPARAM(4) = DH12 - DPARAM(5) = DH22 - 260 CONTINUE - DPARAM(1) = DFLAG - RETURN - END diff --git a/blas/fortran/dsbmv.f b/blas/fortran/dsbmv.f deleted file mode 100644 index 8c82d1fa1..000000000 --- a/blas/fortran/dsbmv.f +++ /dev/null @@ -1,304 +0,0 @@ - SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. 
-* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. 
(K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSBMV . -* - END diff --git a/blas/fortran/dspmv.f b/blas/fortran/dspmv.f deleted file mode 100644 index f6e121e76..000000000 --- a/blas/fortran/dspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE PRECISION ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* DSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. 
-* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - DOUBLE PRECISION. -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - DOUBLE PRECISION array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - DOUBLE PRECISION. -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE,ZERO - PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. 
-* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of DSPMV . -* - END diff --git a/blas/fortran/dtbmv.f b/blas/fortran/dtbmv.f deleted file mode 100644 index a87ffdeae..000000000 --- a/blas/fortran/dtbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE PRECISION A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* DTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. 
-* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - DOUBLE PRECISION array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER (ZERO=0.0D+0) -* .. -* .. Local Scalars .. - DOUBLE PRECISION TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. 
(K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('DTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of DTBMV . -* - END diff --git a/blas/fortran/lsame.f b/blas/fortran/lsame.f deleted file mode 100644 index f53690268..000000000 --- a/blas/fortran/lsame.f +++ /dev/null @@ -1,85 +0,0 @@ - LOGICAL FUNCTION LSAME(CA,CB) -* -* -- LAPACK auxiliary routine (version 3.1) -- -* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. -* November 2006 -* -* .. Scalar Arguments .. - CHARACTER CA,CB -* .. -* -* Purpose -* ======= -* -* LSAME returns .TRUE. if CA is the same letter as CB regardless of -* case. -* -* Arguments -* ========= -* -* CA (input) CHARACTER*1 -* -* CB (input) CHARACTER*1 -* CA and CB specify the single characters to be compared. 
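[The portable implementation that follows handles ASCII, EBCDIC and Prime character codes. On an ASCII-only platform the whole routine reduces to a one-line case-insensitive comparison, sketched here in C++ with a hypothetical name; illustration only, not code from this patch.]

    #include <cctype>

    // ASCII-only analogue of LSAME: compare two characters ignoring case.
    // The Fortran below additionally detects EBCDIC and Prime encodings.
    inline bool lsame_like(char ca, char cb) {
      return std::toupper(static_cast<unsigned char>(ca)) ==
             std::toupper(static_cast<unsigned char>(cb));
    }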
-* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC ICHAR -* .. -* .. Local Scalars .. - INTEGER INTA,INTB,ZCODE -* .. -* -* Test if the characters are equal -* - LSAME = CA .EQ. CB - IF (LSAME) RETURN -* -* Now test for equivalence if both characters are alphabetic. -* - ZCODE = ICHAR('Z') -* -* Use 'Z' rather than 'A' so that ASCII can be detected on Prime -* machines, on which ICHAR returns a value with bit 8 set. -* ICHAR('A') on Prime machines returns 193 which is the same as -* ICHAR('A') on an EBCDIC machine. -* - INTA = ICHAR(CA) - INTB = ICHAR(CB) -* - IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN -* -* ASCII is assumed - ZCODE is the ASCII code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32 - IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32 -* - ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN -* -* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or -* upper case 'Z'. -* - IF (INTA.GE.129 .AND. INTA.LE.137 .OR. - + INTA.GE.145 .AND. INTA.LE.153 .OR. - + INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64 - IF (INTB.GE.129 .AND. INTB.LE.137 .OR. - + INTB.GE.145 .AND. INTB.LE.153 .OR. - + INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64 -* - ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN -* -* ASCII is assumed, on Prime machines - ZCODE is the ASCII code -* plus 128 of either lower or upper case 'Z'. -* - IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32 - IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32 - END IF - LSAME = INTA .EQ. INTB -* -* RETURN -* -* End of LSAME -* - END diff --git a/blas/fortran/srotm.f b/blas/fortran/srotm.f deleted file mode 100644 index fc5a59333..000000000 --- a/blas/fortran/srotm.f +++ /dev/null @@ -1,148 +0,0 @@ - SUBROUTINE SROTM(N,SX,INCX,SY,INCY,SPARAM) -* .. Scalar Arguments .. - INTEGER INCX,INCY,N -* .. -* .. Array Arguments .. - REAL SPARAM(5),SX(*),SY(*) -* .. -* -* Purpose -* ======= -* -* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX -* -* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN -* (DX**T) -* -* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE -* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. -* -* -* Arguments -* ========= -* -* N (input) INTEGER -* number of elements in input vector(s) -* -* SX (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCX (input) INTEGER -* storage spacing between elements of SX -* -* SY (input/output) REAL array, dimension N -* double precision vector with N elements -* -* INCY (input) INTEGER -* storage spacing between elements of SY -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL SFLAG,SH11,SH12,SH21,SH22,TWO,W,Z,ZERO - INTEGER I,KX,KY,NSTEPS -* .. -* .. Data statements .. - DATA ZERO,TWO/0.E0,2.E0/ -* .. -* - SFLAG = SPARAM(1) - IF (N.LE.0 .OR. (SFLAG+TWO.EQ.ZERO)) GO TO 140 - IF (.NOT. 
(INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70 -* - NSTEPS = N*INCX - IF (SFLAG) 50,10,30 - 10 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 20 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W + Z*SH12 - SY(I) = W*SH21 + Z - 20 CONTINUE - GO TO 140 - 30 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 40 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z - SY(I) = -W + SH22*Z - 40 CONTINUE - GO TO 140 - 50 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 60 I = 1,NSTEPS,INCX - W = SX(I) - Z = SY(I) - SX(I) = W*SH11 + Z*SH12 - SY(I) = W*SH21 + Z*SH22 - 60 CONTINUE - GO TO 140 - 70 CONTINUE - KX = 1 - KY = 1 - IF (INCX.LT.0) KX = 1 + (1-N)*INCX - IF (INCY.LT.0) KY = 1 + (1-N)*INCY -* - IF (SFLAG) 120,80,100 - 80 CONTINUE - SH12 = SPARAM(4) - SH21 = SPARAM(3) - DO 90 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W + Z*SH12 - SY(KY) = W*SH21 + Z - KX = KX + INCX - KY = KY + INCY - 90 CONTINUE - GO TO 140 - 100 CONTINUE - SH11 = SPARAM(2) - SH22 = SPARAM(5) - DO 110 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z - SY(KY) = -W + SH22*Z - KX = KX + INCX - KY = KY + INCY - 110 CONTINUE - GO TO 140 - 120 CONTINUE - SH11 = SPARAM(2) - SH12 = SPARAM(4) - SH21 = SPARAM(3) - SH22 = SPARAM(5) - DO 130 I = 1,N - W = SX(KX) - Z = SY(KY) - SX(KX) = W*SH11 + Z*SH12 - SY(KY) = W*SH21 + Z*SH22 - KX = KX + INCX - KY = KY + INCY - 130 CONTINUE - 140 CONTINUE - RETURN - END diff --git a/blas/fortran/srotmg.f b/blas/fortran/srotmg.f deleted file mode 100644 index 7b3bd4272..000000000 --- a/blas/fortran/srotmg.f +++ /dev/null @@ -1,208 +0,0 @@ - SUBROUTINE SROTMG(SD1,SD2,SX1,SY1,SPARAM) -* .. Scalar Arguments .. - REAL SD1,SD2,SX1,SY1 -* .. -* .. Array Arguments .. - REAL SPARAM(5) -* .. -* -* Purpose -* ======= -* -* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS -* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* -* SY2)**T. -* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. -* -* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 -* -* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) -* H=( ) ( ) ( ) ( ) -* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). -* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 -* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE -* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) -* -* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE -* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE -* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. -* -* -* Arguments -* ========= -* -* -* SD1 (input/output) REAL -* -* SD2 (input/output) REAL -* -* SX1 (input/output) REAL -* -* SY1 (input) REAL -* -* -* SPARAM (input/output) REAL array, dimension 5 -* SPARAM(1)=SFLAG -* SPARAM(2)=SH11 -* SPARAM(3)=SH21 -* SPARAM(4)=SH12 -* SPARAM(5)=SH22 -* -* ===================================================================== -* -* .. Local Scalars .. - REAL GAM,GAMSQ,ONE,RGAMSQ,SFLAG,SH11,SH12,SH21,SH22,SP1,SP2,SQ1, - + SQ2,STEMP,SU,TWO,ZERO - INTEGER IGO -* .. -* .. Intrinsic Functions .. - INTRINSIC ABS -* .. -* .. Data statements .. -* - DATA ZERO,ONE,TWO/0.E0,1.E0,2.E0/ - DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ -* .. - - IF (.NOT.SD1.LT.ZERO) GO TO 10 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 10 CONTINUE -* CASE-SD1-NONNEGATIVE - SP2 = SD2*SY1 - IF (.NOT.SP2.EQ.ZERO) GO TO 20 - SFLAG = -TWO - GO TO 260 -* REGULAR-CASE.. 
- 20 CONTINUE - SP1 = SD1*SX1 - SQ2 = SP2*SY1 - SQ1 = SP1*SX1 -* - IF (.NOT.ABS(SQ1).GT.ABS(SQ2)) GO TO 40 - SH21 = -SY1/SX1 - SH12 = SP2/SP1 -* - SU = ONE - SH12*SH21 -* - IF (.NOT.SU.LE.ZERO) GO TO 30 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 30 CONTINUE - SFLAG = ZERO - SD1 = SD1/SU - SD2 = SD2/SU - SX1 = SX1*SU -* GO SCALE-CHECK.. - GO TO 100 - 40 CONTINUE - IF (.NOT.SQ2.LT.ZERO) GO TO 50 -* GO ZERO-H-D-AND-SX1.. - GO TO 60 - 50 CONTINUE - SFLAG = ONE - SH11 = SP1/SP2 - SH22 = SX1/SY1 - SU = ONE + SH11*SH22 - STEMP = SD2/SU - SD2 = SD1/SU - SD1 = STEMP - SX1 = SY1*SU -* GO SCALE-CHECK - GO TO 100 -* PROCEDURE..ZERO-H-D-AND-SX1.. - 60 CONTINUE - SFLAG = -ONE - SH11 = ZERO - SH12 = ZERO - SH21 = ZERO - SH22 = ZERO -* - SD1 = ZERO - SD2 = ZERO - SX1 = ZERO -* RETURN.. - GO TO 220 -* PROCEDURE..FIX-H.. - 70 CONTINUE - IF (.NOT.SFLAG.GE.ZERO) GO TO 90 -* - IF (.NOT.SFLAG.EQ.ZERO) GO TO 80 - SH11 = ONE - SH22 = ONE - SFLAG = -ONE - GO TO 90 - 80 CONTINUE - SH21 = -ONE - SH12 = ONE - SFLAG = -ONE - 90 CONTINUE - GO TO IGO(120,150,180,210) -* PROCEDURE..SCALE-CHECK - 100 CONTINUE - 110 CONTINUE - IF (.NOT.SD1.LE.RGAMSQ) GO TO 130 - IF (SD1.EQ.ZERO) GO TO 160 - ASSIGN 120 TO IGO -* FIX-H.. - GO TO 70 - 120 CONTINUE - SD1 = SD1*GAM**2 - SX1 = SX1/GAM - SH11 = SH11/GAM - SH12 = SH12/GAM - GO TO 110 - 130 CONTINUE - 140 CONTINUE - IF (.NOT.SD1.GE.GAMSQ) GO TO 160 - ASSIGN 150 TO IGO -* FIX-H.. - GO TO 70 - 150 CONTINUE - SD1 = SD1/GAM**2 - SX1 = SX1*GAM - SH11 = SH11*GAM - SH12 = SH12*GAM - GO TO 140 - 160 CONTINUE - 170 CONTINUE - IF (.NOT.ABS(SD2).LE.RGAMSQ) GO TO 190 - IF (SD2.EQ.ZERO) GO TO 220 - ASSIGN 180 TO IGO -* FIX-H.. - GO TO 70 - 180 CONTINUE - SD2 = SD2*GAM**2 - SH21 = SH21/GAM - SH22 = SH22/GAM - GO TO 170 - 190 CONTINUE - 200 CONTINUE - IF (.NOT.ABS(SD2).GE.GAMSQ) GO TO 220 - ASSIGN 210 TO IGO -* FIX-H.. - GO TO 70 - 210 CONTINUE - SD2 = SD2/GAM**2 - SH21 = SH21*GAM - SH22 = SH22*GAM - GO TO 200 - 220 CONTINUE - IF (SFLAG) 250,230,240 - 230 CONTINUE - SPARAM(3) = SH21 - SPARAM(4) = SH12 - GO TO 260 - 240 CONTINUE - SPARAM(2) = SH11 - SPARAM(5) = SH22 - GO TO 260 - 250 CONTINUE - SPARAM(2) = SH11 - SPARAM(3) = SH21 - SPARAM(4) = SH12 - SPARAM(5) = SH22 - 260 CONTINUE - SPARAM(1) = SFLAG - RETURN - END diff --git a/blas/fortran/ssbmv.f b/blas/fortran/ssbmv.f deleted file mode 100644 index 16893a295..000000000 --- a/blas/fortran/ssbmv.f +++ /dev/null @@ -1,306 +0,0 @@ - SUBROUTINE SSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - REAL . 
-* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the symmetric matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a symmetric band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - REAL array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. 
-* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*A(1,J) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*A(1,J) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + A(L+I,J)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSBMV . -* - END diff --git a/blas/fortran/sspmv.f b/blas/fortran/sspmv.f deleted file mode 100644 index 0b8449824..000000000 --- a/blas/fortran/sspmv.f +++ /dev/null @@ -1,265 +0,0 @@ - SUBROUTINE SSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - REAL ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - REAL AP(*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* SSPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n symmetric matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - REAL . 
-* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - REAL array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the symmetric matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - REAL . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ONE,ZERO - PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('SSPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. 
(INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*AP(KK) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*AP(KK) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + AP(K)*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of SSPMV . -* - END diff --git a/blas/fortran/stbmv.f b/blas/fortran/stbmv.f deleted file mode 100644 index c0b8f1136..000000000 --- a/blas/fortran/stbmv.f +++ /dev/null @@ -1,335 +0,0 @@ - SUBROUTINE STBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - REAL A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* STBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := A'*x. -* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - REAL array of DIMENSION ( LDA, n ). 
-* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - REAL array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - REAL ZERO - PARAMETER (ZERO=0.0E+0) -* .. -* .. Local Scalars .. - REAL TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('STBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. 
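[The band layout spelled out above, with the leading diagonal in row k+1 and the first super-diagonal starting at position 2 of row k, reduces to the following 0-based index computation. The helper name is hypothetical and bounds checking is left to the caller; a sketch, not code from this patch.]

    #include <cstddef>

    // Position of a(i,j), j-k <= i <= j, in a column-major band array AB
    // with k super-diagonals and leading dimension lda >= k + 1 (0-based).
    // The diagonal i == j lands in row k, i.e. row (k+1) in 1-based terms.
    inline std::size_t band_upper_index(std::size_t lda, std::size_t k,
                                        std::size_t i, std::size_t j) {
      return (k + i - j) + j * lda;
    }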
-* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 100 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - X(J) = TEMP - 100 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 120 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 110 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 110 CONTINUE - X(JX) = TEMP - JX = JX - INCX - 120 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 140 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 130 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 130 CONTINUE - X(J) = TEMP - 140 CONTINUE - ELSE - JX = KX - DO 160 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 150 CONTINUE - X(JX) = TEMP - JX = JX + INCX - 160 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of STBMV . -* - END diff --git a/blas/fortran/zhbmv.f b/blas/fortran/zhbmv.f deleted file mode 100644 index bca0da5fc..000000000 --- a/blas/fortran/zhbmv.f +++ /dev/null @@ -1,310 +0,0 @@ - SUBROUTINE ZHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,K,LDA,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*),Y(*) -* .. -* -* Purpose -* ======= -* -* ZHBMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian band matrix, with k super-diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the band matrix A is being supplied as -* follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* being supplied. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* being supplied. 
-* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry, K specifies the number of super-diagonals of the -* matrix A. K must satisfy 0 .le. K. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer the upper -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the hermitian matrix, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer the lower -* triangular part of a hermitian band matrix from conventional -* full matrix storage to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the -* vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of DIMENSION at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the -* vector y. On exit, Y is overwritten by the updated vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. 
External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG,MAX,MIN -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (K.LT.0) THEN - INFO = 3 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 6 - ELSE IF (INCX.EQ.0) THEN - INFO = 8 - ELSE IF (INCY.EQ.0) THEN - INFO = 11 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array A -* are accessed sequentially with one pass through A. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - IF (LSAME(UPLO,'U')) THEN -* -* Form y when upper triangle of A is stored. -* - KPLUS1 = K + 1 - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - L = KPLUS1 - J - DO 50 I = MAX(1,J-K),J - 1 - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - L = KPLUS1 - J - DO 70 I = MAX(1,J-K),J - 1 - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(A(KPLUS1,J)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - IF (J.GT.K) THEN - KX = KX + INCX - KY = KY + INCY - END IF - 80 CONTINUE - END IF - ELSE -* -* Form y when lower triangle of A is stored. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(A(1,J)) - L = 1 - J - DO 90 I = J + 1,MIN(N,J+K) - Y(I) = Y(I) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(I) - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(A(1,J)) - L = 1 - J - IX = JX - IY = JY - DO 110 I = J + 1,MIN(N,J+K) - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*A(L+I,J) - TEMP2 = TEMP2 + DCONJG(A(L+I,J))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHBMV . -* - END diff --git a/blas/fortran/zhpmv.f b/blas/fortran/zhpmv.f deleted file mode 100644 index b686108b3..000000000 --- a/blas/fortran/zhpmv.f +++ /dev/null @@ -1,272 +0,0 @@ - SUBROUTINE ZHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY) -* .. Scalar Arguments .. - DOUBLE COMPLEX ALPHA,BETA - INTEGER INCX,INCY,N - CHARACTER UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX AP(*),X(*),Y(*) -* .. 
-* -* Purpose -* ======= -* -* ZHPMV performs the matrix-vector operation -* -* y := alpha*A*x + beta*y, -* -* where alpha and beta are scalars, x and y are n element vectors and -* A is an n by n hermitian matrix, supplied in packed form. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the upper or lower -* triangular part of the matrix A is supplied in the packed -* array AP as follows: -* -* UPLO = 'U' or 'u' The upper triangular part of A is -* supplied in AP. -* -* UPLO = 'L' or 'l' The lower triangular part of A is -* supplied in AP. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* ALPHA - COMPLEX*16 . -* On entry, ALPHA specifies the scalar alpha. -* Unchanged on exit. -* -* AP - COMPLEX*16 array of DIMENSION at least -* ( ( n*( n + 1 ) )/2 ). -* Before entry with UPLO = 'U' or 'u', the array AP must -* contain the upper triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) -* and a( 2, 2 ) respectively, and so on. -* Before entry with UPLO = 'L' or 'l', the array AP must -* contain the lower triangular part of the hermitian matrix -* packed sequentially, column by column, so that AP( 1 ) -* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) -* and a( 3, 1 ) respectively, and so on. -* Note that the imaginary parts of the diagonal elements need -* not be set and are assumed to be zero. -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. -* Unchanged on exit. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* BETA - COMPLEX*16 . -* On entry, BETA specifies the scalar beta. When BETA is -* supplied as zero then Y need not be set on input. -* Unchanged on exit. -* -* Y - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCY ) ). -* Before entry, the incremented array Y must contain the n -* element vector y. On exit, Y is overwritten by the updated -* vector y. -* -* INCY - INTEGER. -* On entry, INCY specifies the increment for the elements of -* Y. INCY must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ONE - PARAMETER (ONE= (1.0D+0,0.0D+0)) - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP1,TEMP2 - INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DBLE,DCONJG -* .. -* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (N.LT.0) THEN - INFO = 2 - ELSE IF (INCX.EQ.0) THEN - INFO = 6 - ELSE IF (INCY.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZHPMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF ((N.EQ.0) .OR. 
((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN -* -* Set up the start points in X and Y. -* - IF (INCX.GT.0) THEN - KX = 1 - ELSE - KX = 1 - (N-1)*INCX - END IF - IF (INCY.GT.0) THEN - KY = 1 - ELSE - KY = 1 - (N-1)*INCY - END IF -* -* Start the operations. In this version the elements of the array AP -* are accessed sequentially with one pass through AP. -* -* First form y := beta*y. -* - IF (BETA.NE.ONE) THEN - IF (INCY.EQ.1) THEN - IF (BETA.EQ.ZERO) THEN - DO 10 I = 1,N - Y(I) = ZERO - 10 CONTINUE - ELSE - DO 20 I = 1,N - Y(I) = BETA*Y(I) - 20 CONTINUE - END IF - ELSE - IY = KY - IF (BETA.EQ.ZERO) THEN - DO 30 I = 1,N - Y(IY) = ZERO - IY = IY + INCY - 30 CONTINUE - ELSE - DO 40 I = 1,N - Y(IY) = BETA*Y(IY) - IY = IY + INCY - 40 CONTINUE - END IF - END IF - END IF - IF (ALPHA.EQ.ZERO) RETURN - KK = 1 - IF (LSAME(UPLO,'U')) THEN -* -* Form y when AP contains the upper triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 60 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - K = KK - DO 50 I = 1,J - 1 - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 50 CONTINUE - Y(J) = Y(J) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - KK = KK + J - 60 CONTINUE - ELSE - JX = KX - JY = KY - DO 80 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - IX = KX - IY = KY - DO 70 K = KK,KK + J - 2 - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - IX = IX + INCX - IY = IY + INCY - 70 CONTINUE - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK+J-1)) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + J - 80 CONTINUE - END IF - ELSE -* -* Form y when AP contains the lower triangle. -* - IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN - DO 100 J = 1,N - TEMP1 = ALPHA*X(J) - TEMP2 = ZERO - Y(J) = Y(J) + TEMP1*DBLE(AP(KK)) - K = KK + 1 - DO 90 I = J + 1,N - Y(I) = Y(I) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(I) - K = K + 1 - 90 CONTINUE - Y(J) = Y(J) + ALPHA*TEMP2 - KK = KK + (N-J+1) - 100 CONTINUE - ELSE - JX = KX - JY = KY - DO 120 J = 1,N - TEMP1 = ALPHA*X(JX) - TEMP2 = ZERO - Y(JY) = Y(JY) + TEMP1*DBLE(AP(KK)) - IX = JX - IY = JY - DO 110 K = KK + 1,KK + N - J - IX = IX + INCX - IY = IY + INCY - Y(IY) = Y(IY) + TEMP1*AP(K) - TEMP2 = TEMP2 + DCONJG(AP(K))*X(IX) - 110 CONTINUE - Y(JY) = Y(JY) + ALPHA*TEMP2 - JX = JX + INCX - JY = JY + INCY - KK = KK + (N-J+1) - 120 CONTINUE - END IF - END IF -* - RETURN -* -* End of ZHPMV . -* - END diff --git a/blas/fortran/ztbmv.f b/blas/fortran/ztbmv.f deleted file mode 100644 index 7c85c1b55..000000000 --- a/blas/fortran/ztbmv.f +++ /dev/null @@ -1,366 +0,0 @@ - SUBROUTINE ZTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) -* .. Scalar Arguments .. - INTEGER INCX,K,LDA,N - CHARACTER DIAG,TRANS,UPLO -* .. -* .. Array Arguments .. - DOUBLE COMPLEX A(LDA,*),X(*) -* .. -* -* Purpose -* ======= -* -* ZTBMV performs one of the matrix-vector operations -* -* x := A*x, or x := A'*x, or x := conjg( A' )*x, -* -* where x is an n element vector and A is an n by n unit, or non-unit, -* upper or lower triangular band matrix, with ( k + 1 ) diagonals. -* -* Arguments -* ========== -* -* UPLO - CHARACTER*1. -* On entry, UPLO specifies whether the matrix is an upper or -* lower triangular matrix as follows: -* -* UPLO = 'U' or 'u' A is an upper triangular matrix. -* -* UPLO = 'L' or 'l' A is a lower triangular matrix. -* -* Unchanged on exit. -* -* TRANS - CHARACTER*1. -* On entry, TRANS specifies the operation to be performed as -* follows: -* -* TRANS = 'N' or 'n' x := A*x. -* -* TRANS = 'T' or 't' x := A'*x. -* -* TRANS = 'C' or 'c' x := conjg( A' )*x. 
-* -* Unchanged on exit. -* -* DIAG - CHARACTER*1. -* On entry, DIAG specifies whether or not A is unit -* triangular as follows: -* -* DIAG = 'U' or 'u' A is assumed to be unit triangular. -* -* DIAG = 'N' or 'n' A is not assumed to be unit -* triangular. -* -* Unchanged on exit. -* -* N - INTEGER. -* On entry, N specifies the order of the matrix A. -* N must be at least zero. -* Unchanged on exit. -* -* K - INTEGER. -* On entry with UPLO = 'U' or 'u', K specifies the number of -* super-diagonals of the matrix A. -* On entry with UPLO = 'L' or 'l', K specifies the number of -* sub-diagonals of the matrix A. -* K must satisfy 0 .le. K. -* Unchanged on exit. -* -* A - COMPLEX*16 array of DIMENSION ( LDA, n ). -* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) -* by n part of the array A must contain the upper triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row -* ( k + 1 ) of the array, the first super-diagonal starting at -* position 2 in row k, and so on. The top left k by k triangle -* of the array A is not referenced. -* The following program segment will transfer an upper -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = K + 1 - J -* DO 10, I = MAX( 1, J - K ), J -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) -* by n part of the array A must contain the lower triangular -* band part of the matrix of coefficients, supplied column by -* column, with the leading diagonal of the matrix in row 1 of -* the array, the first sub-diagonal starting at position 1 in -* row 2, and so on. The bottom right k by k triangle of the -* array A is not referenced. -* The following program segment will transfer a lower -* triangular band matrix from conventional full matrix storage -* to band storage: -* -* DO 20, J = 1, N -* M = 1 - J -* DO 10, I = J, MIN( N, J + K ) -* A( M + I, J ) = matrix( I, J ) -* 10 CONTINUE -* 20 CONTINUE -* -* Note that when DIAG = 'U' or 'u' the elements of the array A -* corresponding to the diagonal elements of the matrix are not -* referenced, but are assumed to be unity. -* Unchanged on exit. -* -* LDA - INTEGER. -* On entry, LDA specifies the first dimension of A as declared -* in the calling (sub) program. LDA must be at least -* ( k + 1 ). -* Unchanged on exit. -* -* X - COMPLEX*16 array of dimension at least -* ( 1 + ( n - 1 )*abs( INCX ) ). -* Before entry, the incremented array X must contain the n -* element vector x. On exit, X is overwritten with the -* tranformed vector x. -* -* INCX - INTEGER. -* On entry, INCX specifies the increment for the elements of -* X. INCX must not be zero. -* Unchanged on exit. -* -* Further Details -* =============== -* -* Level 2 Blas routine. -* -* -- Written on 22-October-1986. -* Jack Dongarra, Argonne National Lab. -* Jeremy Du Croz, Nag Central Office. -* Sven Hammarling, Nag Central Office. -* Richard Hanson, Sandia National Labs. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE COMPLEX ZERO - PARAMETER (ZERO= (0.0D+0,0.0D+0)) -* .. -* .. Local Scalars .. - DOUBLE COMPLEX TEMP - INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L - LOGICAL NOCONJ,NOUNIT -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME -* .. -* .. External Subroutines .. - EXTERNAL XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC DCONJG,MAX,MIN -* .. 
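[In the TRANS = 'C' branches below, DCONJG is applied to every element of A read from the band, including the diagonal in the non-unit case. Stripped of the band bookkeeping, the effect is the in-place update sketched here for a dense upper-triangular matrix; the descending column order keeps the not-yet-overwritten entries of x available. Hypothetical helper, 0-based, illustration only.]

    #include <complex>
    #include <vector>

    // x := conj(A)^T * x for a dense, non-unit, upper-triangular A stored
    // column-major with leading dimension n (not part of this routine).
    void upper_adjoint_trmv(int n, const std::vector<std::complex<double> >& A,
                            std::vector<std::complex<double> >& x) {
      for (int j = n - 1; j >= 0; --j) {      // descending, as in DO 110 below
        std::complex<double> temp = std::conj(A[j + j * n]) * x[j];  // diagonal
        for (int i = 0; i < j; ++i)
          temp += std::conj(A[i + j * n]) * x[i];  // conjugated column above
        x[j] = temp;
      }
    }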
-* -* Test the input parameters. -* - INFO = 0 - IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN - INFO = 1 - ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. - + .NOT.LSAME(TRANS,'C')) THEN - INFO = 2 - ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN - INFO = 3 - ELSE IF (N.LT.0) THEN - INFO = 4 - ELSE IF (K.LT.0) THEN - INFO = 5 - ELSE IF (LDA.LT. (K+1)) THEN - INFO = 7 - ELSE IF (INCX.EQ.0) THEN - INFO = 9 - END IF - IF (INFO.NE.0) THEN - CALL XERBLA('ZTBMV ',INFO) - RETURN - END IF -* -* Quick return if possible. -* - IF (N.EQ.0) RETURN -* - NOCONJ = LSAME(TRANS,'T') - NOUNIT = LSAME(DIAG,'N') -* -* Set up the start point in X if the increment is not unity. This -* will be ( N - 1 )*INCX too small for descending loops. -* - IF (INCX.LE.0) THEN - KX = 1 - (N-1)*INCX - ELSE IF (INCX.NE.1) THEN - KX = 1 - END IF -* -* Start the operations. In this version the elements of A are -* accessed sequentially with one pass through A. -* - IF (LSAME(TRANS,'N')) THEN -* -* Form x := A*x. -* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 20 J = 1,N - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = KPLUS1 - J - DO 10 I = MAX(1,J-K),J - 1 - X(I) = X(I) + TEMP*A(L+I,J) - 10 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J) - END IF - 20 CONTINUE - ELSE - JX = KX - DO 40 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = KPLUS1 - J - DO 30 I = MAX(1,J-K),J - 1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX + INCX - 30 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J) - END IF - JX = JX + INCX - IF (J.GT.K) KX = KX + INCX - 40 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 60 J = N,1,-1 - IF (X(J).NE.ZERO) THEN - TEMP = X(J) - L = 1 - J - DO 50 I = MIN(N,J+K),J + 1,-1 - X(I) = X(I) + TEMP*A(L+I,J) - 50 CONTINUE - IF (NOUNIT) X(J) = X(J)*A(1,J) - END IF - 60 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 80 J = N,1,-1 - IF (X(JX).NE.ZERO) THEN - TEMP = X(JX) - IX = KX - L = 1 - J - DO 70 I = MIN(N,J+K),J + 1,-1 - X(IX) = X(IX) + TEMP*A(L+I,J) - IX = IX - INCX - 70 CONTINUE - IF (NOUNIT) X(JX) = X(JX)*A(1,J) - END IF - JX = JX - INCX - IF ((N-J).GE.K) KX = KX - INCX - 80 CONTINUE - END IF - END IF - ELSE -* -* Form x := A'*x or x := conjg( A' )*x. 
-* - IF (LSAME(UPLO,'U')) THEN - KPLUS1 = K + 1 - IF (INCX.EQ.1) THEN - DO 110 J = N,1,-1 - TEMP = X(J) - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 90 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(I) - 90 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 100 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 100 CONTINUE - END IF - X(J) = TEMP - 110 CONTINUE - ELSE - KX = KX + (N-1)*INCX - JX = KX - DO 140 J = N,1,-1 - TEMP = X(JX) - KX = KX - INCX - IX = KX - L = KPLUS1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J) - DO 120 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX - INCX - 120 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(KPLUS1,J)) - DO 130 I = J - 1,MAX(1,J-K),-1 - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX - INCX - 130 CONTINUE - END IF - X(JX) = TEMP - JX = JX - INCX - 140 CONTINUE - END IF - ELSE - IF (INCX.EQ.1) THEN - DO 170 J = 1,N - TEMP = X(J) - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 150 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(I) - 150 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 160 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(I) - 160 CONTINUE - END IF - X(J) = TEMP - 170 CONTINUE - ELSE - JX = KX - DO 200 J = 1,N - TEMP = X(JX) - KX = KX + INCX - IX = KX - L = 1 - J - IF (NOCONJ) THEN - IF (NOUNIT) TEMP = TEMP*A(1,J) - DO 180 I = J + 1,MIN(N,J+K) - TEMP = TEMP + A(L+I,J)*X(IX) - IX = IX + INCX - 180 CONTINUE - ELSE - IF (NOUNIT) TEMP = TEMP*DCONJG(A(1,J)) - DO 190 I = J + 1,MIN(N,J+K) - TEMP = TEMP + DCONJG(A(L+I,J))*X(IX) - IX = IX + INCX - 190 CONTINUE - END IF - X(JX) = TEMP - JX = JX + INCX - 200 CONTINUE - END IF - END IF - END IF -* - RETURN -* -* End of ZTBMV . 
-* - END From 608733415ac36824f9c2a9b11ee0ee755f701a77 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Fri, 12 Dec 2014 12:01:03 +0100 Subject: [PATCH 121/214] Free functions should only be declared as static in separate compilation units (grafted from d85abc89c5782eebe2b0f2c920d6c495b4dcc091 ) --- unsupported/Eigen/OpenGLSupport | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport index 4ed545174..6ca1b1217 100644 --- a/unsupported/Eigen/OpenGLSupport +++ b/unsupported/Eigen/OpenGLSupport @@ -178,11 +178,11 @@ template void glLoadMatrix(const Transform& t) template void glLoadMatrix(const Transform& t) { glLoadMatrix(t.matrix()); } template void glLoadMatrix(const Transform& t) { glLoadMatrix(Transform(t).matrix()); } -static void glRotate(const Rotation2D& rot) +inline void glRotate(const Rotation2D& rot) { glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f); } -static void glRotate(const Rotation2D& rot) +inline void glRotate(const Rotation2D& rot) { glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0); } @@ -246,18 +246,18 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev) #ifdef GL_VERSION_2_0 -static void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } -static void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } +inline void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } +inline void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } -static void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } -static void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } +inline void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } +inline void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } -static void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } -static void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } +inline void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } +inline void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } -static void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } -static void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } -static void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } +inline void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } +inline void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } +inline void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } EIGEN_GL_FUNC1_DECLARATION (glUniform,GLint,const) @@ -294,9 +294,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,3,Matrix #ifdef GL_VERSION_3_0 -static void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } -static void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } -static void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } +inline void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } +inline void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } +inline void glUniform4uiv_ei (GLint loc, const 
unsigned int* v) { glUniform4uiv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei) EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei) @@ -305,9 +305,9 @@ EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei) #endif #ifdef GL_ARB_gpu_shader_fp64 -static void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } -static void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } -static void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } +inline void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); } +inline void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } +inline void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 2,2dv_ei) EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 3,3dv_ei) From dcad508986b85a49777dd15a00b88b61e2ef6975 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 15 Dec 2014 12:45:29 +0100 Subject: [PATCH 122/214] At least CMAKE 2.8.4 is required for WORKING_DIRECTORY option in add_test --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f9610a522..00287c9bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ project(Eigen) -cmake_minimum_required(VERSION 2.8.2) +cmake_minimum_required(VERSION 2.8.4) # guard against in-source builds From 7dad5f797e8c270be5f32aee154f6660df2242f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 16 Dec 2014 13:33:43 +0100 Subject: [PATCH 123/214] bug #821: workaround MSVC 2013 issue with using Base::Base::operator= --- Eigen/src/Core/MapBase.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 3c67edae5..5d51548cd 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -243,7 +243,11 @@ template class MapBase return derived(); } - using Base::Base::operator=; + // In theory MapBase should not make a using Base::operator=, + // and thus we should directly do: using Base::Base::operator=; + // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase + // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: + using Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS From 99501a2c4c015a0f5a4a11d3cfbcdf1d3a39fe49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 16 Dec 2014 16:23:47 +0100 Subject: [PATCH 124/214] Fix wrong negative in nullary unit test when extended precision is used (FPU). --- test/nullary.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/nullary.cpp b/test/nullary.cpp index 5408d88b2..fbc721a1a 100644 --- a/test/nullary.cpp +++ b/test/nullary.cpp @@ -80,7 +80,9 @@ void testVectorType(const VectorType& base) Matrix col_vector(size); row_vector.setLinSpaced(size,low,high); col_vector.setLinSpaced(size,low,high); - VERIFY( row_vector.isApprox(col_vector.transpose(), NumTraits::epsilon())); + // when using the extended precision (e.g., FPU) the relative error might exceed 1 bit + // when computing the squared sum in isApprox, thus the 2x factor. 
+ VERIFY( row_vector.isApprox(col_vector.transpose(), Scalar(2)*NumTraits::epsilon())); Matrix size_changer(size+50); size_changer.setLinSpaced(size,low,high); From f806c23012937f0acf55203dcb1f9d532bd0080b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 16 Dec 2014 16:50:30 +0100 Subject: [PATCH 125/214] Fix false negatives in geo_transformations unit tests --- test/geo_transformations.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index e189217eb..042dd0329 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -99,10 +99,16 @@ template void transformations() Scalar a = internal::random(-Scalar(M_PI), Scalar(M_PI)); Scalar s0 = internal::random(), s1 = internal::random(); + + while(v0.norm() < test_precision()) v0 = Vector3::Random(); + while(v1.norm() < test_precision()) v1 = Vector3::Random(); VERIFY_IS_APPROX(v0, AngleAxisx(a, v0.normalized()) * v0); VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0); - VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + if(abs(cos(a)) > test_precision()) + { + VERIFY_IS_APPROX(cos(a)*v0.squaredNorm(), v0.dot(AngleAxisx(a, v0.unitOrthogonal()) * v0)); + } m = AngleAxisx(a, v0.normalized()).toRotationMatrix().adjoint(); VERIFY_IS_APPROX(Matrix3::Identity(), m * AngleAxisx(a, v0.normalized())); VERIFY_IS_APPROX(Matrix3::Identity(), AngleAxisx(a, v0.normalized()) * m); @@ -123,11 +129,18 @@ template void transformations() // angle-axis conversion AngleAxisx aa = AngleAxisx(q1); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + + if(abs(aa.angle()) > NumTraits::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } aa.fromRotationMatrix(aa.toRotationMatrix()); VERIFY_IS_APPROX(q1 * v1, Quaternionx(aa) * v1); - VERIFY_IS_NOT_APPROX(q1 * v1, Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1); + if(abs(aa.angle()) > NumTraits::dummy_precision()) + { + VERIFY( !(q1 * v1).isApprox(Quaternionx(AngleAxisx(aa.angle()*2,aa.axis())) * v1) ); + } // AngleAxis VERIFY_IS_APPROX(AngleAxisx(a,v1.normalized()).toRotationMatrix(), @@ -347,7 +360,9 @@ template void transformations() // test transform inversion t0.setIdentity(); t0.translate(v0); - t0.linear().setRandom(); + do { + t0.linear().setRandom(); + } while(t0.linear().jacobiSvd().singularValues()(2)()); Matrix4 t044 = Matrix4::Zero(); t044(3,3) = 1; t044.block(0,0,t0.matrix().rows(),4) = t0.matrix(); From b8d9eaa19bd8501e8007c4c3633904643b44617e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 13 Dec 2014 22:16:39 +0100 Subject: [PATCH 126/214] Use true compile time "if" for Transform::makeAffine --- Eigen/src/Geometry/Transform.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 7ebde6803..d33fc24db 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -78,6 +78,8 @@ struct traits > }; }; +template struct transform_make_affine; + } // end namespace internal /** \geometry_module \ingroup Geometry_Module @@ -246,8 +248,7 @@ public: inline Transform() { check_template_params(); - if (int(Mode)==Affine) - makeAffine(); + internal::transform_make_affine<(int(Mode)==Affine) ? 
Affine : AffineCompact>::run(m_matrix); } inline Transform(const Transform& other) @@ -610,11 +611,7 @@ public: */ void makeAffine() { - if(int(Mode)!=int(AffineCompact)) - { - matrix().template block<1,Dim>(Dim,0).setZero(); - matrix().coeffRef(Dim,Dim) = Scalar(1); - } + internal::transform_make_affine::run(m_matrix); } /** \internal @@ -1102,6 +1099,24 @@ Transform::fromPositionOrientationScale(const MatrixBas namespace internal { +template +struct transform_make_affine +{ + template + static void run(MatrixType &mat) + { + static const int Dim = MatrixType::ColsAtCompileTime-1; + mat.template block<1,Dim>(Dim,0).setZero(); + mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); + } +}; + +template<> +struct transform_make_affine +{ + template static void run(MatrixType &) { } +}; + // selector needed to avoid taking the inverse of a 3x4 matrix template struct projective_transform_inverse From 25c7d9164f45119fa20dc6af2fa451d278c5f285 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 18 Dec 2014 22:58:15 +0100 Subject: [PATCH 127/214] bug #920: fix MSVC 2015 compilation issues --- Eigen/src/Core/MapBase.h | 11 +++++------ Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 5d51548cd..1589cbaae 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -172,6 +172,7 @@ template class MapBase template class MapBase : public MapBase { + typedef MapBase ReadOnlyMapBase; public: typedef MapBase Base; @@ -239,15 +240,13 @@ template class MapBase EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) { - Base::Base::operator=(other); + ReadOnlyMapBase::Base::operator=(other); return derived(); } - // In theory MapBase should not make a using Base::operator=, - // and thus we should directly do: using Base::Base::operator=; - // However, this would confuse recent MSVC 2013 (bug 821), and since MapBase - // has operator= to make ICC 11 happy, we can also make MSVC 2013 happy as follow: - using Base::operator=; + // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, + // see bugs 821 and 920. + using ReadOnlyMapBase::Base::operator=; }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bc26043d7..687ba41dd 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -73,7 +73,7 @@ /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) - #define EIGEN_COMP_MSVC_STRICT 1 + #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 #endif @@ -592,7 +592,7 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT +#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) From f5f6e2c6f46a8999ee36ce0c7adc62098d8d93d2 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 19 Dec 2014 14:41:59 +0100 Subject: [PATCH 128/214] bug #921: fix utilization of bitwise operation on enums in first_aligned --- Eigen/src/Core/util/Memory.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index a54ccaedc..bacf236fb 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -523,9 +523,8 @@ template inline void conditional_aligned_delete_auto(T * template inline Index first_aligned(const Scalar* array, Index size) { - enum { PacketSize = packet_traits::size, - PacketAlignedMask = PacketSize-1 - }; + static const Index PacketSize = packet_traits::size; + static const Index PacketAlignedMask = PacketSize-1; if(PacketSize==1) { From db5b0741b58cbf5997f75a9e2590e633bfbbc13a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 4 Jan 2015 21:39:50 +0100 Subject: [PATCH 129/214] Fix bug #925: typo in MatLab versions of middleRows --- doc/AsciiQuickReference.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt index c4d021624..b5bdfa1f4 100644 --- a/doc/AsciiQuickReference.txt +++ b/doc/AsciiQuickReference.txt @@ -67,10 +67,10 @@ P.rightCols() // P(:, end-cols+1:end) P.rightCols(cols) // P(:, end-cols+1:end) P.topRows() // P(1:rows, :) P.topRows(rows) // P(1:rows, :) -P.middleRows(i) // P(:, i+1:i+rows) -P.middleRows(i, rows) // P(:, i+1:i+rows) -P.bottomRows() // P(:, end-rows+1:end) -P.bottomRows(rows) // P(:, end-rows+1:end) +P.middleRows(i) // P(i+1:i+rows, :) +P.middleRows(i, rows) // P(i+1:i+rows, :) +P.bottomRows() // P(end-rows+1:end, :) +P.bottomRows(rows) // P(end-rows+1:end, :) P.topLeftCorner(rows, cols) // P(1:rows, 1:cols) P.topRightCorner(rows, cols) // P(1:rows, end-cols+1:end) P.bottomLeftCorner(rows, cols) // P(end-rows+1:end, 1:cols) From 9f98650d0a82d4757afb4503ce6f2b6f61763463 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 6 Jan 2015 09:29:13 -0800 Subject: [PATCH 130/214] Ensured that contractions that can be reduced to a matrix vector product work correctly even when the input coefficients aren't aligned. 
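A contraction over a single index pair between a rank-2 and a rank-1 tensor is mathematically just a matrix-vector product, which is why it can be lowered onto the GEMV kernel modified below. A minimal sketch of the equivalence being tested (hypothetical standalone snippet, mirroring the new unit test; none of these names are part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    using Eigen::Tensor;

    void matrix_vector_contraction() {
      Tensor<float, 2> mat(30, 50);
      Tensor<float, 1> vec(50);
      mat.setRandom();
      vec.setRandom();
      // Pair dimension 1 of mat with dimension 0 of vec:
      // result(i) = sum_j mat(i, j) * vec(j), i.e. a matrix-vector product.
      typedef Tensor<float, 1>::DimensionPair DimPair;
      Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
      Tensor<float, 1> result = mat.contract(vec, dims);
    }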
--- Eigen/src/Core/products/GeneralMatrixVector.h | 8 +++- unsupported/test/cxx11_tensor_contraction.cpp | 48 +++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 7dfa48bfb..7df6a6b1a 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -140,10 +140,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { @@ -412,10 +413,13 @@ EIGEN_DONT_INLINE void general_matrix_vector_product 4) { diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 2b599d30d..17bd335f7 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -352,6 +352,52 @@ static void test_large_contraction() } +static void test_matrix_vector() +{ + Tensor t_left(30, 50); + Tensor t_right(50); + Tensor t_result(30); + + t_left.setRandom(); + t_right.setRandom(); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 30, 50); + MapXf m_right(t_right.data(), 50, 1); + Eigen::Matrix m_result(30, 1); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims{{DimPair(1, 0)}}; + + // compute results by separate methods + t_result = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + +static void test_tensor_vector() +{ + Tensor t_left(7, 13, 17); + Tensor t_right(1, 7); + typedef typename Tensor::DimensionPair DimensionPair; + Eigen::array dim_pair01{{{0, 1}}}; + Tensor t_result = t_left.contract(t_right, dim_pair01); + + typedef Map> MapXf; + MapXf m_left(t_left.data(), 7, 13*17); + MapXf m_right(t_right.data(), 1, 7); + Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + } +} + + void test_cxx11_tensor_contraction() { CALL_SUBTEST(test_evals()); @@ -364,4 +410,6 @@ void test_cxx11_tensor_contraction() CALL_SUBTEST(test_out_of_order_contraction()); CALL_SUBTEST(test_consistency()); CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_tensor_vector()); } From 79f4a59ed9ac3fc1a3b6e4516c2b3e04cec5f522 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 7 Jan 2015 09:41:56 +0100 Subject: [PATCH 131/214] bug #907: fix compilation with ARM64 --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c5c669a1..586fa95e5 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -597,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -613,7 +613,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul 
-template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); }
+template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
 
 // min
 template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }

From 63974bcb88f23bd4768eb3232906f2b9f3c92fca Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Wed, 7 Jan 2015 09:44:25 +0100
Subject: [PATCH 132/214] Bug #907: workaround some missing intrinsics in current NDK's gcc version (ARM64)

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 586fa95e5..d962e8adc 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -492,6 +492,21 @@ ptranspose(PacketBlock& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64
 
+#if EIGEN_COMP_GNUC_STRICT && __ANDROID__
+// Bug 907: workaround missing declarations of the following two functions in the NDK
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f64 (float64x2_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_f64_u64 (uint64x2_t __a)
+{
+  return (float64x2_t) __a;
+}
+#endif
+
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;

From 36f7c1337f2f73421325a9bdbe98dd53fd99951f Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 13 Jan 2015 09:57:37 +0100
Subject: [PATCH 133/214] bug #907, ARM64: workaround vreinterpretq_u64_* not defined in xcode/clang

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index d962e8adc..29512e264 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -492,7 +492,7 @@ ptranspose(PacketBlock& kernel) {
 //---------- double ----------
 #if EIGEN_ARCH_ARM64
 
-#if EIGEN_COMP_GNUC_STRICT && __ANDROID__
+#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
 // Bug 907: workaround missing declarations of the following two functions in the NDK
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f64 (float64x2_t __a)
 {
   return (uint64x2_t) __a;
 }

From ae4644cc6827bb1b6d654ceed8b3a0c256b1d173 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 13 Jan 2015 10:03:00 +0100
Subject: [PATCH 134/214] bug #907, ARM64: workaround ICE in xcode/clang

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 29512e264..f83f8db0e 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -612,7 +612,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco
 
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
 template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
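For context: vget_low_f64 and vget_high_f64 return float64x1_t, a one-lane vector type rather than a plain double, which is why the scalar result has to be pulled out with an explicit lane access. A minimal standalone sketch of that extraction (assuming an AArch64 toolchain providing arm_neon.h; not part of the patch):

    #include <arm_neon.h>

    // Horizontal sum of a two-lane double vector, written portably.
    double horizontal_sum(float64x2_t a) {
      float64x1_t s = vadd_f64(vget_low_f64(a), vget_high_f64(a)); // lane-wise add
      return vget_lane_f64(s, 0);                                  // extract lane 0 as a double
    }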
+#endif template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { @@ -628,7 +633,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) } // Other reduction functions: // mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif // min template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } From 279786e9875cc71c7bed1f78f8df1803be215904 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 13 Jan 2015 10:25:50 +0100 Subject: [PATCH 135/214] Fix missing evaluator in outer-product --- Eigen/src/Core/ProductEvaluators.h | 10 ++++++---- test/product_small.cpp | 10 ++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 3cebbbd12..488eee00c 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -211,24 +211,26 @@ template EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { typedef typename Dst::Index Index; + typename evaluator::type rhsEval(rhs); // FIXME make sure lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and rhs + // FIXME we should probably build an evaluator for dst const Index cols = dst.cols(); for (Index j=0; j EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { typedef typename Dst::Index Index; + typename evaluator::type lhsEval(lhs); // FIXME make sure rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too - // FIXME we should probably build an evaluator for dst and lhs + // FIXME we should probably build an evaluator for dst const Index rows = dst.rows(); for (Index i=0; i diff --git a/test/product_small.cpp b/test/product_small.cpp index 8b132abb6..091955a0f 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -9,6 +9,7 @@ #define EIGEN_NO_STATIC_ASSERT #include "product.h" +#include // regression test for bug 447 void product1x1() @@ -46,5 +47,14 @@ void test_product_small() Vector3f v = Vector3f::Random(); VERIFY_IS_APPROX( (v * v.transpose()) * v, (v * v.transpose()).eval() * v); } + + { + // regression test for pull-request #93 + Eigen::Matrix A; A.setRandom(); + Eigen::Matrix B; B.setRandom(); + Eigen::Matrix C; C.setRandom(); + VERIFY_IS_APPROX(B * A.inverse(), B * A.inverse()[0]); + VERIFY_IS_APPROX(A.inverse() * C, A.inverse()[0] * C); + } #endif } From 91dd53e54db5c85c37e05bce5af95d31ba337e34 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 13 Jan 2015 16:07:51 -0800 Subject: [PATCH 136/214] Created some documentation --- unsupported/Eigen/CXX11/src/Tensor/README.md | 1446 ++++++++++++++++++ 1 file changed, 1446 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/README.md diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md new file mode 100644 index 000000000..6a4d52cc3 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -0,0 +1,1446 @@ +# Eigen 
Tensors
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+[TOC]
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes. They all are in
+the namespace ```::Eigen```.
+
+
+### Class Tensor<data_type, rank>
+
+This is the class to use to create a tensor and allocate memory for it. The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank. The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable. For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed ```rank``` integers
+indicating the sizes of the instance along each of the ```rank```
+dimensions.
+
+    // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns
+    // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+    Tensor<float, 3> t_3d(2, 3, 4);
+
+    // Resize t_3d by assigning a tensor of different sizes, but same rank.
+    t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicit list of parameters. The array type to use is
+```Eigen::array<Eigen::Index>```. The array can be constructed automatically
+from an initializer list.
+
+    // Create a tensor of strings of rank 2 with sizes 5, 7.
+    Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time. Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler. FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+    // Create a 4 x 3 tensor of floats.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code. It lets you view any piece of allocated
+memory as a Tensor. Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a TensorMap. The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes. The storage has to be
+large enough to hold all the data.
+
+    // Map a tensor of ints on top of stack-allocated storage.
+    int storage[128];  // 2 x 4 x 2 x 8 = 128
+    TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+    // The same storage can be viewed as a different tensor.
+    // You can also pass the sizes as an array.
+    TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+    // You can also map fixed-size tensors. Here we get a 1d view of
+    // the 2d fixed-size tensor.
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position ```(index0, index1...)``` in tensor
+```tensor```. You must pass as many parameters as the rank of ```tensor```.
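+Element access works the same way through a TensorMap: since a map does not
+own its storage, writes made through ```operator()``` go directly to the
+mapped memory. A small sketch (hypothetical values; the default column-major
+layout is assumed):
+
+    float data[6] = {0, 1, 2, 3, 4, 5};
+    TensorMap<Tensor<float, 2>> m(data, 2, 3);
+    m(1, 2) = 10.0f;  // writes through to data[5] (column-major: 1 + 2*2)
+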
+The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: ```ColMajor``` (the default) and +```RowMajor```. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor col_major; // equivalent to Tensor + TensorMap > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +```swap_layout()``` method. Note that this will also reverse the order of the +dimensions. + + Tensor col_major(2, 4); + Tensor row_major(2, 4); + + Tensor col_major_result = col_major; // ok, layouts match + Tensor col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor t1(2, 3, 4); + ...set some values in t1... + Tensor t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression ```t1 + t2``` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors +```t1``` and ```t2```. This is a small C++ object that knows how to add +```t1``` and ```t2```. It is only when the value of the expression is assigned +to the tensor ```t3``` that the addition is actually performed. Technically, +this happens through the overloading of ```operator=()``` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. 
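+One way to see that an expression such as ```t1 + t2``` is not itself a tensor
+is to inspect its type (a sketch, assuming rank-3 float tensors and that
+<type_traits> is available; the exact operator type is an internal detail):
+
+    auto expr = t1 + t2;  // a small expression object, not a Tensor
+    static_assert(!std::is_same<decltype(expr), Tensor<float, 3>>::value,
+                  "the sum is an unevaluated expression");
+    Tensor<float, 3> t3 = expr;  // the addition is performed here
+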
+
+Of course, the tensor operators do nest, and the expression ```t1 + t2 *
+0.3f``` is actually represented with the (approximate) tree of operators:
+
+    TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f))
+
+
+### Tensor Operations and C++ "auto"
+
+Because Tensor operations create tensor operators, the C++ ```auto``` keyword
+does not have its intuitive meaning. Consider these 2 lines of code:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    auto t4 = t1 + t2;
+
+In the first line we allocate the tensor ```t3``` and it will contain the
+result of the addition of ```t1``` and ```t2```. In the second line, ```t4```
+is actually the tree of tensor operators that will compute the addition of
+```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get
+the values of its elements:
+
+    Tensor<float, 3> t3 = t1 + t2;
+    cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+    auto t4 = t1 + t2;
+    cout << t4(0, 0, 0);  // Compilation error!
+
+When you use ```auto``` you do not get a Tensor as a result but instead a
+non-evaluated expression. So only use ```auto``` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of a set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them. This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory. All the following will work:
+
+    auto t4 = t1 + t2;
+
+    Tensor<float, 3> result = t4;  // Could also be: Tensor<float, 3> result(t4);
+    cout << result(0, 0, 0);
+
+    TensorMap<Tensor<float, 3>> result(<a float* with enough space>, <size0>, ...) = t4;
+    cout << result(0, 0, 0);
+
+    TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+    cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations. As long as you keep the expression as an
+operation, no computation is performed.
+
+    // One way to compute exp((t1 + t2) * 0.2f);
+    auto t3 = t1 + t2;
+    auto t4 = t3 * 0.2f;
+    auto t5 = t4.exp();
+    Tensor<float, 3> result = t5;
+
+    // Another way, exactly as efficient as the previous one:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expressions Are Evaluated
+
+There are several ways to control when expressions are evaluated:
+* Assignment to a Tensor, TensorFixedSize, or TensorMap.
+* Use of the eval() method.
+* Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor. In
+the example below, the ```auto``` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor ```result``` causes the evaluation of all the
+operations.
+
+    auto t3 = t1 + t2;             // t3 is an Operation.
+    auto t4 = t3 * 0.2f;           // t4 is an Operation.
+    auto t5 = t4.exp();            // t5 is an Operation.
+    Tensor<float, 3> result = t5;  // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+    // We know that the result is a 4x4x2 tensor!
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+
+Similarly, assigning an expression to a TensorMap causes its evaluation. Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that is assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time. This is done by inserting a call to the ```eval()``` method of the
+expression Operation.
+
+    // The previous example could have been written:
+    Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+    // If you want to compute (t1 + t2) once ahead of time you can write:
+    Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling ```eval()``` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size. The code above in
+effect does:
+
+    // .eval() knows the size!
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+    Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of ```eval()``` is itself an Operation, so the
+following code does not do what you may think:
+
+    // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
+    auto t3 = (t1 + t2).eval();
+
+    // You can use t3 in another expression. Still no evaluation.
+    auto t4 = (t3 * 0.2f).exp();
+
+    // The value is evaluated when you assign the Operation to a Tensor, using
+    // an intermediate tensor to represent t3.
+    Tensor<float, 3> result = t4;
+
+While in the examples above calling ```eval()``` does not make a difference in
+performance, in other cases it can make a huge difference. In the expression
+below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+to be evaluated many times:
+
+    Tensor<...> X ...;
+    Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                     * beta).exp();
+
+Inserting a call to ```eval()``` between the ```maximum()``` and
+```reshape()``` calls guarantees that maximum() is only computed once and
+greatly speeds up execution:
+
+    Tensor<...> Y =
+      ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+        * beta).exp();
+
+In the other example below, the tensor ```Y``` is both used in the expression
+and its assignment. This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+    Tensor<...> Y ...;
+    Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
+expressions ensures that the sum is computed before any updates to ```Y``` are
+done.
+
+    Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated code has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of ```Y```
+then you would need to force an eval for correctness by adding an ```eval()```
+call for the right hand side:
+
+    Y.shuffle(...) =
+      (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation. It provides
+overloads for the ```()``` operator that let you access individual values in
+the expression. TensorRef is convenient, because the Operations themselves do
+not provide a way to access individual elements.
+
+    // Create a TensorRef for the expression. The expression is not
+    // evaluated yet.
+ TensorRef > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first. + +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the ```device()``` call. If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor a(30, 40); + Tensor b(30, 40); + Tensor c = a + b; + +To choose a different implementation you have to insert a ```device()``` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor c(30, 40); + c.device(...) = a + b; + +The call to ```device()``` must be the last call on the left of the operator=. + +You must pass to the ```device()``` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a ```device()``` call. + + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + // Create the Eigen ThreadPoolDevice. + Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### <Tensor-Type>::Dimensions + +Acts like an array of ints. Has an ```int size``` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See ```dimensions()```. + +#### <Tensor-Type>::Index + +Acts like an ```int```. Used for indexing tensors along their dimensions. See +```operator()```, ```dimension()```, and ```size()```. + +#### <Tensor-Type>::Scalar + +Represents the datatype of individual tensor elements. 
For example, for a +```Tensor```, ```Scalar``` is the type ```float```. See +```setConstant()```. + +#### <Operation> + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the dimensions() result is ::Dimensions. + + Eigen::Tensor a(3, 4); + const Eigen::Tensor::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use ```auto``` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +```dimension()``` result is ```::Index```, but you can +always use it like an int. + + Eigen::Tensor a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the ```size()``` result is +```::Index```, but you can always use it like an int. + + Eigen::Tensor a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide ```dimensions()``` directly, +e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides ```dimensions()``` and ```dimension()``` as +above. + +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors and Copies + +TODO. + + Tensor(...) + TensorFixedSize(...) + TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... 
otherDimensions)
+    TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+    TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    Self& operator=(const Self& other)
+    Self& operator=(const OtherDerived& other)
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize is created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized. Similarly,
+when a new TensorMap is created on top of non-initialized memory, its contents
+are not initialized.
+
+You can use one of the methods below to initialize the tensor memory. These
+have an immediate effect on the tensor and return the tensor itself as a
+result. These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value ```val```. ```Scalar```
+is the type of data stored in the tensor. You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+    a.setConstant(12.3f);
+    cout << "Constant: " << endl << a << endl << endl;
+    =>
+    Constant:
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+    12.3 12.3 12.3 12.3
+
+Note that ```setConstant()``` can be used on any tensor where the element type
+has a copy constructor and an ```operator=()```:
+
+    Eigen::Tensor<string, 2> a(2, 3);
+    a.setConstant("yolo");
+    cout << "String tensor: " << endl << a << endl << endl;
+    =>
+    String tensor:
+    yolo yolo yolo
+    yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```.
+Returns the tensor itself in case you want to chain another call.
+
+    a.setZero();
+    cout << "Zeros: " << endl << a << endl << endl;
+    =>
+    Zeros:
+    0 0 0 0
+    0 0 0 0
+    0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times. The
+most deeply nested lists must contain P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a ```TensorFixedSize<float, Sizes<2, 3>>``` the initializer
+list must contain 2 lists of 3 floats each.
+
+```setValues()``` returns the tensor itself in case you want to chain another
+call.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+    0 1 2
+    3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed. This is valid at each level of nesting. For example the following
+code only sets the values of the first row of the tensor.
+
+    Eigen::Tensor<int, 2> a(2, 3);
+    a.setConstant(1000);
+    a.setValues({{10, 20, 30}});
+    cout << "a" << endl << a << endl << endl;
+    =>
+    a
+      10   20   30
+    1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values. Returns the tensor itself in case you
+want to chain another call.
+
+    a.setRandom();
+    cout << "Random: " << endl << a << endl << endl;
+    =>
+    Random:
+      0.680375    0.59688  -0.329554    0.10794
+     -0.211234   0.823295   0.536459 -0.0452059
+      0.566198  -0.604897  -0.444451   0.257742
+
+You can customize ```setRandom()``` by providing your own random number
+generator as a template argument:
+
+    a.setRandom<MyRandomGenerator>();
+
+Here, ```MyRandomGenerator``` must be a struct with the following member
+functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
+and ```<Tensor-Type>::Index```.
+
+See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+
+    // Custom number generator for use with setRandom().
+    struct MyRandomGenerator {
+      // Default and copy constructors. Both are needed.
+      MyRandomGenerator() { }
+      MyRandomGenerator(const MyRandomGenerator& ) { }
+
+      // Return a random value to be used. "element_location" is the
+      // location of the entry to set in the tensor, it can typically
+      // be ignored.
+      Scalar operator()(Eigen::DenseIndex element_location,
+                        Eigen::DenseIndex /*unused*/ = 0) const {
+        return <randomly generated value of type T>;
+      }
+
+      // Same as above but generates several numbers at a time.
+      typename internal::packet_traits<Scalar>::type packetOp(
+          Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+        return <a packet of randomly generated values>;
+      }
+    };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+* UniformRandomGenerator
+* NormalRandomGenerator
+
+
+## Data Access
+
+TODO
+
+    const Scalar& operator()(const array<Index, NumIndices>& indices)
+    const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator()(const array<Index, NumIndices>& indices)
+    Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+    Scalar& operator[](Index index)
+    ??? mention coeff() and coeffRef() ???
+
+### Scalar* data()
+### const Scalar* data() const
+
+Returns a pointer to the storage for the tensor. The pointer is const if the
+tensor was const. This allows direct access to the data. The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+    Eigen::Tensor<float, 2> a(3, 4);
+    float* a_data = a.data();
+    a_data[0] = 123.45f;
+    cout << "a(0, 0): " << a(0, 0);
+    => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor ```Operations```.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operations is evaluated lazily, typically when it is assigned to a
+tensor. See "Controlling When Expressions Are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value ```val```.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+    Eigen::Tensor<float, 2> a(2, 3);
+    a.setConstant(1.0f);
+    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+    cout << "a" << endl << a << endl << endl;
+    cout << "b" << endl << b << endl << endl;
+    cout << "c" << endl << c << endl << endl;
+    =>
+    a
+    1 1 1
+    1 1 1
+
+    b
+    3 3 3
+    3 3 3
+
+    c
+    0.6 0.6 0.6
+    0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
### Scalar* data()
### const Scalar* data() const

Returns a pointer to the storage for the tensor. The pointer is const if the
tensor was const. This allows direct access to the data. The layout of the
data depends on the tensor layout: RowMajor or ColMajor.

This access is usually only needed for special cases, for example when mixing
Eigen Tensor code with other libraries.

Scalar is the type of data stored in the tensor.

    Eigen::Tensor<float, 2> a(3, 4);
    float* a_data = a.data();
    a_data[0] = 123.45f;
    cout << "a(0, 0): " << a(0, 0);
    => a(0, 0): 123.45


## Tensor Operations

All the methods documented below return unevaluated tensor ```Operations```.
These can be chained: you can apply another Tensor Operation to the value
returned by the method.

The chain of Operations is evaluated lazily, typically when it is assigned to
a tensor. See "Controlling when Expressions are Evaluated" for more details
about their evaluation.

### <Operation> constant(const Scalar& val)

Returns a tensor of the same type and dimensions as the original tensor but
where all elements have the value ```val```.

This is useful, for example, when you want to add or subtract a constant from
a tensor, or multiply every element of a tensor by a scalar.

    Eigen::Tensor<float, 2> a(2, 3);
    a.setConstant(1.0f);
    Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
    Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    cout << "c" << endl << c << endl << endl;
    =>
    a
    1 1 1
    1 1 1

    b
    3 3 3
    3 3 3

    c
    0.6 0.6 0.6
    0.6 0.6 0.6

### <Operation> random()

Returns a tensor of the same type and dimensions as the current tensor
but where all elements have random values.

This is for example useful to add random values to an existing tensor.
The generation of random values can be customized in the same manner
as for ```setRandom()```.

    Eigen::Tensor<float, 2> a(2, 3);
    a.setConstant(1.0f);
    Eigen::Tensor<float, 2> b = a + a.random();
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    =>
    a
    1 1 1
    1 1 1

    b
    1.68038  1.5662  1.82329
    0.788766 1.59688 0.395103


## Unary Element Wise Operations

All these operations take a single input tensor as argument and return a
tensor of the same type and dimensions as the tensor to which they are
applied. The requested operations are applied to each element independently.

### <Operation> operator-()

Returns a tensor of the same type and dimensions as the original tensor
containing the opposite values of the original tensor.

    Eigen::Tensor<float, 2> a(2, 3);
    a.setConstant(1.0f);
    Eigen::Tensor<float, 2> b = -a;
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    =>
    a
    1 1 1
    1 1 1

    b
    -1 -1 -1
    -1 -1 -1

### <Operation> sqrt()

Returns a tensor of the same type and dimensions as the original tensor
containing the square roots of the original tensor.

### <Operation> rsqrt()

Returns a tensor of the same type and dimensions as the original tensor
containing the inverse square roots of the original tensor.

### <Operation> square()

Returns a tensor of the same type and dimensions as the original tensor
containing the squares of the original tensor values.

### <Operation> inverse()

Returns a tensor of the same type and dimensions as the original tensor
containing the inverse of the original tensor values.

### <Operation> exp()

Returns a tensor of the same type and dimensions as the original tensor
containing the exponential of the original tensor.

### <Operation> log()

Returns a tensor of the same type and dimensions as the original tensor
containing the natural logarithms of the original tensor.

### <Operation> abs()

Returns a tensor of the same type and dimensions as the original tensor
containing the absolute values of the original tensor.

### <Operation> pow(Scalar exponent)

Returns a tensor of the same type and dimensions as the original tensor
containing the coefficients of the original tensor to the power of the
exponent.

The type of the exponent, Scalar, is always the same as the type of the
tensor coefficients. For example, only integer exponents can be used in
conjunction with tensors of integer values.

You can use cast() to lift this restriction. For example this computes
cubic roots of an int Tensor:

    Eigen::Tensor<int, 2> a(2, 3);
    a.setValues({{0, 1, 8}, {27, 64, 125}});
    Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    =>
    a
    0   1   8
    27  64  125

    b
    0 1 2
    3 4 5

### <Operation> operator * (Scalar scale)
TODO

### <Operation> cwiseMax(Scalar threshold)
TODO

### <Operation> cwiseMin(Scalar threshold)
TODO

### <Operation> unaryExpr(const CustomUnaryOp& func)
TODO
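Although ```unaryExpr()``` is still listed as TODO above, its expected use is
a custom functor applied coefficient-wise. The sketch below assumes a functor
with a const ```operator()``` taking and returning a Scalar; ```Clamp01``` is
a hypothetical name:

    // Clamps every coefficient to the [0, 1] range.
    struct Clamp01 {
      float operator()(float x) const {
        return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
      }
    };

    Eigen::Tensor<float, 2> a(2, 3);
    a.setRandom();
    Eigen::Tensor<float, 2> b = a.unaryExpr(Clamp01());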
## Binary Element Wise Operations

These operations take two input tensors as arguments. The two input tensors
should be of the same type and dimensions. The result is a tensor of the same
dimensions as the tensors to which they are applied, and unless otherwise
specified it is also of the same type. The requested operations are applied to
each pair of elements independently.

### <Operation> operator+(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise sums of the inputs.

### <Operation> operator-(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise differences of the inputs.

### <Operation> operator*(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise products of the inputs.

### <Operation> operator/(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise quotients of the inputs.

This operator is not supported for integer types.

### <Operation> cwiseMax(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise maximums of the inputs.

### <Operation> cwiseMin(const OtherDerived& other)

Returns a tensor of the same type and dimensions as the input tensors
containing the coefficient wise minimums of the inputs.

### <Operation> Logical operators

The following logical operators are supported as well:

* operator&&(const OtherDerived& other)
* operator||(const OtherDerived& other)
* operator<(const OtherDerived& other)
* operator<=(const OtherDerived& other)
* operator>(const OtherDerived& other)
* operator>=(const OtherDerived& other)
* operator==(const OtherDerived& other)
* operator!=(const OtherDerived& other)

They all return a tensor of boolean values.
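As a minimal sketch of how these operators combine, a comparison produces a
boolean tensor that can then drive the ```select()``` operation described in
the next section:

    Eigen::Tensor<float, 2> a(2, 3);
    Eigen::Tensor<float, 2> b(2, 3);
    a.setConstant(0.5f);
    b.setRandom();
    // True wherever the corresponding coefficient of b exceeds 0.5.
    Eigen::Tensor<bool, 2> mask = b > a;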
## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor))

Selection is a coefficient-wise ternary operator that is the tensor equivalent
to the if-then-else operation. (Since ```if```, ```then``` and ```else``` are
C++ keywords, the example uses different variable names.)

    Tensor<bool, 3> if_tensor = ...;
    Tensor<float, 3> then_tensor = ...;
    Tensor<float, 3> else_tensor = ...;
    Tensor<float, 3> result = if_tensor.select(then_tensor, else_tensor);

The 3 arguments must be of the same dimensions, which will also be the
dimensions of the result. The 'if' tensor must be of type boolean, the 'then'
and the 'else' tensors must be of the same type, which will also be the type
of the result.

Each coefficient in the result is equal to the corresponding coefficient in
the 'then' tensor if the corresponding value in the 'if' tensor is true. If
not, the resulting coefficient will come from the 'else' tensor.


## Contractions

TODO

    contract(const OtherDerived& other, const Dimensions& dims)


## Reduction Operations

A *Reduction* operation returns a tensor with fewer dimensions than the
original tensor. The values in the returned tensor are computed by applying a
*reduction operator* to slices of values from the original tensor. You specify
the dimensions along which the slices are made.

The Eigen Tensor library provides a set of predefined reduction operators such
as ```maximum()``` and ```sum()``` and lets you define additional operators by
implementing a few methods from a reducer template.

### Reduction Dimensions

All reduction operations take a single parameter of type
```<TensorType>::Dimensions``` which can always be specified as an array of
ints. These are called the "reduction dimensions." The values are the indices
of the dimensions of the input tensor over which the reduction is done. The
parameter can have at most as many elements as the rank of the input tensor;
each element must be less than the tensor rank, as it indicates one of the
dimensions to reduce.

Each dimension of the input tensor should occur at most once in the reduction
dimensions as the implementation does not remove duplicates.

The order of the values in the reduction dimensions does not affect the
results, but the code may execute faster if you list the dimensions in
increasing order.

Example: Reduction along one dimension.

    // Create a tensor of 2 dimensions.
    Eigen::Tensor<int, 2> a(2, 3);
    a.setValues({{1, 2, 3}, {6, 5, 4}});
    // Reduce it along the second dimension (1)...
    Eigen::array<int, 1> dims({1 /* dimension to reduce */});
    // ...using the "maximum" operator.
    // The result is a tensor with one dimension. The size of
    // that dimension is the same as the first (non-reduced) dimension of a.
    Eigen::Tensor<int, 1> b = a.maximum(dims);
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    =>
    a
    1 2 3
    6 5 4

    b
    3
    6

Example: Reduction along two dimensions.

    Eigen::Tensor<float, 3> a(2, 3, 4);
    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
                  {7.0f, 6.0f, 5.0f, 4.0f},
                  {8.0f, 9.0f, 10.0f, 11.0f}},
                 {{12.0f, 13.0f, 14.0f, 15.0f},
                  {19.0f, 18.0f, 17.0f, 16.0f},
                  {20.0f, 21.0f, 22.0f, 23.0f}}});
    // The tensor a has 3 dimensions. We reduce along the
    // first 2, resulting in a tensor with a single dimension
    // of size 4 (the last dimension of a.)
    // Note that we pass the array of reduction dimensions
    // directly to the maximum() call.
    Eigen::Tensor<float, 1> b =
        a.maximum(Eigen::array<int, 2>({0, 1}));
    cout << "b" << endl << b << endl << endl;
    =>
    b
    20
    21
    22
    23

#### Reduction along all dimensions

As a special case, if you pass no parameter to a reduction operation the
original tensor is reduced along *all* its dimensions. The result is a
one-dimension tensor with a single value.

    Eigen::Tensor<float, 3> a(2, 3, 4);
    a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
                  {7.0f, 6.0f, 5.0f, 4.0f},
                  {8.0f, 9.0f, 10.0f, 11.0f}},
                 {{12.0f, 13.0f, 14.0f, 15.0f},
                  {19.0f, 18.0f, 17.0f, 16.0f},
                  {20.0f, 21.0f, 22.0f, 23.0f}}});
    // Reduce along all dimensions using the sum() operator.
    Eigen::Tensor<float, 1> b = a.sum();
    cout << "b" << endl << b << endl << endl;
    =>
    b
    276


### <Operation> sum(const Dimensions& new_dims)
### <Operation> sum()

Reduce a tensor using the sum() operator. The resulting values
are the sum of the reduced values.

### <Operation> mean(const Dimensions& new_dims)
### <Operation> mean()

Reduce a tensor using the mean() operator. The resulting values
are the mean of the reduced values.

### <Operation> maximum(const Dimensions& new_dims)
### <Operation> maximum()

Reduce a tensor using the maximum() operator. The resulting values are the
largest of the reduced values.

### <Operation> minimum(const Dimensions& new_dims)
### <Operation> minimum()

Reduce a tensor using the minimum() operator. The resulting values
are the smallest of the reduced values.

### <Operation> prod(const Dimensions& new_dims)
### <Operation> prod()

Reduce a tensor using the prod() operator. The resulting values
are the product of the reduced values.

### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)

Reduce a tensor using a user-defined reduction operator. See ```SumReducer```
in TensorFunctors.h for information on how to implement a reduction operator.
A sketch of such an operator is shown below.
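The sketch assumes the scalar part of the reducer interface used by
```SumReducer``` in TensorFunctors.h (an ```initialize()``` / ```reduce()``` /
```finalize()``` triple, with ```PacketAccess``` set to false to opt out of
the vectorized code path); ```AbsMaxReducer``` is a hypothetical name:

    // Computes the largest absolute value among the reduced coefficients.
    template <typename T> struct AbsMaxReducer {
      static const bool PacketAccess = false;

      // Initial value of the accumulator.
      T initialize() const { return static_cast<T>(0); }
      // Folds one coefficient into the accumulator.
      void reduce(const T t, T* accum) const {
        const T abs_t = t < T(0) ? -t : t;
        if (abs_t > *accum) { *accum = abs_t; }
      }
      // Converts the final accumulator into the result.
      T finalize(const T accum) const { return accum; }
    };

    Eigen::Tensor<float, 2> a(2, 3);
    a.setRandom();
    Eigen::array<int, 1> dims({1});
    Eigen::Tensor<float, 1> m = a.reduce(dims, AbsMaxReducer<float>());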
## Convolutions

TBD: convolve(const KernelDerived& kernel, const Dimensions& dims)


## Geometrical Operations

These operations return a Tensor with different dimensions than the original
Tensor. They can be used to access slices of tensors, see them with different
dimensions, or pad tensors with additional data.

### <Operation> reshape(const Dimensions& new_dims)

Returns a view of the input tensor that has been reshaped to the specified
new dimensions. The argument new_dims is an array of Index values. The
rank of the resulting tensor is equal to the number of elements in new_dims.

The product of all the sizes in the new dimension array must be equal to
the number of elements in the input tensor.

    // Increase the rank of the input tensor by introducing a new dimension
    // of size 1.
    Tensor<float, 2> input(7, 11);
    array<int, 3> three_dims{{7, 11, 1}};
    Tensor<float, 3> result = input.reshape(three_dims);

    // Decrease the rank of the input tensor by merging 2 dimensions.
    array<int, 1> one_dim{{7 * 11}};
    Tensor<float, 1> result = input.reshape(one_dim);

This operation does not move any data in the input tensor, so the resulting
contents of a reshaped Tensor depend on the data layout of the original Tensor.

For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
to one dimension:

    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
    Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
    cout << "b" << endl << b << endl;
    =>
    b
      0
    300
    100
    400
    200
    500

This is what happens when the 2D Tensor is RowMajor:

    Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
    Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
    Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
    cout << "b" << endl << b << endl;
    =>
    b
      0
    100
    200
    300
    400
    500

The reshape operation is an lvalue. In other words, it can be used on the left
side of the assignment operator.

The previous example can be rewritten as follows:

    Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
    a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
    Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
    b.reshape(two_dim) = a;
    cout << "b" << endl << b << endl;
    =>
    b
      0
    300
    100
    400
    200
    500

Note that "b" itself was not reshaped but that instead the assignment is done
to the reshape view of b.


### <Operation> shuffle(const Shuffle& shuffle)

Returns a copy of the input tensor whose dimensions have been
reordered according to the specified permutation. The argument shuffle
is an array of Index values. Its size is the rank of the input
tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
dimension of the output tensor equals the size of the shuffle[i]-th
dimension of the input tensor. For example:

    // Shuffle all dimensions to the left by 1.
    Tensor<float, 3> input(20, 30, 50);
    // ... set some values in input.
    Tensor<float, 3> output = input.shuffle({1, 2, 0});

    eigen_assert(output.dimension(0) == 30);
    eigen_assert(output.dimension(1) == 50);
    eigen_assert(output.dimension(2) == 20);

Indices into the output tensor are shuffled accordingly to formulate
indices into the input tensor. For example, one can assert in the above
code snippet that:

    eigen_assert(output(3, 7, 11) == input(11, 3, 7));

In general, one can assert that

    eigen_assert(output(..., indices[shuffle[i]], ...) ==
                 input(..., indices[i], ...))

The shuffle operation results in an lvalue, which means that it can be
assigned to. In other words, it can be used on the left side of the
assignment operator.

Let's rewrite the previous example to take advantage of this feature:

    // Shuffle all dimensions to the left by 1.
    Tensor<float, 3> input(20, 30, 50);
    // ... set some values in input.
    Tensor<float, 3> output(30, 50, 20);
    output.shuffle({2, 0, 1}) = input;
### <Operation> stride(const Strides& strides)

Returns a view of the input tensor that strides (skips stride-1
elements) along each of the dimensions. The argument strides is an
array of Index values. The dimensions of the resulting tensor are
ceil(input_dimensions[i] / strides[i]).

For example this is what happens when you ```stride()``` a 2D tensor:

    Eigen::Tensor<int, 2> a(4, 3);
    a.setValues({{0, 100, 200}, {300, 400, 500},
                 {600, 700, 800}, {900, 1000, 1100}});
    Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
    Eigen::Tensor<int, 2> b = a.stride(strides);
    cout << "b" << endl << b << endl;
    =>
    b
    0    200
    900 1100

It is possible to assign a tensor to a stride:

    Tensor<float, 3> input(20, 30, 50);
    // ... set some values in input.
    Tensor<float, 3> output(40, 90, 200);
    output.stride({2, 3, 4}) = input;


### <Operation> slice(const StartIndices& startIndices,
                      const Sizes& sizes)

TBD
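While the section above is still TBD, the cxx11_tensor_reverse.cpp test added
later in this patch series exercises ```slice()```: startIndices gives the
first coefficient of the slice and sizes gives its extent along each
dimension. A minimal sketch based on that usage:

    Eigen::Tensor<int, 2> a(4, 3);
    a.setValues({{0, 100, 200}, {300, 400, 500},
                 {600, 700, 800}, {900, 1000, 1100}});
    Eigen::array<Eigen::DenseIndex, 2> offsets({1, 0});  // start at row 1, column 0
    Eigen::array<Eigen::DenseIndex, 2> extents({2, 2});  // take 2 rows and 2 columns
    Eigen::Tensor<int, 2> b = a.slice(offsets, extents);
    cout << "b" << endl << b << endl;
    =>
    b
    300 400
    600 700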
### <Operation> chip(const Index offset, const Index dim)

A chip is a special kind of slice. It is the subtensor at the given offset in
the dimension dim. The returned tensor has one fewer dimension than the input
tensor: the dimension dim is removed.

For example, a matrix chip would be either a row or a column of the input
matrix.

    Eigen::Tensor<int, 2> a(4, 3);
    a.setValues({{0, 100, 200}, {300, 400, 500},
                 {600, 700, 800}, {900, 1000, 1100}});
    Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
    Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
    cout << "a" << endl << a << endl;
    =>
    a
    0    100  200
    300  400  500
    600  700  800
    900 1000 1100
    cout << "row_3" << endl << row_3 << endl;
    =>
    row_3
    600 700 800
    cout << "col_2" << endl << col_2 << endl;
    =>
    col_2
    100 400 700 1000

It is possible to assign values to a tensor chip since the chip operation is
an lvalue. For example:

    Eigen::Tensor<int, 1> a(3);
    a.setValues({100, 200, 300});
    Eigen::Tensor<int, 2> b(2, 3);
    b.setZero();
    b.chip(0, 0) = a;
    cout << "a" << endl << a << endl;
    =>
    a
    100
    200
    300
    cout << "b" << endl << b << endl;
    =>
    b
    100 200 300
      0   0   0


### <Operation> reverse(const ReverseDimensions& reverse)

Returns a view of the input tensor that reverses the order of the coefficients
along a subset of the dimensions. The argument reverse is an array of boolean
values that indicates whether or not the order of the coefficients should be
reversed along each of the dimensions. This operation preserves the dimensions
of the input tensor.

For example this is what happens when you ```reverse()``` the first dimension
of a 2D tensor:

    Eigen::Tensor<int, 2> a(4, 3);
    a.setValues({{0, 100, 200}, {300, 400, 500},
                 {600, 700, 800}, {900, 1000, 1100}});
    Eigen::array<bool, 2> reverse({true, false});
    Eigen::Tensor<int, 2> b = a.reverse(reverse);
    cout << "a" << endl << a << endl << "b" << endl << b << endl;
    =>
    a
    0    100  200
    300  400  500
    600  700  800
    900 1000 1100
    b
    900 1000 1100
    600  700  800
    300  400  500
    0    100  200


### <Operation> broadcast(const Broadcast& broadcast)

TODO

### <Operation> concatenate(const OtherDerived& other, Axis axis)

TODO

### <Operation> pad(const PaddingDimensions& padding)

TODO

### <Operation> extract_patches(const PatchDims& patch_dims)

TODO

### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols,
                                      const Index row_stride, const Index col_stride,
                                      const PaddingType padding_type)

TODO


## Special Operations

### <Operation> cast<T>()

Returns a tensor of type T with the same dimensions as the original tensor.
The returned tensor contains the values of the original tensor converted to
type T.

    Eigen::Tensor<float, 2> a(2, 3);
    Eigen::Tensor<double, 2> b = a.cast<double>();

This can be useful for example if you need to do element-wise division of
Tensors of integers. This is not currently supported by the Tensor library
but you can easily cast the tensors to floats to do the division:

    Eigen::Tensor<int, 2> a(2, 3);
    a.setValues({{0, 1, 2}, {3, 4, 5}});
    Eigen::Tensor<int, 2> b =
        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
    cout << "a" << endl << a << endl << endl;
    cout << "b" << endl << b << endl << endl;
    =>
    a
    0 1 2
    3 4 5

    b
    0 0 1
    1 2 2


### <Operation> eval()

TODO


## Representation of scalar values

Scalar values are often represented by tensors of size 1 and rank 1. It would
be more logical and user friendly to use tensors of rank 0 instead. For
example Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly,
the inner product of 2 1d tensors (through contractions) returns a 1d tensor.
In the future these operations might be updated to return 0d tensors instead.

## Limitations

* The number of tensor dimensions is currently limited to 250 when using a
  compiler that supports cxx11. It is limited to only 5 for older compilers.
* The IndexList class requires a cxx11 compliant compiler. You can use an
  array of indices instead if you don't have access to a modern compiler.
* TensorVarDims are only partially supported.
* On GPUs only floating point values are properly tested and optimized for.
* Complex and integer values are known to be broken on GPUs.
If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + From c94174b4fe76636ae5f027ad8e59023cd154d90d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:13:08 -0800 Subject: [PATCH 137/214] Improved tensor references --- .../Eigen/CXX11/src/Tensor/TensorRef.h | 73 ++++++++++++++++++- unsupported/test/cxx11_tensor_ref.cpp | 16 ++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index d43fb286e..0a87e67eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -64,7 +64,7 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator(dummy); }; @@ -137,6 +137,8 @@ template class TensorRef : public TensorBase class TensorRef : public TensorBasedimensions().size(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } EIGEN_DEVICE_FUNC @@ -197,6 +201,13 @@ template class TensorRef : public TensorBase indices{{firstIndex, otherIndices...}}; return coeff(indices); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + const std::size_t NumIndices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } #else EIGEN_DEVICE_FUNC @@ -237,6 +248,44 @@ template class TensorRef : public TensorBase indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } #endif template EIGEN_DEVICE_FUNC @@ -244,7 +293,7 @@ template class TensorRef : public TensorBasedimensions(); Index index = 0; - if (PlainObjectType::Options&RowMajor) { + if (PlainObjectType::Options & RowMajor) { index += indices[0]; for (int i = 1; i < NumIndices; ++i) { index = index * dims[i] + indices[i]; @@ -257,6 +306,24 @@ template class TensorRef : public TensorBasecoeff(index); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (int i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const @@ -298,6 +365,8 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = false, + Layout = TensorRef::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
TensorEvaluator(const TensorRef& m, const Device&) diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index 4ff94a059..aa369f278 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -181,6 +181,21 @@ static void test_ref_in_expr() } +static void test_coeff_ref() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + Tensor original = tensor; + + TensorRef> slice = tensor.chip(7, 4); + slice.coeffRef(0, 0, 0, 0) = 1.0f; + slice.coeffRef(1, 0, 0, 0) += 2.0f; + + VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f); + VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f); +} + + void test_cxx11_tensor_ref() { CALL_SUBTEST(test_simple_lvalue_ref()); @@ -189,4 +204,5 @@ void test_cxx11_tensor_ref() CALL_SUBTEST(test_slice()); CALL_SUBTEST(test_ref_of_ref()); CALL_SUBTEST(test_ref_in_expr()); + CALL_SUBTEST(test_coeff_ref()); } From b00fe1590dd72d51ac3e44c42102caac10a54c28 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:14:46 -0800 Subject: [PATCH 138/214] Added ability to swap the layout of a tensor --- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 198 ++++++++++++++++++ unsupported/test/cxx11_tensor_layout_swap.cpp | 61 ++++++ 2 files changed, 259 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h create mode 100644 unsupported/test/cxx11_tensor_layout_swap.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 000000000..7e448f7c0 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,198 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. + * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + + + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = (TensorEvaluator::Layout == ColMajor) ? 
RowMajor : ColMajor, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp new file mode 100644 index 000000000..ae297a9da --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; + +static void test_simple_swap() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +static void test_swap_as_lvalue() +{ + Tensor tensor(2,3,7); + tensor.setRandom(); + + Tensor tensor2(7,3,2); + tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i)); + } + } + } +} + + +void test_cxx11_tensor_layout_swap() +{ + CALL_SUBTEST(test_simple_swap()); + CALL_SUBTEST(test_swap_as_lvalue()); +} From 4928ea121250fba0979933463624b1edf9863672 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:15:58 -0800 Subject: [PATCH 139/214] Added ability to reverse the order of the coefficients in a tensor --- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 207 ++++++++++++++++++ unsupported/test/cxx11_tensor_reverse.cpp | 167 ++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h create mode 100644 unsupported/test/cxx11_tensor_reverse.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 000000000..439cf3230 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,207 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorReverseOp type; +}; + +} // end namespace internal + + + + +template +class TensorReverseOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind + StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, + const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) {} + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), m_reverse(op.reverse()) + { + // Compute strides + m_dimensions = m_impl.dimensions(); + if (Layout == ColMajor) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + } + } + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index 
index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_strides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + return m_impl.coeff(inputIndex); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type + values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Scalar* data() const { return NULL; } + + protected: + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp new file mode 100644 index 000000000..4c0be35da --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::array; + +template +static void test_simple_reverse() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + Tensor reversed_tensor; + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + + reversed_tensor = tensor.reverse(dim_rev); + + VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2); + VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3); + VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5); + VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7); + + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } +} + + +template +static void test_expr_reverse() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + + Tensor expected; + expected = tensor.reverse(dim_rev); + + Tensor result(2,3,5,7); + + array src_slice_dim{{2,3,1,7}}; + array src_slice_start{{0,0,0,0}}; + array dst_slice_dim{{2,3,1,7}}; + array dst_slice_start{{0,0,0,0}}; + + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 3); + VERIFY_IS_EQUAL(result.dimension(2), 5); + VERIFY_IS_EQUAL(result.dimension(3), 7); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + for (int i = 0; i < 5; ++i) { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + dst_slice_start[2] += 1; + } + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + +void test_cxx11_tensor_reverse() +{ + 
CALL_SUBTEST(test_simple_reverse()); + CALL_SUBTEST(test_simple_reverse()); + CALL_SUBTEST(test_expr_reverse()); + CALL_SUBTEST(test_expr_reverse()); +} From 3bd2b41b2e074f9feb31bad7c3bf9769368b5d1a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:17:02 -0800 Subject: [PATCH 140/214] Created a test for tensor type casting --- unsupported/test/cxx11_tensor_casts.cpp | 41 +++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 unsupported/test/cxx11_tensor_casts.cpp diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp new file mode 100644 index 000000000..4f7ff7067 --- /dev/null +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +using Eigen::Tensor; +using Eigen::array; + +static void test_simple_cast() +{ + Tensor ftensor(20,30); + ftensor.setRandom(); + Tensor chartensor(20,30); + chartensor.setRandom(); + Tensor, 2> cplextensor(20,30); + cplextensor.setRandom(); + + chartensor = ftensor.cast(); + cplextensor = ftensor.cast>(); + + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 30; ++j) { + VERIFY_IS_EQUAL(chartensor(i,j), static_cast(ftensor(i,j))); + VERIFY_IS_EQUAL(cplextensor(i,j), static_cast>(ftensor(i,j))); + } + } +} + + +void test_cxx11_tensor_casts() +{ + CALL_SUBTEST(test_simple_cast()); +} From 8f4b8d204bd5f9bf3693b162b799397fa899220e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 10:19:33 -0800 Subject: [PATCH 141/214] Improved the performance of tensor reductions Added the ability to generate random numbers following a normal distribution Created a test to validate the ability to generate random numbers. 
--- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 245 ++++++++++++++++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 218 +++++++++++++--- unsupported/test/cxx11_tensor_random.cpp | 78 ++++++ 3 files changed, 474 insertions(+), 67 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_random.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index e9aa22183..7b8d34321 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -16,50 +16,157 @@ namespace internal { // Standard reduction functors template struct SumReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SumReducer() : m_sum(0) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - m_sum += t; + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) += t; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_sum; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); } - private: - typename internal::remove_all::type m_sum; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return saccum + predux(vaccum); + } +}; + +template struct MeanReducer +{ + static const bool PacketAccess = true; + MeanReducer() : count_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + (*accum) += t; + count_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + count_ += packet_traits::size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum / count_; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / count_; + } + + protected: + int count_; }; template struct MaxReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max(-(std::numeric_limits::max)()) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - if (t > m_max) { m_max = t; } + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t > *accum) { *accum = t; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_max; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmax(*accum, p); } - private: - typename internal::remove_all::type m_max; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return -(std::numeric_limits::max)(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(-(std::numeric_limits::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + 
return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (std::max)(saccum, predux_max(vaccum)); + } }; template struct MinReducer { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MinReducer() : m_min((std::numeric_limits::max)()) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) { - if (t < m_min) { m_min = t; } + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t < *accum) { *accum = t; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const { - return m_min; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmin(*accum, p); } - private: - typename internal::remove_all::type m_min; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return (std::numeric_limits::max)(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1((std::numeric_limits::max)()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return (std::min)(saccum, predux_min(vaccum)); + } }; +template struct ProdReducer +{ + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + (*accum) *= t; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return static_cast(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + return saccum * predux_mul(vaccum); + } +}; + #if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) // We're not compiling a cuda kernel template struct UniformRandomGenerator { + + static const bool PacketAccess = true; + template T operator()(Index, Index = 0) const { return random(); @@ -81,16 +188,19 @@ template struct UniformRandomGenerator { template struct UniformRandomGenerator; template <> struct UniformRandomGenerator { - UniformRandomGenerator() { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { const int tid = blockIdx.x * blockDim.x + threadIdx.x; curand_init(0, tid, 0, &m_state); } - template + template EIGEN_DEVICE_FUNC float operator()(Index, Index = 0) const { return curand_uniform(&m_state); } - template + template EIGEN_DEVICE_FUNC float4 packetOp(Index, Index = 0) const { return curand_uniform4(&m_state); } @@ -100,15 +210,18 @@ template <> struct UniformRandomGenerator { }; template <> struct UniformRandomGenerator { - UniformRandomGenerator() { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC UniformRandomGenerator() { const int tid = blockIdx.x * blockDim.x + threadIdx.x; curand_init(0, tid, 0, &m_state); } - template + template EIGEN_DEVICE_FUNC double operator()(Index, Index = 0) const { return curand_uniform_double(&m_state); } - template + template EIGEN_DEVICE_FUNC double2 packetOp(Index, Index = 0) const { return 
curand_uniform2_double(&m_state); } @@ -120,6 +233,84 @@ template <> struct UniformRandomGenerator { #endif +#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711 +// We're not compiling a cuda kernel +template struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + NormalRandomGenerator() : m_distribution(0, 1) {} + NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { } + + template + T operator()(Index, Index = 0) const { + return m_distribution(m_generator); + } + template + typename internal::packet_traits::type packetOp(Index, Index = 0) const { + const int packetSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT T values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_distribution(m_generator); + } + return internal::pload::type>(values); + } + + mutable std::normal_distribution m_distribution; + mutable std::default_random_engine m_generator; +}; + +#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) + +// We're compiling a cuda kernel +template struct NormalRandomGenerator; + +template <> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + + template EIGEN_DEVICE_FUNC + float operator()(Index, Index = 0) const { + return curand_normal(&m_state); + } + template EIGEN_DEVICE_FUNC + float4 packetOp(Index, Index = 0) const { + return curand_normal4(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +template <> struct NormalRandomGenerator { + + static const bool PacketAccess = true; + + EIGEN_DEVICE_FUNC NormalRandomGenerator() { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(0, tid, 0, &m_state); + } + template EIGEN_DEVICE_FUNC + double operator()(Index, Index = 0) const { + return curand_normal_double(&m_state); + } + template EIGEN_DEVICE_FUNC + double2 packetOp(Index, Index = 0) const { + return curand_normal2_double(&m_state); + } + + private: + mutable curandStatePhilox4_32_10_t m_state; +}; + +#endif + + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index cbe87394b..eebcc4850 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -43,6 +43,75 @@ struct nested, 1, typename eval type; }; + +template +struct are_inner_most_dims { + static const bool value = false; +}; +#if __cplusplus > 199711L +template +struct are_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_eq()(0, 0) && + index_statically_eq()(array_size::value-1, array_size::value-1); +}; +template +struct are_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_eq()(0, NumTensorDims - array_size::value) && + index_statically_eq()(array_size::value - 1, NumTensorDims - 1); +}; +#endif + + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const 
typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const int packetSize = internal::unpacket_traits::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType p = reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizePacket(accum, p); + } +}; + } // end namespace internal @@ -52,8 +121,8 @@ class TensorReductionOp : public TensorBase typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -85,20 +154,27 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; static const int NumInputDims = internal::array_size::Dimensions>::value; static const int NumReducedDims = internal::array_size::value; - static const int NumDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims; - typedef DSizes Dimensions; + static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 
1 : NumInputDims - NumReducedDims; + typedef DSizes Dimensions; typedef typename XprType::Scalar Scalar; + typedef TensorEvaluator, Device> Self; + static const bool InputPacketAccess = TensorEvaluator::PacketAccess; enum { IsAligned = false, - PacketAccess = false, // The code isn't vectorized properly yet + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()) { EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Bitmap indicating if an input dimension is reduced or not. array reduced; for (int i = 0; i < NumInputDims; ++i) { reduced[i] = false; @@ -122,24 +198,41 @@ struct TensorEvaluator, Device> } } - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + // Precompute output strides. + if (Layout == ColMajor) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } } - array strides; - strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - strides[i] = strides[i-1] * input_dims[i-1]; + // Precompute input strides. + array input_strides; + if (Layout == ColMajor) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } } + outputIndex = 0; reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { if (reduced[i]) { - m_reducedStrides[reduceIndex] = strides[i]; + m_reducedStrides[reduceIndex] = input_strides[i]; ++reduceIndex; } else { - m_preservedStrides[outputIndex] = strides[i]; + m_preservedStrides[outputIndex] = input_strides[i]; ++outputIndex; } } @@ -147,6 +240,7 @@ struct TensorEvaluator, Device> // Special case for full reductions if (NumInputDims == NumReducedDims) { m_dimensions[0] = 1; + m_preservedStrides[0] = internal::array_prod(input_dims); } } @@ -161,14 +255,22 @@ struct TensorEvaluator, Device> m_impl.cleanup(); } - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { Op reducer(m_reducer); - reduce(firstInput(index), 0, reducer); - return reducer.finalize(); + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (Layout == ColMajor) ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } } // TODO(bsteiner): provide a more efficient implementation. @@ -179,9 +281,20 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); - EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < packetSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -190,34 +303,59 @@ struct TensorEvaluator, Device> Scalar* data() const { return NULL; } private: + template friend struct internal::GenericDimReducer; + template friend struct internal::InnerMostDimReducer; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; + if (ReducingInnerMostDims) { + if (Layout == ColMajor) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumOutputDims - 1]; + } + } + Index startInput = 0; + if (Layout == ColMajor) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[NumOutputDims - 1]; } - startInput += index * m_preservedStrides[0]; return startInput; } - EIGEN_DEVICE_FUNC void reduce(Index firstIndex, int DimIndex, Op& reducer) const { - for (int j = 0; j < m_reducedDims[DimIndex]; ++j) { - const Index input = firstIndex + j * m_reducedStrides[DimIndex]; - if (DimIndex < NumReducedDims-1) { - reduce(input, DimIndex+1, reducer); - } else { - reducer.reduce(m_impl.coeff(input)); - } - } - } - + // Dimensions of the output of the operation. Dimensions m_dimensions; - array m_outputStrides; - array m_preservedStrides; + // Precomputed strides for the output tensor. + array m_outputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. 
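  // Illustrative example (hypothetical shapes): reducing dimension {1} of a
  // ColMajor tensor of size [3, 5, 7] yields an output of size [3, 7]. The
  // ColMajor input strides are {1, 3, 15}, so m_preservedStrides becomes
  // {1, 15}, m_reducedStrides becomes {3}, and m_reducedDims becomes {5}.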
+ array m_preservedStrides; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. array m_reducedDims; + + // Evaluator for the input expression. TensorEvaluator m_impl; + + // Operation to apply for computing the reduction. Op m_reducer; }; diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp new file mode 100644 index 000000000..8276ae822 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -0,0 +1,78 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include + +static void test_default() +{ + Tensor vec(6); + vec.setRandom(); + + // Fixme: we should check that the generated numbers follow a uniform + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + +static void test_normal() +{ + Tensor vec(6); + vec.setRandom>(); + + // Fixme: we should check that the generated numbers follow a gaussian + // distribution instead. + for (int i = 1; i < 6; ++i) { + VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1)); + } +} + + +struct MyGenerator { + MyGenerator() { } + MyGenerator(const MyGenerator&) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const { + return 3 * element_location; + } + + // Same as above but generates several numbers at a time. + typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + const int packetSize = internal::packet_traits::size; + EIGEN_ALIGN_DEFAULT int values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = 3 * (packet_location + i); + } + return internal::pload::type>(values); + } +}; + + +static void test_custom() +{ + Tensor vec(6); + vec.setRandom(); + + for (int i = 0; i < 6; ++i) { + VERIFY_IS_EQUAL(vec(i), 3*i); + } +} + +void test_cxx11_tensor_random() +{ + CALL_SUBTEST(test_default()); + CALL_SUBTEST(test_normal()); + CALL_SUBTEST(test_custom()); +} From 5692723c588c219bca9523962a4620fe7cc4c4c9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 11:42:52 -0800 Subject: [PATCH 142/214] Improved the performance of the contraction code on CUDA --- .../CXX11/src/Tensor/TensorContractionCuda.h | 2060 +++++++++-------- 1 file changed, 1119 insertions(+), 941 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index babe33fff..f6bd949bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1,7 +1,9 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -19,7 +21,7 @@ template +template __global__ void __launch_bounds__(512) - EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ volatile Scalar lhs_shmem[72 * 64]; - __shared__ volatile Scalar rhs_shmem[72 * 64]; +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ volatile Scalar lhs_shmem[72 * 64]; + __shared__ volatile Scalar rhs_shmem[72 * 64]; - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, 
rhs_horiz0); + } } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. 
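    // Each koff iteration is one rank-1 update of the per-thread 4x4
    // accumulator tile: two float2 loads supply four lhs rows and two
    // float2 loads supply four rhs columns, so after 16 iterations the
    // whole 16-deep k-slice has been folded into results[0..3].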
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
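    // Both m and n can run past the tensor edges here, so each of the 16
    // scalar stores is predicated individually; e.g. (hypothetical sizes) a
    // 65x66 result on a grid of 64x64 blocks reaches this path in its
    // bottom-right block.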
+ for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z 
=lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = 
rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
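    // Each thread therefore accumulates a 4x8 output tile: results[i] holds
    // rows lhs_vert .. lhs_vert+3 of output column start_feature + i.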
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. 
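    // Worst case for this path is 32 predicated scalar stores per thread,
    // but only a block straddling both the m edge and the n edge ever takes
    // it, so the overhead is confined to at most one block of the grid.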
+ for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + bool check_lhs64 = (base_m + 63) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType 
PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = max_n_1::size; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; } } - - - template -__device__ EIGEN_STRONG_INLINE void - EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float4* lhs_shmem4, float2* rhs_shmem2, - const Index m_size, const Index n_size, const Index k_size) { - typedef float Scalar; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - const Index lane = threadIdx.x + 8 * (threadIdx.y % 4); - - // prefetch registers - float4 lhs_pf0; - float4 lhs_pf1; - - float4 rhs_pf0; - float4 rhs_pf1; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // all of these indices assume float4 loading - // this thread loads the float4 starting at this index, and then also loads - // another float4 starting 32 columns to to the right - const Index horiz_block_idx = threadIdx.z / 2; - const Index vert_block_idx = threadIdx.x / 2 + 4 * (threadIdx.y % 2); - const Index horiz_idx_in_block = threadIdx.y / 2 + 4 * (threadIdx.z % 2); - const Index vert_idx_in_block = threadIdx.x % 2; - - // there's padding in both the LHS and RHS shared memory layouts. This padding - // allows for 0 bank conflicts on all shmem stores and loads. 
- // LHS padding: 1 float4 on each 8x8 block of floats - // RHS padding: 1 float2 on each block, and 12 additional float2s between vertical blocks - // 3 and 4 - - // storage indices - // lhs index with respect to float4s - const Index lhs_store_idx_base = - 136 * horiz_block_idx + - 17 * vert_block_idx + - 8 * vert_idx_in_block + - horiz_idx_in_block; - - // rhs index with respect to floats - const Index rhs_store_idx_base = - 552 * horiz_block_idx + - 66 * vert_block_idx + - 32 * (horiz_idx_in_block / 4) + (horiz_idx_in_block % 4) + - 16 * vert_idx_in_block + - ((vert_block_idx < 4) ? 0 : 24); - - const Index lhs_store_idx_0 = lhs_store_idx_base + 544 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 544 * 1; - - const Index rhs_store_idx_0 = (rhs_store_idx_base / 2) + ((lane < 16) ? 0 : 4); - const Index rhs_store_idx_1 = rhs_store_idx_0 + 2; - const Index rhs_store_idx_2 = rhs_store_idx_0 + 1104; - const Index rhs_store_idx_3 = rhs_store_idx_1 + 1104; - - // The below diagrams show which shmem index (with respect to floats) each element - // in an 8x8 input block gets packed into: - // LHS: - // 0 4 8 12 16 20 24 28 - // 1 5 9 13 17 21 25 29 - // 2 6 10 14 18 22 26 30 - // 3 7 11 15 19 23 27 31 - // 32 36 40 44 48 52 56 60 - // ... (pack as 2 rows of float4 indexed row major, each float4 is vertical) - // - // RHS: - // 0 1 2 3 32 33 34 35 - // 4 5 6 7 36 37 38 39 - // ... (pack as 2 cols of float4 indexed col major, each float4 is horizontal) - - // Each thread in a warp loads 2 float4s. This happens in 2 instructions. On each of these - // instruction, the warp loads 2 columns (2 cols * 64 elements / col = 128 elements = 32 threads - // * 4 elements/thread). For the LHS, we're able to store the loaded float4 directly into - // shmem (using a 128 bit store instruction). For the RHS, we need to transpose the data. - // This is done with warp shuffles. Furthermore, we only use 64 bit stores for the RHS, because - // 64 bits is only 2 columns (which is all we load in a warp), and the padding for the RHS - // doesn't meet 64 bit alignment requirements (namely, the 4 consecutive floats that we want - // to load on the RHS are 8 byte aligned, not 16 byte aligned, which is required for float4). 
- - const Index load_idx_vert = 4 * (threadIdx.x + 8 * (threadIdx.y % 2)); - const Index load_idx_horiz = (threadIdx.y / 2) + 4 * threadIdx.z; - - const Index lhs_vert = base_m + load_idx_vert; - const Index rhs_horiz_0 = base_n + load_idx_horiz; - const Index rhs_horiz_1 = base_n + load_idx_horiz + 32; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = internal::pset1(0); \ - lhs_pf1 = internal::pset1(0); \ - \ - rhs_pf0 = internal::pset1(0); \ - rhs_pf1 = internal::pset1(0); \ - \ - const Index lhs_horiz_0 = base_k + load_idx_horiz; \ - const Index lhs_horiz_1 = base_k + load_idx_horiz + 32; \ - if (!needs_edge_check || lhs_vert + 3 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs.loadPacket(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs.loadPacket(lhs_vert, lhs_horiz_0); \ - } \ - } else if (lhs_vert + 2 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ - \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ - lhs_pf1.z = lhs(lhs_vert + 2, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - lhs_pf0.z = lhs(lhs_vert + 2, lhs_horiz_0); \ - } \ - } else if (lhs_vert + 1 < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - lhs_pf1.y = lhs(lhs_vert + 1, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf0.y = lhs(lhs_vert + 1, lhs_horiz_0); \ - } \ - } else if (lhs_vert < m_size) { \ - if (lhs_horiz_1 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - lhs_pf1.x = lhs(lhs_vert + 0, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0.x = lhs(lhs_vert + 0, lhs_horiz_0); \ - } \ -} \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (rhs_vert + 3 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz_0); \ - } \ - } else if (rhs_vert + 2 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ - \ - rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz_0); \ - } \ - } else if (rhs_vert + 1 < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - \ - rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz_0); \ - } \ - } else if (rhs_vert < k_size) { \ - if (!needs_edge_check || rhs_horiz_1 < 
n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - rhs_pf1.x = rhs(rhs_vert + 0, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0.x = rhs(rhs_vert + 0, rhs_horiz_0); \ - } \ -} \ - \ - float swap_val0 = (lane < 16) ? rhs_pf0.z : rhs_pf0.x; \ - float swap_val1 = (lane < 16) ? rhs_pf0.w : rhs_pf0.y; \ - float swap_val2 = (lane < 16) ? rhs_pf1.z : rhs_pf1.x; \ - float swap_val3 = (lane < 16) ? rhs_pf1.w : rhs_pf1.y; \ - \ - swap_val0 = __shfl_xor(swap_val0, 16); \ - swap_val1 = __shfl_xor(swap_val1, 16); \ - swap_val2 = __shfl_xor(swap_val2, 16); \ - swap_val3 = __shfl_xor(swap_val3, 16); \ - \ - if (lane < 16) { \ - rhs_pf0.z = swap_val0; \ - rhs_pf0.w = swap_val1; \ - rhs_pf1.z = swap_val2; \ - rhs_pf1.w = swap_val3; \ - } else { \ - rhs_pf0.x = swap_val0; \ - rhs_pf0.y = swap_val1; \ - rhs_pf1.x = swap_val2; \ - rhs_pf1.y = swap_val3; \ - } \ -} \ - - -#define writeRegToShmem(_) \ - lhs_shmem4[lhs_store_idx_0] = lhs_pf0; \ - \ - rhs_shmem2[rhs_store_idx_0] = make_float2(rhs_pf0.x, rhs_pf0.z); \ - rhs_shmem2[rhs_store_idx_1] = make_float2(rhs_pf0.y, rhs_pf0.w); \ - \ - lhs_shmem4[lhs_store_idx_1] = lhs_pf1; \ - \ - rhs_shmem2[rhs_store_idx_2] = make_float2(rhs_pf1.x, rhs_pf1.z); \ - rhs_shmem2[rhs_store_idx_3] = make_float2(rhs_pf1.y, rhs_pf1.w); \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = Scalar(0); \ - Scalar res(i, 1) = Scalar(0); \ - Scalar res(i, 2) = Scalar(0); \ - Scalar res(i, 3) = Scalar(0); \ - Scalar res(i, 4) = Scalar(0); \ - Scalar res(i, 5) = Scalar(0); \ - Scalar res(i, 6) = Scalar(0); \ - Scalar res(i, 7) = Scalar(0); \ - - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - -#undef prefetchIntoRegisters -#undef writeRegoToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). - - float4 _lcol0; - float4 _lcol1; - float2 _rrow0; - float2 _rrow1; - float2 _rrow2; - float2 _rrow3; - -#define lcol0 _lcol0.x -#define lcol1 _lcol0.y -#define lcol2 _lcol0.z -#define lcol3 _lcol0.w -#define lcol4 _lcol1.x -#define lcol5 _lcol1.y -#define lcol6 _lcol1.z -#define lcol7 _lcol1.w -#define rrow0 _rrow0.x -#define rrow1 _rrow0.y -#define rrow2 _rrow1.x -#define rrow3 _rrow1.y -#define rrow4 _rrow2.x -#define rrow5 _rrow2.y -#define rrow6 _rrow3.x -#define rrow7 _rrow3.y - - // Now x corresponds to k, y to m, and z to n - const float4* lhs_block = &lhs_shmem4[threadIdx.x + 8 * (threadIdx.y % 2) + 17 * (threadIdx.y / 2)]; - const float2* rhs_block = &rhs_shmem2[2 * threadIdx.x + 16 * (threadIdx.z % 2) + 276 * (threadIdx.z / 2)]; - -#define lhs_element(i, k) lhs_block[68 * i + 136 * k] -#define rhs_element(k, j) rhs_block[33 * k + 1104 * j + ((k < 4) ? 
0 : 12)] - -#define loadData(i) \ - _lcol0 = lhs_element(0, i); \ - _rrow0 = rhs_element(i, 0); \ - _rrow1 = *(&(rhs_element(i, 0)) + 1); \ - _lcol1 = lhs_element(1, i); \ - _rrow2 = rhs_element(i, 1); \ - _rrow3 = *(&(rhs_element(i, 1)) + 1); \ - -#define computeCol(j) \ - res(0, j) += lcol0 * rrow##j; \ - res(1, j) += lcol1 * rrow##j; \ - res(2, j) += lcol2 * rrow##j; \ - res(3, j) += lcol3 * rrow##j; \ - res(4, j) += lcol4 * rrow##j; \ - res(5, j) += lcol5 * rrow##j; \ - res(6, j) += lcol6 * rrow##j; \ - res(7, j) += lcol7 * rrow##j; \ - -#define computePass(i) \ - loadData(i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol0 -#undef lcol1 -#undef lcol2 -#undef lcol3 -#undef lcol4 -#undef lcol5 -#undef lcol6 -#undef lcol7 -#undef rrow0 -#undef rrow1 -#undef rrow2 -#undef rrow3 -#undef rrow4 -#undef rrow5 -#undef rrow6 -#undef rrow7 - -#undef computePass -#undef computeCol -#undef loadData -#undef lhs_element -#undef rhs_element - - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (3) Copies the values into new registers using conditional logic. 
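  // This kernel takes option (3): the switch on threadIdx.x below copies the
  // selected row of accumulators into val0..val7, after which each thread
  // performs its own guarded stores to global memory.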
- -#define makeAssignments(i) \ - val0 = res(i, 0); \ - val1 = res(i, 1); \ - val2 = res(i, 2); \ - val3 = res(i, 3); \ - val4 = res(i, 4); \ - val5 = res(i, 5); \ - val6 = res(i, 6); \ - val7 = res(i, 7); \ - - Scalar val0; - Scalar val1; - Scalar val2; - Scalar val3; - Scalar val4; - Scalar val5; - Scalar val6; - Scalar val7; - - switch (threadIdx.x) { - case 0: - makeAssignments(0); - break; - case 1: - makeAssignments(1); - break; - case 2: - makeAssignments(2); - break; - case 3: - makeAssignments(3); - break; - case 4: - makeAssignments(4); - break; - case 5: - makeAssignments(5); - break; - case 6: - makeAssignments(6); - break; - case 7: - makeAssignments(7); - break; - } - -#undef res - - const Index vert_base = base_m + 4 * threadIdx.y + (threadIdx.x % 4) + 32 * (threadIdx.x / 4); - const Index horiz_base = base_n + 4 * threadIdx.z; - - if (!needs_edge_check || vert_base < m_size) { - if (!needs_edge_check || horiz_base + 35 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - output(vert_base, horiz_base + 34) = val6; - output(vert_base, horiz_base + 35) = val7; - } else if (horiz_base + 34 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - output(vert_base, horiz_base + 34) = val6; - } else if (horiz_base + 33 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - output(vert_base, horiz_base + 33) = val5; - } else if (horiz_base + 32 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - output(vert_base, horiz_base + 32) = val4; - } else if (horiz_base + 3 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - output(vert_base, horiz_base + 3) = val3; - } else if (horiz_base + 2 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - output(vert_base, horiz_base + 2) = val2; - } else if (horiz_base + 1 < n_size) { - output(vert_base, horiz_base + 0) = val0; - output(vert_base, horiz_base + 1) = val1; - } else if (horiz_base < n_size) { - output(vert_base, horiz_base + 0) = val0; - } - } - } - - - template -__global__ void - __launch_bounds__(512) - EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float4 lhs_shmem[(68 * 64) / 4]; - __shared__ float2 rhs_shmem[((66 * 8 + 24) * 8) / 2]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenFloatContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenFloatContractionKernelInternal(lhs, rhs, 
output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } - } - - - template - struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Packet Packet; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; - - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); } else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } + evalTyped(buffer); } } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; - // rows in left side - const Index m = this->m_i_size; + // rows in left side + const Index m = 
this->m_i_size; - // columns in right side - const Index n = this->m_j_size; + // columns in right side + const Index n = this->m_j_size; - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - typedef internal::TensorContractionInputMapper LhsMapper; + typedef internal::TensorContractionInputMapper LhsMapper; - typedef internal::TensorContractionInputMapper RhsMapper; + typedef internal::TensorContractionInputMapper RhsMapper; - typedef internal::blas_data_mapper OutputMapper; + typedef internal::blas_data_mapper OutputMapper; - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); - OutputMapper output(buffer, m); + OutputMapper output(buffer, m); + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + if (internal::is_same::value && + internal::is_same::value) { + if (m < 768 || n < 768) { const Index m_blocks = (m + 63) / 64; const Index n_blocks = (n + 63) / 64; const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - - cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); - if (internal::is_same::value && - internal::is_same::value) { - EigenFloatContractionKernel - <<m_device.stream()>>>(lhs, rhs, output, m, n, k); - } else { - EigenContractionKernel - <<m_device.stream()>>>(lhs, rhs, output, m, n, k); - } - - assert(cudaGetLastError() == cudaSuccess); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); } - }; + } else { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k); + } + } +}; } // end namespace Eigen #endif // EIGEN_USE_GPU and __CUDACC__ - #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H From 0a0ab6dd158e3f4471ba1fe20454de35b18fdce5 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 11:45:17 -0800 Subject: [PATCH 143/214] Increased the functionality of the tensor devices --- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index bb05e4177..efd207507 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -43,11 +43,14 @@ typedef std::promise Promise; static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) { f->wait(); - // eigen_assert(f->ready()); +} +static EIGEN_STRONG_INLINE void get_when_ready(Future* f) { + f->get(); } + struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { } + ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -79,9 +82,9 @@ struct ThreadPoolDevice { } private: - // todo: NUMA, ... size_t num_threads_; }; + #endif @@ -114,6 +117,10 @@ static inline int sharedMemPerBlock() { return m_deviceProperties.sharedMemPerBlock; } +static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + assert(status == cudaSuccess); +} struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. @@ -163,10 +170,19 @@ struct GpuDevice { return 32; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { + cudaStreamSynchronize(*stream_); + } + private: // TODO: multigpu. const cudaStream_t* stream_; }; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + #endif } // end namespace Eigen From 71676eaddd7fb6b8abdc5713f437750f3c963fcb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:36:57 -0800 Subject: [PATCH 144/214] Added support for RowMajor inputs to the contraction code. --- .../CXX11/src/Tensor/TensorContraction.h | 267 ++++++++++++------ .../CXX11/src/Tensor/TensorContractionCuda.h | 6 +- .../src/Tensor/TensorContractionThreadPool.h | 39 ++- 3 files changed, 219 insertions(+), 93 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c5ec42cf4..a02a273e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -320,6 +320,8 @@ class TensorContractionInputMapper }; + + template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + template struct traits > { @@ -378,6 +388,10 @@ struct traits > typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + // From NumDims below. 
+ static const int NumDimensions = max_n_1::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int Layout = traits::Layout; + enum { Flags = 0, }; @@ -401,19 +415,19 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; }; } // end namespace internal - - template class TensorContractionOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; typedef typename internal::promote_storage_type::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } protected: typename LhsXprType::Nested m_lhs_xpr; @@ -444,12 +459,17 @@ class TensorContractionOp : public TensorBase struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} template @@ -467,37 +487,94 @@ struct TensorContractionEvaluatorBase typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - - typedef DSizes Dimensions; - enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) - { + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = internal::max_n_1::size; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == + TensorEvaluator::Layout), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); - array::Dimensions::count> lhs_strides; - lhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (Layout == ColMajor) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
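+        // Illustrative example, not part of the original patch: with
+        // LDims == RDims == 3 and an original pair (first=0, second=1),
+        // the loop below yields (first = 3-1-1 = 1, second = 3-1-0 = 2).
+        // Swapping .first and .second mirrors the LHS/RHS exchange done
+        // above, and the "rank-1-i" mapping mirrors the reversed dimension
+        // order.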
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } } - array::Dimensions::count> rhs_strides; + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; rhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } m_i_strides[0] = 1; @@ -515,27 +592,28 @@ struct TensorContractionEvaluatorBase m_lhs_inner_dim_contiguous = true; int dim_idx = 0; int nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + + for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor bool contracting = false; - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].first == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { contracting = true; break; } } if (!contracting) { // add dimension size to output dimensions - m_dimensions[dim_idx] = left_dims[i]; + m_dimensions[dim_idx] = eval_left_dims[i]; m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; if (dim_idx != i) { m_lhs_inner_dim_contiguous = false; } if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; } else { - m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; } dim_idx++; nocontract_idx++; @@ -543,22 +621,22 @@ struct TensorContractionEvaluatorBase } nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + for (int i = 0; i < RDims; i++) { bool contracting = false; // find if we are contracting on index i of right tensor - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].second == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { contracting = true; break; } } if (!contracting) { - m_dimensions[dim_idx] = right_dims[i]; + m_dimensions[dim_idx] = eval_right_dims[i]; if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; } else { - m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; } m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; dim_idx++; @@ -573,12 +651,13 @@ struct TensorContractionEvaluatorBase // each tensor, we'll only look at the first tensor here. 
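    // Added commentary: the three flags computed here
    // (m_lhs_inner_dim_contiguous above, plus m_rhs_inner_dim_contiguous and
    // m_rhs_inner_dim_reordered below) drive the evalTyped<bool, bool, bool>
    // dispatch in evalTo(). For example, contracting on the pairs (0,1) and
    // (1,0) leaves the right indices 1,0 in decreasing order, so
    // m_rhs_inner_dim_reordered becomes true.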
m_rhs_inner_dim_contiguous = true; m_rhs_inner_dim_reordered = false; - for (int i = 0; i < internal::array_size::value; i++) { - Index left = op.indices()[i].first; - Index right = op.indices()[i].second; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; - Index size = left_dims[left]; - eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); if (i+1 < internal::array_size::value) { m_k_strides[i+1] = m_k_strides[i] * size; @@ -588,7 +667,7 @@ struct TensorContractionEvaluatorBase m_left_contracting_strides[i] = lhs_strides[left]; m_right_contracting_strides[i] = rhs_strides[right]; - if (i > 0 && right < op.indices()[i-1].second) { + if (i > 0 && right < eval_op_indices[i-1].second) { m_rhs_inner_dim_reordered = true; } if (right != i) { @@ -597,9 +676,16 @@ struct TensorContractionEvaluatorBase } // Scalar case. We represent the result as a 1d tensor of size 1. - if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (LDims + RDims == 2 * ContractDims) { m_dimensions[0] = 1; } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (Layout == RowMajor) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -661,10 +747,10 @@ struct TensorContractionEvaluatorBase const Index rows = m_i_size; const Index cols = m_k_size; - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; typedef internal::TensorContractionInputMapper m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; }; +// evaluator for default device template struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase, Device> > { + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; @@ -759,15 +846,35 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + enum { + Layout = TensorEvaluator::Layout, + }; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
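+  // A sketch of the underlying reasoning (added commentary): a RowMajor
+  // tensor is, byte for byte, the ColMajor layout of its dimension-reversed
+  // counterpart, and C = A * B implies C^T = B^T * A^T. Evaluating with the
+  // operands exchanged and every dimension list reversed therefore produces
+  // the RowMajor result directly.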
+ typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = internal::max_n_1::size; + + // Could we use NumDimensions here? typedef DSizes Dimensions; @@ -799,15 +906,15 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; const Index nr = Traits::nr; const Index mr = Traits::mr; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; @@ -826,10 +933,10 @@ struct TensorEvaluator OutputMapper; - // Declare GEBP packing and kernel structs internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; // initialize data mappers diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f6bd949bd..588770bb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1241,10 +1241,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f0e9bb616..5851e5adc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -70,24 +70,43 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + enum { + Layout = TensorEvaluator::Layout, + }; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} From b12dd1ae3cc4077740dded430bc244623a6cc3b8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:39:34 -0800 Subject: [PATCH 145/214] Misc improvements for fixed size tensors --- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 32 +++++++++++++++---- unsupported/test/cxx11_tensor_fixed_size.cpp | 13 +++++--- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 1af2d7bcd..94b3f957b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,7 +42,9 @@ class TensorFixedSize : public TensorBase::size > 1), - }; + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + }; typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; @@ -51,11 +53,12 @@ class TensorFixedSize : public TensorBase m_storage; public: - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED // work, because that uses base().coeffRef() - and we don't yet @@ -187,6 +190,23 @@ class TensorFixedSize : public TensorBase Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp index 99ffc7f07..8a27f5ad8 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,13 +32,14 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, Sizes<6>()); + TensorMap > > vec3(data3, 6); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, Sizes<6>()); + TensorMap, RowMajor> > vec4(data4, 6); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); + VERIFY_IS_EQUAL(vec3.rank(), 1); // VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6); // VERIFY_IS_EQUAL((vec3.dimension(0)), 6); @@ -68,11 +69,12 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1, Sizes<2, 3>()); + TensorMap >> mat1(data1,2,3); float data2[6]; - TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); + TensorMap, RowMajor>> mat2(data2,2,3); VERIFY_IS_EQUAL((mat1.size()), 2*3); + VERIFY_IS_EQUAL(mat1.rank(), 2); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); @@ -120,6 +122,7 @@ static void test_3d() TensorFixedSize, RowMajor> mat2; VERIFY_IS_EQUAL((mat1.size()), 2*3*7); + VERIFY_IS_EQUAL(mat1.rank(), 3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); // VERIFY_IS_EQUAL((mat1.dimension(1)), 3); // VERIFY_IS_EQUAL((mat1.dimension(2)), 7); @@ -166,7 +169,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array{{i,j,k}}) = val; + mat1(i,j,k) = val; val += 1.0; } } From 7e0b6c56b45be9adf002e59f97902c8a760519af Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:41:30 -0800 Subject: [PATCH 146/214] Added ability to initialize a tensor using an initializer list --- .../CXX11/src/Tensor/TensorInitializer.h | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) 
create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 000000000..6afef0fbb --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + +#include + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. + */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list< + typename Initializer::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (auto v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (auto v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + // Static initialization not implemented for VarDims tensors. 
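+    // Added commentary: the recursion above peels off one
+    // std::initializer_list nesting level per compile-time dimension. A
+    // tensor whose rank is not a compile-time constant provides no fixed
+    // recursion depth, so the only option here is to fail at run time.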
+ eigen_assert(false); + } +}; + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H From 1a36590e8475f688ef42122c0dd96f7a3b89654e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:43:20 -0800 Subject: [PATCH 147/214] Fixed the printing of RowMajor tensors --- unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 15 ++++- unsupported/test/cxx11_tensor_io.cpp | 58 ++++++++++++++++--- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index 959b5db73..a9d0f6c39 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -12,6 +12,14 @@ namespace Eigen { +namespace internal { +template<> +struct significant_decimals_impl + : significant_decimals_default_impl +{}; +} + + template std::ostream& operator << (std::ostream& os, const TensorBase& expr) { // Evaluate the expression if needed @@ -19,18 +27,19 @@ std::ostream& operator << (std::ostream& os, const TensorBase, DefaultDevice> tensor(eval, DefaultDevice()); tensor.evalSubExprsIfNeeded(NULL); - typedef typename T::Scalar Scalar; + typedef typename internal::remove_const::type Scalar; typedef typename T::Index Index; typedef typename TensorEvaluator, DefaultDevice>::Dimensions Dimensions; const Index total_size = internal::array_prod(tensor.dimensions()); // Print the tensor as a 1d vector or a 2d matrix. 
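  // Illustrative example (added commentary): the rank-1 branch below prints a
  // flat vector, while higher ranks are reshaped to a
  // first_dim x (total_size / first_dim) matrix; e.g. a hypothetical 2x3x4
  // tensor would print as a 2x12 matrix.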
if (internal::array_size::value == 1) { - Map > array(tensor.data(), total_size); + Map > array(const_cast(tensor.data()), total_size); os << array; } else { const Index first_dim = tensor.dimensions()[0]; - Map > matrix(tensor.data(), first_dim, total_size/first_dim); + static const int layout = TensorEvaluator, DefaultDevice>::Layout; + Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); os << matrix; } diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index b73c024f5..8bbcf7089 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -13,9 +13,10 @@ #include +template static void test_output_1d() { - Tensor tensor(5); + Tensor tensor(5); for (int i = 0; i < 5; ++i) { tensor(i) = i; } @@ -28,9 +29,10 @@ static void test_output_1d() } +template static void test_output_2d() { - Tensor tensor(5, 3); + Tensor tensor(5, 3); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 3; ++j) { tensor(i, j) = i*j; @@ -45,10 +47,11 @@ static void test_output_2d() } +template static void test_output_expr() { - Tensor tensor1(5); - Tensor tensor2(5); + Tensor tensor1(5); + Tensor tensor2(5); for (int i = 0; i < 5; ++i) { tensor1(i) = i; tensor2(i) = 7; @@ -62,9 +65,50 @@ static void test_output_expr() } +template +static void test_output_string() +{ + Tensor tensor(5, 3); + tensor.setConstant(std::string("foo")); + + std::cout << tensor << std::endl; + + std::stringstream os; + os << tensor; + + std::string expected("foo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo\nfoo foo foo"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template +static void test_output_const() +{ + Tensor tensor(5); + for (int i = 0; i < 5; ++i) { + tensor(i) = i; + } + + TensorMap > tensor_map(tensor.data(), 5); + + std::stringstream os; + os << tensor_map; + + std::string expected("0\n1\n2\n3\n4"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + void test_cxx11_tensor_io() { - CALL_SUBTEST(test_output_1d()); - CALL_SUBTEST(test_output_2d()); - CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_1d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_2d()); + CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_expr()); + CALL_SUBTEST(test_output_string()); + CALL_SUBTEST(test_output_string()); + CALL_SUBTEST(test_output_const()); + CALL_SUBTEST(test_output_const()); } From 0526dc1bb4091c484f5a0dab71818f48c0d4fc5f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:44:08 -0800 Subject: [PATCH 148/214] Added missing apis to the tensor class --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 105 +++++++++++++++++--- 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index ceed09505..e125ca799 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -77,18 +77,20 @@ class Tensor : public TensorBase > enum { IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign), PacketAccess = (internal::packet_traits::size > 1), + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, }; static const int Options = Options_; - static const std::size_t NumIndices = NumIndices_; - - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& dimensions() const { return m_storage.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } @@ -153,6 +155,27 @@ class Tensor : public TensorBase > EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array(i0, i1, i2, i3, i4)); + } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const @@ -182,6 +205,27 @@ class Tensor : public TensorBase > EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) return operator()(array{{firstIndex, secondIndex, otherIndices...}}); } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array(i0, i1, i2, i3, i4)); + } #endif EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) @@ -223,6 +267,32 @@ class Tensor : public TensorBase > // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
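    // Illustrative example (added commentary): Tensor<float, 3> t(2, 3, 4)
    // satisfies the assertion below (one explicit size plus two variadic
    // ones match the rank of 3), while Tensor<float, 3> t(2, 3) would fail
    // to compile.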
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } +#else + inline explicit Tensor(Index dim1) + : m_storage(dim1, array(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif inline explicit Tensor(const array& dimensions) @@ -231,24 +301,24 @@ class Tensor : public TensorBase > EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) { typedef TensorAssignOp Assign; Assign assign(*this, other.derived()); resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); internal::TensorExecutor::run(assign, DefaultDevice()); } - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) @@ -297,7 +367,16 @@ class Tensor : public TensorBase > #endif } + void resize(const DSizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + protected: + bool checkIndexRange(const array& indices) const { using internal::array_apply_and_reduce; From 378bdfb7f0c4b2a8eb2b91c2a65f3bc1c57e689e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:45:20 -0800 Subject: [PATCH 149/214] Added missing apis to the TensorMap class --- .../Eigen/CXX11/src/Tensor/TensorMap.h | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 0a8c10ac7..2cb2bc7a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -48,6 +48,8 @@ template class TensorMap : public Tensor enum { IsAligned = ((int(Options_)&Aligned)==Aligned), PacketAccess = (internal::packet_traits::size > 1), + Layout = PlainObjectType::Layout, + CoordAccess = true, }; #ifdef EIGEN_HAS_VARIADIC_TEMPLATES @@ -62,13 +64,35 @@ template class TensorMap : public Tensor // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
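    // Added commentary: together with the overloads below, e.g. a
    // hypothetical TensorMap<Tensor<float, 2> > m(data, 2, 3) picks the
    // two-index constructor and the matching "2 == NumIndices || Dynamic"
    // assertion.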
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } #endif - template - inline TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + inline TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } + template + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } EIGEN_DEVICE_FUNC From 1ac86001266db55b78086617fb68206b29748919 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:47:46 -0800 Subject: [PATCH 150/214] Fixed the return type of coefficient wise operations. For example, the abs function returns a floating point value when called on a complex input. --- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 58 ++++++++++++- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 85 +++++++++++-------- 2 files changed, 106 insertions(+), 37 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f7c784942..97f225f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -34,9 +34,15 @@ struct TensorEvaluator typedef typename Derived::Packet PacketReturnType; typedef typename Derived::Dimensions Dimensions; + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? 
+ internal::traits::NumDimensions : 0; + enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -77,6 +83,24 @@ struct TensorEvaluator return internal::pstoret(m_data + index, x); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + if (Layout == ColMajor) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + eigen_assert(m_data); + if (Layout == ColMajor) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + Scalar* data() const { return m_data; } protected: @@ -97,9 +121,15 @@ struct TensorEvaluator typedef typename Derived::Packet PacketReturnType; typedef typename Derived::Dimensions Dimensions; + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? + internal::traits::NumDimensions : 0; + enum { IsAligned = Derived::IsAligned, PacketAccess = Derived::PacketAccess, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&) @@ -126,6 +156,17 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data); + const Index index = (Layout == ColMajor) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); +#ifdef __CUDA_ARCH__ + return __ldg(m_data+index); +#else + return m_data[index]; +#endif + } + const Scalar* data() const { return m_data; } protected: @@ -146,6 +187,8 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC @@ -194,6 +237,8 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -247,6 +292,8 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -254,7 +301,8 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } typedef typename XprType::Index Index; @@ -309,6 +357,8 @@ struct TensorEvaluator IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -316,8 +366,10 @@ 
struct TensorEvaluator m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) { - eigen_assert(internal::dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(internal::dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); } typedef typename XprType::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 6e5503de1..b66b3ec2c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -17,14 +17,14 @@ namespace Eigen { * * \brief Tensor expression classes. * - * The TensorCwiseNullaryOp class applies a nullary operators to an expression. This - * is typically used to generate constants. + * The TensorCwiseNullaryOp class applies a nullary operators to an expression. + * This is typically used to generate constants. * * The TensorCwiseUnaryOp class represents an expression where a unary operator * (e.g. cwiseSqrt) is applied to an expression. * - * The TensorCwiseBinaryOp class represents an expression where a binary operator - * (e.g. addition) is applied to a lhs and a rhs expression. + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. * */ namespace internal { @@ -33,9 +33,12 @@ struct traits > : traits { typedef typename XprType::Packet Packet; + typedef traits XprTraits; typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -47,7 +50,7 @@ struct traits > template -class TensorCwiseNullaryOp : public TensorBase > +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -81,12 +84,15 @@ template struct traits > : traits { - typedef typename result_of< - UnaryOp(typename XprType::Scalar) - >::type Scalar; + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename result_of::type Scalar; + typedef traits XprTraits; typedef typename internal::packet_traits::type Packet; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -106,14 +112,16 @@ struct nested, 1, typename eval -class TensorCwiseUnaryOp : public TensorBase > +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> { public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. 
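+  // Added commentary: deriving CoeffReturnType from the functor (see
+  // "typedef Scalar CoeffReturnType" below) is what lets, e.g., abs() on a
+  // complex-valued expression report a real Scalar, since
+  // result_of<UnaryOp(std::complex<float>)> resolves to the functor's actual
+  // return type rather than echoing the input type.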
typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -139,22 +147,27 @@ namespace internal { template struct traits > { - // Type promotion to handle the case where the types of the lhs and the rhs are different. + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. typedef typename result_of< - BinaryOp( - typename LhsXprType::Scalar, - typename RhsXprType::Scalar - ) - >::type Scalar; + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits XprTraits; typedef typename internal::packet_traits::type Packet; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; + typedef typename promote_storage_type< + typename traits::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits::Index, + typename traits::Index>::type Index; typedef typename LhsXprType::Nested LhsNested; typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -178,21 +191,22 @@ struct nested, 1, typename template -class TensorCwiseBinaryOp : public TensorBase > +class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> { public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
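+  // Added commentary: Scalar has already been promoted through
+  // result_of<BinaryOp(LhsScalar, RhsScalar)> in the traits above, so a
+  // mixed-type expression ends up with a single well-defined coefficient
+  // type.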
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } @@ -219,7 +233,8 @@ struct traits > : traits { typedef typename traits::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -227,6 +242,8 @@ struct traits > typedef typename IfXprType::Nested IfNested; typedef typename ThenXprType::Nested ThenNested; typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template From cd679f2c475b891b6498ae04ee8fdcd68c1cb589 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 14 Jan 2015 22:06:09 +0100 Subject: [PATCH 151/214] Fix doc: setConstant does not exist for SparseMatrix. --- doc/SparseQuickReference.dox | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index 4a33d0cc9..d04ac35c5 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -71,11 +71,10 @@ i.e either row major or column major. The default is column major. Most arithmet Constant or Random Insertion \code -sm1.setZero(); // Set the matrix with zero elements -sm1.setConstant(val); //Replace all the nonzero values with val +sm1.setZero(); \endcode - The matrix sm1 should have been created before ??? 
+Remove all non-zero coefficients From 0feff6e987750a61f0ee14774efaef85d2fb6fac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:29:48 -0800 Subject: [PATCH 152/214] Expanded the functionality of index lists --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 105 +++++++++++++- unsupported/test/cxx11_tensor_index_list.cpp | 131 ++++++++++++++++++ 2 files changed, 231 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index eaf0195ce..209749042 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -95,6 +95,20 @@ struct tuple_coeff { return ((i == Idx) & is_compile_time_constant >::type>::value) || tuple_coeff::value_known_statically(i, t); } + + template + static constexpr bool values_up_to_known_statically(const std::tuple& t) { + return is_compile_time_constant >::type>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + return is_compile_time_constant >::type>::value && + is_compile_time_constant >::type>::value && + std::get(t) > std::get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } }; template <> @@ -110,10 +124,20 @@ struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } + + template + static constexpr bool values_up_to_known_statically(const std::tuple& t) { + return is_compile_time_constant >::type>::value; + } + + template + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + return true; + } }; } // namespace internal @@ -133,6 +157,13 @@ struct IndexList : std::tuple { constexpr bool value_known_statically(const DenseIndex i) const { return internal::tuple_coeff >::value-1>::value_known_statically(i, *this); } + constexpr bool all_values_known_statically() const { + return internal::tuple_coeff >::value-1>::values_up_to_known_statically(*this); + } + + constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff >::value-1>::values_up_to_statically_known_to_increase(*this); + } }; @@ -144,6 +175,14 @@ constexpr IndexList make_index_list(FirstType val1, Ot namespace internal { +template size_t array_prod(const IndexList& sizes) { + size_t result = 1; + for (int i = 0; i < array_size >::value; ++i) { + result *= sizes[i]; + } + return result; +} + template struct array_size > { static const size_t value = std::tuple_size >::value; }; @@ -179,6 +218,48 @@ struct index_known_statically > { } }; +template +struct all_indices_known_statically { + constexpr bool operator() () const { + return false; + } +}; + +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically > { + constexpr bool operator() () const { + return IndexList().all_values_known_statically(); + } +}; + +template +struct indices_statically_known_to_increase { + constexpr bool operator() () const { + return false; + } +}; + +template +struct 
indices_statically_known_to_increase > { + constexpr bool operator() () const { + return IndexList().values_statically_known_to_increase(); + } +}; + +template +struct indices_statically_known_to_increase > { + constexpr bool operator() () const { + return IndexList().values_statically_known_to_increase(); + } +}; + template struct index_statically_eq { constexpr bool operator() (DenseIndex, DenseIndex) const { @@ -190,7 +271,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); + IndexList()[i] == value; } }; @@ -198,7 +279,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] == value); + IndexList()[i] == value; } }; @@ -213,7 +294,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] != value); + IndexList()[i] != value; } }; @@ -221,7 +302,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - (IndexList()[i] != value); + IndexList()[i] != value; } }; @@ -242,6 +323,20 @@ struct index_known_statically { } }; +template +struct all_indices_known_statically { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + +template +struct indices_statically_known_to_increase { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const { + return false; + } +}; + template struct index_statically_eq { EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index 6a103cab1..d79a3ed45 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -44,6 +44,120 @@ static void test_static_index_list() } +static void test_type2index_list() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + tensor += tensor.constant(10.0f); + + typedef Eigen::IndexList> Dims0; + typedef Eigen::IndexList, Eigen::type2index<1>> Dims1; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>> Dims2; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3; + typedef Eigen::IndexList, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4; + +#if 0 + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::are_inner_most_dims::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const Dims0 reduction_axis0; + Tensor result0 = tensor.sum(reduction_axis0); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + float expected = 0.0f; + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + VERIFY_IS_APPROX(result0(j,k,l,m), expected); + } + } + } + } + + const Dims1 reduction_axis1; + Tensor result1 = tensor.sum(reduction_axis1); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + float expected = 0.0f; + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + VERIFY_IS_APPROX(result1(k,l,m), expected); + } + } + } + + const Dims2 reduction_axis2; + Tensor result2 = tensor.sum(reduction_axis2); + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + float expected = 0.0f; + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + VERIFY_IS_APPROX(result2(l,m), expected); + } + } + + const Dims3 reduction_axis3; + Tensor result3 = tensor.sum(reduction_axis3); + for (int m = 0; m < 11; ++m) { + float expected = 0.0f; + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + VERIFY_IS_APPROX(result3(m), expected); + } + + const Dims4 reduction_axis4; + Tensor result4 = tensor.sum(reduction_axis4); + float expected = 0.0f; + for (int m = 0; m < 11; ++m) { + for (int l = 0; l < 7; ++l) { + for (int k = 0; k < 5; ++k) { + for (int j = 0; j < 3; ++j) { + for (int i = 0; i < 2; ++i) { + expected += tensor(i,j,k,l,m); + } + } + } + } + } + VERIFY_IS_APPROX(result4(0), expected); +} + + static void test_dynamic_index_list() { Tensor tensor(2,3,5,7); @@ -105,10 +219,25 @@ static void test_mixed_index_list() EIGEN_STATIC_ASSERT((internal::index_known_statically()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif + typedef IndexList, type2index<1>, type2index<2>, type2index<3>> ReductionList; + ReductionList reduction_list; + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + 
EIGEN_STATIC_ASSERT((internal::index_statically_eq()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::index_statically_eq()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#if 0 + EIGEN_STATIC_ASSERT((internal::all_indices_known_statically()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE); +#endif Tensor result1 = tensor.sum(reduction_axis); Tensor result2 = tensor.sum(reduction_indices); + Tensor result3 = tensor.sum(reduction_list); float expected = 0.0f; for (int i = 0; i < 2; ++i) { @@ -122,12 +251,14 @@ static void test_mixed_index_list() } VERIFY_IS_APPROX(result1(0), expected); VERIFY_IS_APPROX(result2(0), expected); + VERIFY_IS_APPROX(result3(0), expected); } void test_cxx11_tensor_index_list() { CALL_SUBTEST(test_static_index_list()); + CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); } From 4cdf3fe427b4fdc271733d0404a66e2d5613cb16 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:30:47 -0800 Subject: [PATCH 153/214] Misc fixes --- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 136 ++++++------------ 1 file changed, 44 insertions(+), 92 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 6d9e09318..6c9a67c58 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -40,6 +40,10 @@ template struct IndexPair { // Boilerplate code namespace internal { +template struct dget { + static const std::size_t value = get::value; +}; + template struct fixed_size_tensor_index_linearization_helper @@ -49,7 +53,7 @@ struct fixed_size_tensor_index_linearization_helper const Dimensions& dimensions) { return array_get(indices) + - get::value * + dget::value * fixed_size_tensor_index_linearization_helper::run(indices, dimensions); } }; @@ -75,6 +79,10 @@ struct Sizes : internal::numeric_list { typedef internal::numeric_list Base; static const std::size_t total_size = internal::arg_prod(Indices...); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { + return Base::count; + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() { return internal::arg_prod(Indices...); } @@ -85,6 +93,7 @@ struct Sizes : internal::numeric_list { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES + template Sizes(DenseIndex... 
indices) { }
+  explicit Sizes(std::initializer_list /*l*/) {
+    // todo: add assertion
+  }
@@ -121,11 +130,15 @@ struct non_zero_size<0> {
   typedef internal::null_type type;
 };

-template struct Sizes : typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type {
+template struct Sizes {
   typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base;
   static const size_t count = Base::count;
   static const std::size_t total_size = internal::arg_prod::value;

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return count;
+  }
+
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
     return internal::arg_prod::value;
   }
@@ -160,11 +173,11 @@
   template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this));
+    return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this));
   }
   template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this));
+    return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this));
   }
 };
@@ -208,6 +221,10 @@ struct DSizes : array {
   typedef array Base;
   static const std::size_t count = NumDims;

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+    return NumDims;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
     return internal::array_prod(*static_cast(this));
   }
@@ -219,31 +236,44 @@ struct DSizes : array {
   }
   EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { }
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+  template EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes...
otherDimensions) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + (*this) = array{{firstDimension, otherDimensions...}}; + } +#else EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); (*this)[0] = i0; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); (*this)[0] = i0; (*this)[1] = i1; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; } EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; (*this)[4] = i4; } +#endif DSizes& operator = (const array& other) { *static_cast(this) = other; @@ -287,84 +317,6 @@ struct tensor_vsize_index_linearization_helper }; } // end namespace internal -template -struct VSizes : std::vector { - typedef std::vector Base; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const { - return internal::array_prod(*static_cast(this)); - } - - EIGEN_DEVICE_FUNC VSizes() { } - EIGEN_DEVICE_FUNC explicit VSizes(const std::vector& a) : Base(a) { } - - template - EIGEN_DEVICE_FUNC explicit VSizes(const array& a) { - this->resize(NumDims); - for (int i = 0; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } - - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0) { - this->resize(1); - (*this)[0] = i0; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1) { - this->resize(2); - (*this)[0] = i0; - (*this)[1] = i1; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { - this->resize(3); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { - this->resize(4); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - } - EIGEN_DEVICE_FUNC explicit VSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { - this->resize(5); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - (*this)[4] = i4; - } - - VSizes& operator = (const std::vector& other) { - *static_cast(this) = other; - return *this; - } - - // A constexpr would be so much better here - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::tensor_vsize_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - - -// Boilerplate -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex array_prod(const VSizes& sizes) { - DenseIndex total_size = 1; - for (int i = 0; i < sizes.size(); ++i) { - total_size *= sizes[i]; - } - return total_size; -} -} namespace internal { @@ -381,8 +333,8 @@ static const size_t value = Sizes::count; template 
struct array_size > { static const size_t value = Sizes::count; }; - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes) { - return get::Base>::value; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { + return get >::value; } #else template struct array_size > { @@ -412,17 +364,17 @@ struct sizes_match_up_to_dim { } }; -template -bool dimensions_match(Dims1& dims1, Dims2& dims2) { - if (array_size::value != array_size::value) { - return false; - } - return sizes_match_up_to_dim::value-1>::run(dims1, dims2); -} - } // end namespace internal +template +bool dimensions_match(Dims1& dims1, Dims2& dims2) { + if (internal::array_size::value != internal::array_size::value) { + return false; + } + return internal::sizes_match_up_to_dim::value-1>::run(dims1, dims2); +} + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H From 703c526355c929cc6c422b7599ecfed57642e988 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:31:52 -0800 Subject: [PATCH 154/214] Misc improvements --- .../Eigen/CXX11/src/Core/util/CXX11Meta.h | 20 +++-- .../CXX11/src/Core/util/CXX11Workarounds.h | 12 ++- .../CXX11/src/Core/util/EmulateCXX11Meta.h | 81 +++++++++++++++---- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 1e6b97ce4..36d91e780 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. * typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; @@ -370,6 +370,14 @@ constexpr inline auto array_prod(std::array arr) -> decltype(array_reduce< return array_reduce(arr); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + /* zip an array */ template diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index e30eb6ad8..a590cf4e1 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -48,15 +48,13 @@ namespace internal { * - libstdc++ from version 4.7 onwards has it nevertheless, * so use that * - libstdc++ older versions: use _M_instance 
directly - * - libc++ from version 3.4 onwards has it IF compiled with - * -std=c++1y - * - libc++ older versions or -std=c++11: use __elems_ directly + * - libc++ all versions so far: use __elems_ directly * - all other libs: use std::get to be portable, but * this may not be constexpr */ #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 #define STD_GET_ARR_HACK a._M_instance[I] -#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11) +#elif defined(_LIBCPP_VERSION) #define STD_GET_ARR_HACK a.__elems_[I] #else #define STD_GET_ARR_HACK std::template get(a) @@ -70,14 +68,14 @@ template constexpr inline T& array_get(std::vector template constexpr inline T&& array_get(std::vector&& a) { return a[I]; } template constexpr inline T const& array_get(std::vector const& a) { return a[I]; } - #undef STD_GET_ARR_HACK template struct array_size; -template struct array_size > { +template struct array_size > { static const size_t value = N; }; -template struct array_size > { +template struct array_size; +template struct array_size > { static const size_t value = N; }; diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index e45d0a3b1..494f95690 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -29,7 +29,7 @@ template class array { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() { } - EIGEN_DEVICE_FUNC + explicit EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v) { EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) values[0] = v; @@ -106,6 +106,7 @@ template class array { #ifdef EIGEN_HAS_VARIADIC_TEMPLATES array(std::initializer_list l) { + eigen_assert(l.size() == n); std::copy(l.begin(), l.end(), values); } #endif @@ -211,6 +212,29 @@ template struct gen_numeric_list_repeated { template struct get; +template +struct get +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get > +{ + get() { eigen_assert(false && "index overflow"); } + typedef void type; + static const char value = '\0'; +}; + +template +struct get<0, type_list > +{ + typedef typename Head::type type; + static const type value = Head::value; +}; + template struct get<0, type_list > { @@ -221,10 +245,11 @@ struct get<0, type_list > template struct get > { - typedef typename get::type type; + typedef typename Tail::HeadType::type type; static const type value = get::value; }; + template struct arg_prod { static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod::value; }; @@ -354,23 +379,51 @@ struct greater_equal_zero_op { template -inline bool array_apply_and_reduce(const array& a) { - EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE) - bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); - for (size_t i = 2; i < N; ++i) { - result = Reducer::run(result, Op::run(a[i])); +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + bool result = Reducer::run(Op::run(a[0]), Op::run(a[1])); + for (size_t i = 2; i < N; ++i) { + result = Reducer::run(result, Op::run(a[i])); + } + return result; } - return result; +}; + +template +struct ArrayApplyAndReduce { + static inline bool run(const array& a) { + return Op::run(a[0]); + } +}; + +template +inline bool array_apply_and_reduce(const array& a) { + return ArrayApplyAndReduce::run(a); } 
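// A minimal usage sketch for the wrapper above, assuming the logical_and_op
// and greater_equal_zero_op helpers defined alongside it in this file
// (the trailing type and size parameters are deduced from the argument):
//   array<int, 3> dims;
//   dims[0] = 2; dims[1] = 3; dims[2] = 5;
//   bool all_non_negative =
//       array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(dims);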
template
-inline bool array_zip_and_reduce(const array& a, const array& b) {
-  EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
-  for (size_t i = 2; i < N; ++i) {
-    result = Reducer::run(result, Op::run(a[i], b[i]));
+struct ArrayZipAndReduce {
+  static inline bool run(const array& a, const array& b) {
+    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
+    for (size_t i = 2; i < N; ++i) {
+      result = Reducer::run(result, Op::run(a[i], b[i]));
+    }
+    return result;
   }
-  return result;
+};
+
+template
+struct ArrayZipAndReduce {
+  static inline bool run(const array& a, const array& b) {
+    return Op::run(a[0], b[0]);
+  }
+};
+
+template
+inline bool array_zip_and_reduce(const array& a, const array& b) {
+  return ArrayZipAndReduce::run(a, b);
 }

 } // end namespace internal

From 8a382aa119274efd2eb73b822ae7cd2afa128cc5 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 14 Jan 2015 15:33:11 -0800
Subject: [PATCH 155/214] Improved the resizing of tensors

---
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index aaec39756..dfe85602a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -114,16 +114,12 @@ class TensorStorage& dimensions() const {return m_dimensions;}

-    void conservativeResize(DenseIndex size, const array& nbDimensions)
-    {
-      m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, internal::array_prod(m_dimensions));
-      m_dimensions = nbDimensions;
-    }
     void resize(DenseIndex size, const array& nbDimensions)
     {
-      if(size != internal::array_prod(m_dimensions))
+      const DenseIndex currentSz = internal::array_prod(m_dimensions);
+      if(size != currentSz)
       {
-        internal::conditional_aligned_delete_auto(m_data, internal::array_prod(m_dimensions));
+        internal::conditional_aligned_delete_auto(m_data, currentSz);
         if (size)
           m_data = internal::conditional_aligned_new_auto(size);
         else
@@ -139,8 +135,6 @@ class TensorStorage
Date: Wed, 14 Jan 2015 15:34:50 -0800
Subject: [PATCH 156/214] Ensured that each thread has its own copy of the
 TensorEvaluator: this avoids race conditions when the evaluator calls a
 non-thread-safe functor, e.g. when generating random numbers.
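A minimal sketch of the race this change avoids, using plain std::thread rather than Eigen's thread-pool device (the Evaluator type and the block sizes below are illustrative only): passing the evaluator to each worker by value copies any stateful functor, such as a random generator, instead of letting every thread mutate one shared instance.

#include <thread>
#include <vector>

struct Evaluator {  // stand-in for a TensorEvaluator that owns a stateful functor
  unsigned rng_state;
  void evalScalar(int /*i*/) { rng_state = rng_state * 1664525u + 1013904223u; }
};

// Taking the evaluator by value gives every worker its own copy of rng_state.
static void eval_range(Evaluator ev, int first, int last) {
  for (int i = first; i < last; ++i) ev.evalScalar(i);
}

int main() {
  Evaluator ev = {42u};
  std::vector<std::thread> workers;
  for (int b = 0; b < 4; ++b)
    workers.emplace_back(eval_range, ev, b * 256, (b + 1) * 256);  // copies ev per thread
  for (std::thread& w : workers) w.join();
  return 0;
}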
---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f27f643c1..d93fdd907 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -77,17 +77,17 @@ class TensorExecutor
 #ifdef EIGEN_USE_THREADS
 template
 struct EvalRange {
-  static void run(Evaluator* evaluator, const Index first, const Index last) {
+  static void run(Evaluator evaluator, const Index first, const Index last) {
     eigen_assert(last > first);
     for (Index i = first; i < last; ++i) {
-      evaluator->evalScalar(i);
+      evaluator.evalScalar(i);
     }
   }
 };

 template
 struct EvalRange {
-  static void run(Evaluator* evaluator, const Index first, const Index last) {
+  static void run(Evaluator evaluator, const Index first, const Index last) {
     eigen_assert(last > first);

     Index i = first;
@@ -96,12 +96,12 @@ struct EvalRange {
       eigen_assert(first % PacketSize == 0);
       Index lastPacket = last - (last % PacketSize);
       for (; i < lastPacket; i += PacketSize) {
-        evaluator->evalPacket(i);
+        evaluator.evalPacket(i);
       }
     }

     for (; i < last; ++i) {
-      evaluator->evalScalar(i);
+      evaluator.evalScalar(i);
     }
   }
 };
@@ -130,16 +130,17 @@ class TensorExecutor
       std::vector results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(device.enqueue(&EvalRange::run, &evaluator, i*blocksize, (i+1)*blocksize));
-      }
-
-      for (int i = 0; i < numblocks; ++i) {
-        results[i].get();
+        results.push_back(device.enqueue(&EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize));
       }

       if (numblocks * blocksize < size) {
-        EvalRange::run(&evaluator, numblocks * blocksize, size);
+        EvalRange::run(evaluator, numblocks * blocksize, size);
       }
+
+      for (int i = 0; i < numblocks; ++i) {
+        get_when_ready(&results[i]);
+      }
+
     }
     evaluator.cleanup();
   }
@@ -168,7 +169,8 @@ __launch_bounds__(1024)
   const Index PacketSize = unpacket_traits::size;
   const Index vectorized_step_size = step_size * PacketSize;
   const Index vectorized_size = (size / PacketSize) * PacketSize;
-  for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) {
+  for (Index i = first_index * PacketSize; i < vectorized_size;
+       i += vectorized_step_size) {
     eval.evalPacket(i);
   }
   for (Index i = vectorized_size + first_index; i < size; i += step_size) {
@@ -192,8 +194,7 @@ class TensorExecutor
     const int block_size = maxCudaThreadsPerBlock();
     const Index size = array_prod(evaluator.dimensions());
-    EigenMetaKernel, Index><<>>(evaluator, size);
-    assert(cudaGetLastError() == cudaSuccess);
+    LAUNCH_CUDA_KERNEL((EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
 }

From f697df723798779bc29d9f7299bb5398767d5db0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 14 Jan 2015 15:38:48 -0800
Subject: [PATCH 157/214] Improved support for RowMajor tensors. Misc fixes
 and API cleanups.
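This patch threads the storage layout through the evaluators as a first-class property. As a rough sketch of what that enables at the API level (written against the public Tensor module, not code from this patch): the layout is a template parameter of Tensor, and the swap_layout() operator added here reinterprets the same data in the opposite layout.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> cm(3, 4);  // ColMajor is the default layout
  cm.setRandom();

  // swap_layout() keeps the underlying data but reverses the dimension order
  // (the result has dimensions (4, 3)); shuffling the dimensions back
  // recovers the original logical shape in RowMajor.
  Eigen::array<int, 2> reverse;
  reverse[0] = 1;
  reverse[1] = 0;
  Eigen::Tensor<float, 2, Eigen::RowMajor> rm = cm.swap_layout().shuffle(reverse);

  return (rm.dimension(0) == 3 && rm.dimension(1) == 4) ? 0 : 1;
}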
---
 .../Eigen/CXX11/src/Tensor/TensorAssign.h | 12 +-
 .../Eigen/CXX11/src/Tensor/TensorBase.h | 315 ++++++++++++++----
 .../CXX11/src/Tensor/TensorBroadcasting.h | 166 ++++++++-
 .../Eigen/CXX11/src/Tensor/TensorChipping.h | 208 +++++++++---
 .../CXX11/src/Tensor/TensorConcatenation.h | 75 ++++-
 .../src/Tensor/TensorContractionThreadPool.h | 6 +-
 .../CXX11/src/Tensor/TensorConvolution.h | 50 +--
 .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 33 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 24 +-
 .../src/Tensor/TensorForwardDeclarations.h | 4 +-
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 142 ++++++--
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 227 +++++++++----
 .../Eigen/CXX11/src/Tensor/TensorPadding.h | 171 ++++++++--
 .../Eigen/CXX11/src/Tensor/TensorPatch.h | 46 ++-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 54 ++-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h | 175 ++++++++--
 .../Eigen/CXX11/src/Tensor/TensorTraits.h | 53 +++
 17 files changed, 1405 insertions(+), 356 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index e973c00d3..93938bd1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -33,6 +33,8 @@ struct traits >
   typedef typename RhsXprType::Nested RhsNested;
   typedef typename remove_reference::type _LhsNested;
   typedef typename remove_reference::type _RhsNested;
+  static const std::size_t NumDimensions = internal::traits::NumDimensions;
+  static const int Layout = internal::traits::Layout;

   enum {
     Flags = 0,
@@ -94,12 +96,18 @@ struct TensorEvaluator, Device>
   enum {
     IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned,
     PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess,
+    Layout = TensorEvaluator::Layout,
   };

   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
-  { }
+  {
+    EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+  }

   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
@@ -114,7 +122,7 @@ struct TensorEvaluator, Device>
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
-    eigen_assert(internal::dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
     // null value), attempt to evaluate the rhs expression in place.
Returns true iff in place diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index f451a3c99..8860f622b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -25,77 +25,118 @@ template class TensorBase { public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::Index Index; - typedef Scalar CoeffReturnType; - typedef typename internal::packet_traits::type PacketReturnType; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; - // Dimensions - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); } + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } - // Nullary operators + // Coefficient-wise nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> constant(const Scalar& value) const { - return TensorCwiseNullaryOp, const Derived> - (derived(), internal::scalar_constant_op(value)); + return nullaryExpr(internal::scalar_constant_op(value)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> random() const { - return TensorCwiseNullaryOp, const Derived>(derived()); + return nullaryExpr(internal::UniformRandomGenerator()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp random() const { - return TensorCwiseNullaryOp(derived()); + return nullaryExpr(RandomGenerator()); + } + + // Generic unary operation support. 
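  // A usage sketch for this hook, with an illustrative functor that is not
  // part of this patch; any copyable functor exposing
  // Scalar operator()(Scalar) const works:
  //   struct Clamp01 {
  //     float operator()(float x) const { return x < 0.f ? 0.f : (x > 1.f ? 1.f : x); }
  //   };
  //   Eigen::Tensor<float, 2> t(3, 4);
  //   auto clamped = t.unaryExpr(Clamp01());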
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); } // Coefficient-wise unary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator-() const { return derived(); } + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sqrt() const { return derived(); } + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - square() const { return derived(); } + square() const { + return unaryExpr(internal::scalar_square_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - inverse() const { return derived(); } + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - exp() const { return derived(); } + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log() const { return derived(); } + log() const { + return unaryExpr(internal::scalar_log_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - abs() const { return derived(); } + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> pow(Scalar exponent) const { - return TensorCwiseUnaryOp, const Derived> - (derived(), internal::scalar_pow_op(exponent)); + return unaryExpr(internal::scalar_pow_op(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::scalar_add_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((std::numeric_limits::is_signed || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_sub_op(rhs)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator * (Scalar scale) const { - return TensorCwiseUnaryOp, const Derived> - (derived(), internal::scalar_multiple_op(scale)); + operator* (Scalar rhs) const { + return unaryExpr(internal::scalar_multiple_op(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator/ (Scalar rhs) const { + // EIGEN_STATIC_ASSERT(!std::numeric_limits::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::scalar_quotient1_op(rhs)); } EIGEN_DEVICE_FUNC @@ -110,86 +151,106 @@ class TensorBase return cwiseMin(constant(threshold)); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp - unaryExpr(const CustomUnaryOp& func) const { - return TensorCwiseUnaryOp(derived(), func); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> cast() const { - return derived(); + return unaryExpr(internal::scalar_cast_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); } // Coefficient-wise binary operators. 
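  // Sketch: each named operator below is shorthand for binaryExpr with the
  // matching functor, e.g. a + b is a.binaryExpr(b, internal::scalar_sum_op<Scalar>()),
  // so a custom coefficient-wise combination only needs a functor of its own:
  //   auto blended = a.binaryExpr(b, MyLerpOp(0.25f));  // MyLerpOp is illustrative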
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator+(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_sum_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator-(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_difference_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator*(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_product_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator/(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_quotient_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_max_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); } // Comparisons and tests. 
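  // Sketch: the comparisons below yield boolean tensor expressions that
  // compose with the select() ternary defined just after them, e.g. capping
  // a tensor at 1:
  //   auto capped = (t > t.constant(1.0f)).select(t.constant(1.0f), t);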
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::less()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator<=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::less_equal()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::greater()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator>=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::greater_equal()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator==(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::equal_to()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { - return TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return binaryExpr(other.derived(), std::not_equal_to()); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Contractions. @@ -208,29 +269,72 @@ class TensorBase return TensorConvolutionOp(derived(), kernel.derived(), dims); } - // Coefficient-wise ternary operators. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { - return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); - } - // Reductions. 
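  // Sketch (illustrative shapes): reducing a rank-3 tensor over dimensions
  // {0, 2} leaves a rank-1 result; the no-argument overloads added below
  // reduce over every dimension instead:
  //   Eigen::array<int, 2> dims;
  //   dims[0] = 0; dims[1] = 2;
  //   Eigen::Tensor<float, 1> partial = t.sum(dims);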
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> sum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); } + + const TensorReductionOp, const array, const Derived> + sum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::SumReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const array, const Derived> + mean() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const array, const Derived> + prod() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); } + + const TensorReductionOp, const array, const Derived> + maximum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> + const TensorReductionOp, const Dims, const Derived> minimum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); } + + const TensorReductionOp, const array, const Derived> + minimum() const { + array in_dims; + for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i; + return TensorReductionOp, const array, const Derived>(derived(), in_dims, internal::MinReducer()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReductionOp reduce(const Dims& dims, const Reducer& reducer) const { @@ -258,17 +362,44 @@ class TensorBase template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorImagePatchOp extract_image_patches() const { - return TensorImagePatchOp(derived(), Rows, Cols, 1, 1); + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1, PADDING_SAME); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), Rows, Cols, 1, 1, padding_type); + } + + template 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index stride, const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), Rows, Cols, stride, stride, padding_type); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorImagePatchOp extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride = 1, const Index col_stride = 1) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride); + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + PADDING_SAME); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const PaddingType padding_type) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + padding_type); } // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { @@ -279,10 +410,20 @@ class TensorBase slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp(derived(), startIndices, sizes); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp chip(const Index offset) const { - return TensorChippingOp(derived(), offset); + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorPaddingOp @@ -308,21 +449,24 @@ class TensorBase protected: template friend class Tensor; + template friend class TensorVarDim; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; - template class TensorBase : public TensorBase { public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::Index Index; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; template friend class Tensor; + template friend class TensorVarDim; template friend class TensorBase; EIGEN_DEVICE_FUNC @@ -337,24 +481,43 @@ class TensorBase : public TensorBaserandom(); } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const OtherDerived& other) { - return derived() = 
TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() + other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() - other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() * other.derived(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const OtherDerived& other) { - return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return derived() = derived() / other.derived(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp reshape(const NewDimensions& newDimensions) const { @@ -365,16 +528,26 @@ class TensorBase : public TensorBase(derived(), startIndices, sizes); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp chip(const Index offset) const { - return TensorChippingOp(derived(), offset); + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { return TensorShufflingOp(derived(), shuffle); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } // Select the device on which to evaluate the expression. 
template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 8cb41aec8..ef134adf2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -30,6 +30,8 @@ struct traits > : public traits::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -91,6 +93,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -103,11 +106,20 @@ struct TensorEvaluator, Device> m_dimensions[i] = input_dims[i] * broadcast[i]; } - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } } } @@ -125,16 +137,30 @@ struct TensorEvaluator, Device> m_impl.cleanup(); } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (Layout == ColMajor) { + return coeffColMajor(index); + } else { + return coeffRowMajor(index); + } + } + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } } index -= idx * m_outputStrides[i]; } @@ -142,15 +168,59 @@ struct TensorEvaluator, Device> eigen_assert(index < m_impl.dimensions()[0]); inputIndex += index; } else { - inputIndex += (index % m_impl.dimensions()[0]); + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } } return m_impl.coeff(inputIndex); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + inputIndex += index; + } else { + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims-1]); + } + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (Layout == ColMajor) { + return packetColMajor(index); + } else { + return packetRowMajor(index); + } + } + // Ignore the LoadMode and always use unaligned loads since we can't guarantee // the alignment at compile time. 
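  // Roughly, the mapping below mirrors coeffColMajor above: the output index
  // is peeled one dimension at a time, and a broadcast dimension contributes
  // (idx % input_extent) * input_stride to the input offset, with the modulo
  // elided whenever the relevant extent is known statically.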
template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -161,10 +231,15 @@ struct TensorEvaluator, Device> Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq()(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } } index -= idx * m_outputStrides[i]; } @@ -173,7 +248,12 @@ struct TensorEvaluator, Device> eigen_assert(index < m_impl.dimensions()[0]); innermostLoc = index; } else { - innermostLoc = index % m_impl.dimensions()[0]; + if (internal::index_statically_eq()(0, 1)) { + eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } } inputIndex += innermostLoc; @@ -185,13 +265,67 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < packetSize; ++i) { - values[i] = coeff(originalIndex+i); + values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); return rslt; } } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq()(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq()(NumDims-1, 1)) { + eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndex); + for (int i = 1; i < packetSize; ++i) { + values[i] = coeffRowMajor(originalIndex+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b862a8fd3..bc336e488 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -21,34 +21,61 @@ namespace Eigen { */ namespace internal { -template +template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; }; -template +template struct eval, Eigen::Dense> { typedef const TensorChippingOp& type; }; -template +template struct nested, 1, typename eval >::type> { typedef TensorChippingOp type; }; +template +struct DimensionId +{ + DimensionId(DenseIndex dim) { + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId +{ + DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + } // end namespace internal -template +template class TensorChippingOp : public TensorBase > { public: @@ -61,34 +88,39 @@ class TensorChippingOp : public TensorBase > typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset) - : m_xpr(expr), m_offset(offset) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index offset() const { return m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + static const bool Vectorize = 
TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } protected: typename XprType::Nested m_xpr; const Index m_offset; + const internal::DimensionId m_dim; }; // Eval as rvalue -template +template struct TensorEvaluator, Device> { typedef TensorChippingOp XprType; @@ -96,41 +128,50 @@ struct TensorEvaluator, Device> static const int NumDims = NumInputDims-1; typedef typename XprType::Index Index; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. IsAligned = false, - PacketAccess = false, // not yet implemented + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { // We could also support the case where NumInputDims==1 if needed. EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumInputDims > DimId, YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); int j = 0; for (int i = 0; i < NumInputDims; ++i) { - if (i != DimId) { + if (i != m_dim.actualDim()) { m_dimensions[j] = input_dims[i]; ++j; } } - m_stride = 1; - m_inputStride = 1; - for (int i = 0; i < DimId; ++i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - m_inputStride *= input_dims[DimId]; - m_inputOffset = m_stride * op.offset(); + m_stride = 1; + m_inputStride = 1; + if (Layout == ColMajor) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -150,16 +191,52 @@ struct TensorEvaluator, Device> return m_impl.coeff(srcCoeff(index)); } - /* to be done template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - }*/ + if ((Layout == ColMajor && m_dim.actualDim() == 0) || + (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims - 1) || + (Layout == RowMajor && m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division.
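The internal::DimensionId helper behind m_dim is worth restating with its template parameters spelled out: a statically known chip dimension costs no storage and folds to a constant, while the Dynamic specialization carries the dimension at runtime behind the same actualDim() interface. A self-contained sketch under that assumption, with a plain assert standing in for eigen_assert:

#include <cassert>

typedef long DenseIndex;
const DenseIndex Dynamic = -1;  // sentinel, mirroring Eigen::Dynamic

template <DenseIndex DimId>
struct DimensionId {
  DimensionId(DenseIndex dim) { assert(dim == DimId); (void)dim; }
  DenseIndex actualDim() const { return DimId; }  // compile-time constant
};

template <>
struct DimensionId<Dynamic> {
  DimensionId(DenseIndex dim) : actual_dim(dim) { assert(dim >= 0); }
  DenseIndex actualDim() const { return actual_dim; }  // runtime value
 private:
  const DenseIndex actual_dim;
};

A compile-time chip keeps the evaluator as small as the old DimId-only template, while the Dynamic path is what makes the runtime dim argument added to the constructor above possible.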
+ eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + packetSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); - if (DimId == NumDims && result) { + if (m_dim.actualDim() == NumDims && result) { return result + m_inputOffset; } else { return NULL; @@ -170,11 +247,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if (DimId == 0) { + if ((Layout == ColMajor && m_dim.actualDim() == 0) || + (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if (DimId == NumInputDims-1) { + } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || + (Layout == RowMajor && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -192,12 +271,13 @@ struct TensorEvaluator, Device> Index m_inputOffset; Index m_inputStride; TensorEvaluator m_impl; + const internal::DimensionId m_dim; const Device& m_device; }; // Eval as lvalue -template +template struct TensorEvaluator, Device> : public TensorEvaluator, Device> { @@ -207,17 +287,17 @@ struct TensorEvaluator, Device> static const int NumDims = NumInputDims-1; typedef typename XprType::Index Index; typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; enum { IsAligned = false, - PacketAccess = false, + PacketAccess = TensorEvaluator::PacketAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; @@ -226,11 +306,45 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /* to be done template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - } */ + static const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || + (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + // m_stride is equal to 1, so let's avoid the integer division. 
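Away from the two extreme positions, a chip's output is a sequence of runs of m_stride contiguous input elements spaced m_inputStride apart, so a packet access only needs the gather/scatter fallback when it crosses a run boundary. The index arithmetic in isolation, with hypothetical names:

#include <cstddef>

// Decide whether a packet of `P` consecutive chip coefficients starting at
// `index` maps to contiguous input memory. `stride` is the length of a
// contiguous input run; `input_stride` is the distance between runs.
struct PacketMapping {
  bool contiguous;
  std::size_t input_index;  // start of the run segment; valid when contiguous
};

inline PacketMapping map_chip_packet(std::size_t index, std::size_t P,
                                     std::size_t stride,
                                     std::size_t input_stride,
                                     std::size_t input_offset) {
  const std::size_t idx = index / stride;
  const std::size_t rem = index - idx * stride;
  PacketMapping m;
  m.contiguous = (rem + P <= stride);  // packet stays inside one run
  m.input_index = idx * input_stride + input_offset + rem;
  return m;
}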
+ eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + for (int i = 0; i < packetSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || + (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + packetSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. + EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 74485b15b..fb4e7fb11 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -35,6 +35,8 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; enum { Flags = 0 }; }; @@ -103,11 +105,13 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -127,13 +131,26 @@ struct TensorEvaluator= 0; --i) { + m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1]; + m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } } } @@ -159,25 +176,49 @@ struct TensorEvaluator subs; - for (int i = NumDims - 1; i > 0; --i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; } - subs[0] = index; const Dimensions& left_dims = m_leftImpl.dimensions(); if (subs[m_axis] < left_dims[m_axis]) { - Index left_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + Index left_index; + if (Layout == ColMajor) { + left_index =
subs[0]; + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } } return m_leftImpl.coeff(left_index); } else { subs[m_axis] -= left_dims[m_axis]; const Dimensions& right_dims = m_rightImpl.dimensions(); - Index right_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + Index right_index; + if (Layout == ColMajor) { + right_index = subs[0]; + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } } return m_rightImpl.coeff(right_index); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 5851e5adc..e358e6a3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,10 +93,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 50cb10a33..aecef3313 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -144,9 +144,9 @@ template struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename internal::promote_storage_type::ret Scalar; - typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::ret Scalar; + typedef typename packet_traits::type Packet; typedef typename promote_storage_type::StorageKind, typename traits::StorageKind>::ret StorageKind; typedef typename promote_index_type::Index, @@ -155,6 +155,8 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; enum { Flags = 0, @@ -227,11 +229,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. 
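Concatenation only has to decide, per coordinate tuple, whether the index along the concatenation axis falls inside the left operand or, after rebasing, the right one; everything else is the usual flat-index/coordinate conversion in the chosen layout. A minimal sketch of that routing step, with hypothetical names:

#include <array>
#include <cstddef>

template <std::size_t N>
struct ConcatTarget {
  bool from_left;                      // which operand supplies the value
  std::array<std::size_t, N> coords;   // coordinates rebased for that operand
};

// Route an output coordinate of a concatenation along `axis`: left if
// coords[axis] < left_extent, otherwise right with the axis coordinate rebased.
template <std::size_t N>
ConcatTarget<N> route_concat(std::array<std::size_t, N> coords,
                             std::size_t axis, std::size_t left_extent) {
  ConcatTarget<N> t;
  t.from_left = coords[axis] < left_extent;
  if (!t.from_left) coords[axis] -= left_extent;
  t.coords = coords;
  return t;
}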
+ EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -389,10 +397,6 @@ struct TensorEvaluator m_inputStride; array m_outputStride; @@ -421,7 +425,7 @@ struct GetKernelSize { } }; template <> -struct GetKernelSize { +struct GetKernelSize { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { return kernelSize; } @@ -610,11 +614,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -740,19 +750,17 @@ struct TensorEvaluator indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); switch(kernel_size) { case 4: { - EigenConvolutionKernel1D, Index, InputDims, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - EigenConvolutionKernel1D, Index, InputDims, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - EigenConvolutionKernel1D, Index, InputDims, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); break; } @@ -797,11 +805,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); break; } default: { - EigenConvolutionKernel2D, Index, InputDims, 4, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -810,23 +818,21 @@ struct TensorEvaluator, Index, InputDims, 7, 4> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, 
data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - EigenConvolutionKernel2D, Index, InputDims, 7, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - EigenConvolutionKernel2D, Index, InputDims, Eigen::Dynamic, Eigen::Dynamic> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); break; } @@ -858,9 +864,7 @@ struct TensorEvaluator kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]); internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - EigenConvolutionKernel3D, Index, InputDims> <<>>(m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - cudaError_t error = cudaGetLastError(); - assert(error == cudaSuccess); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index ce9d73578..93ebbe277 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -25,11 +25,14 @@ struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
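The LAUNCH_CUDA_KERNEL macro used above replaces each raw <<<...>>> launch plus its ad-hoc cudaGetLastError check. Its definition is not part of this hunk; assuming the device wrapper exposes a stream() accessor, a plausible minimal shape would be:

// Hypothetical sketch only, not the actual Eigen definition: launch the
// kernel on the device's stream and check for launch errors in one place
// instead of after every call site. Requires <cassert> and cuda_runtime.h.
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)          \
  do {                                                                                   \
    (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__);  \
    assert(cudaGetLastError() == cudaSuccess);                                           \
  } while (false)

Note that the call sites wrap the kernel name in an extra pair of parentheses: that protects the commas inside the template argument lists from being split by the preprocessor.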
typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -60,24 +63,24 @@ class TensorEvalToOp : public TensorBase > typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } protected: typename XprType::Nested m_xpr; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; @@ -93,6 +96,8 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -103,12 +108,12 @@ struct TensorEvaluator, Device> } typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -117,7 +122,7 @@ struct TensorEvaluator, Device> m_buffer[i] = m_impl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -135,12 +140,12 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - Scalar* data() const { return NULL; } + CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_impl; const Device& m_device; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index cb14cc7f7..a9501336e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -25,11 +25,14 @@ struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; typedef typename traits::StorageKind StorageKind; typedef typename traits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; enum { Flags = 0, @@ -59,8 +62,8 @@ class TensorForcedEvalOp : public TensorBase > typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -88,6 +91,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -100,10 +104,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); - m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); - + const Index numValues = m_impl.dimensions().TotalSize(); + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. 
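Because the forced-evaluation buffer comes from a raw device allocation, the loop that follows default-constructs each coefficient with placement new before anything is assigned into it; for arithmetic types the branch is skipped. The idea reduces to something like this, assuming std::malloc in place of the device allocator:

#include <cstdlib>
#include <new>
#include <type_traits>

// Minimal sketch: allocate raw storage for n coefficients and
// default-construct them only when T is not an arithmetic type, mirroring
// the placement-new loop above. Caller owns the buffer.
template <class T>
T* allocate_eval_buffer(std::size_t n) {
  T* buf = static_cast<T*>(std::malloc(n * sizeof(T)));
  if (buf && !std::is_arithmetic<T>::value) {
    for (std::size_t i = 0; i < n; ++i) new (buf + i) T();
  }
  return buf;
}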
+ if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } typedef TensorEvalToOp EvalTo; EvalTo evalToTmp(m_buffer, m_op); internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); @@ -132,7 +142,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; const ArgType m_op; const Device& m_device; - Scalar* m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 85599ccfd..7bec2b10a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -29,9 +29,11 @@ template cla template class TensorPatchOp; template class TensorImagePatchOp; template class TensorBroadcastingOp; -template class TensorChippingOp; +template class TensorChippingOp; template class TensorReshapingOp; +template class TensorLayoutSwapOp; template class TensorSlicingOp; +template class TensorReverseOp; template class TensorPaddingOp; template class TensorShufflingOp; template class TensorStridingOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 0dfb6913b..585ebc778 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -37,6 +37,8 @@ struct traits > : public traits typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; }; template @@ -53,8 +55,6 @@ struct nested, 1, typename eval class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> { @@ -69,9 +69,11 @@ class TensorImagePatchOp : public TensorBase::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides) + DenseIndex row_strides, DenseIndex col_strides, + PaddingType padding_type) : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides){} + m_row_strides(row_strides), m_col_strides(col_strides), + m_padding_type(padding_type) {} EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; } @@ -81,6 +83,8 @@ class TensorImagePatchOp : public TensorBase::type& @@ -92,6 +96,7 @@ class TensorImagePatchOp : public TensorBase, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = NumDims == 5, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { + // Only column major tensors are supported for now. + EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // We only support same strides for both dimensions and square patches. 
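The image-patch constructor below derives the extraction geometry from the padding mode: PADDING_VALID keeps only windows that fit entirely inside the input, while PADDING_SAME keeps ceil(input/stride) outputs and splits the implied overhang evenly between the two borders. Reduced to one dimension, with hypothetical names:

#include <cmath>
#include <cstddef>

enum PaddingType { PADDING_VALID, PADDING_SAME };

struct Extent1D {
  std::size_t out;           // number of patch positions
  std::ptrdiff_t pad_before; // implicit leading padding (<= 0 for VALID)
};

// Output size and leading padding for one dimension, assuming patch <= in,
// mirroring the PADDING_VALID / PADDING_SAME cases above.
inline Extent1D patch_extent(std::size_t in, std::size_t patch,
                             std::size_t stride, PaddingType mode) {
  Extent1D e;
  if (mode == PADDING_VALID) {
    e.out = static_cast<std::size_t>(
        std::ceil((in - patch + 1.0) / static_cast<double>(stride)));
  } else {  // PADDING_SAME
    e.out = static_cast<std::size_t>(
        std::ceil(in / static_cast<double>(stride)));
  }
  // Total overhang split evenly, rounding the front half toward zero.
  e.pad_before = (static_cast<std::ptrdiff_t>((e.out - 1) * stride + patch) -
                  static_cast<std::ptrdiff_t>(in)) / 2;
  return e;
}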
+ eigen_assert(m_row_strides == m_col_strides); + + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast(m_row_strides)); + m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + case PADDING_SAME: + m_outputRows = ceil(m_inputRows / static_cast(m_row_strides)); + m_outputCols = ceil(m_inputCols / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2; + break; + default: + eigen_assert(false && "unexpected padding"); + } + + // Dimensions for result of extraction. + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). m_dimensions[0] = input_dims[0]; m_dimensions[1] = op.patch_rows(); m_dimensions[2] = op.patch_cols(); - m_dimensions[3] = ceilf(static_cast(input_dims[1]) / op.row_strides()) * - ceilf(static_cast(input_dims[2]) / op.col_strides()); + m_dimensions[3] = m_outputRows * m_outputCols; for (int i = 4; i < NumDims; ++i) { m_dimensions[i] = input_dims[i-1]; } + // Strides for moving the patch in various dimensions. m_colStride = m_dimensions[1]; m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; m_otherStride = m_patchStride * m_dimensions[3]; - m_inputRows = input_dims[1]; - m_inputCols = input_dims[2]; - - m_rowInputStride = input_dims[0] * op.row_strides(); - m_colInputStride = input_dims[0] * input_dims[1] * op.col_strides(); + // Strides for navigating through the input tensor. + m_rowInputStride = input_dims[0]; + m_colInputStride = input_dims[0] * input_dims[1]; m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2]; - m_rowPaddingTop = op.patch_rows() / 2; - m_colPaddingLeft = op.patch_cols() / 2; - + // Fast representations of different variables. m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); m_fastColStride = internal::TensorIntDivisor(m_colStride); - m_fastInputRows = internal::TensorIntDivisor(m_inputRows); + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); m_fastDimZero = internal::TensorIntDivisor(m_dimensions[0]); } @@ -162,26 +205,29 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - // Find the location of the first element of the patch. + // Patch index corresponding to the passed in index. const Index patchIndex = index / m_fastPatchStride; // Find the offset of the element wrt the location of the first element. const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero; + // Other ways to index this element. const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; const Index patch2DIndex = (NumDims == 4) ? 
patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colIndex = patch2DIndex / m_fastOutputRows; const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex + colOffset - m_colPaddingLeft; + // Calculate col index in the original input tensor. + const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft; if (inputCol < 0 || inputCol >= m_inputCols) { return Scalar(0); } - const Index rowIndex = patch2DIndex - colIndex * m_inputRows; // m_rowStride is always 1 + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; const Index rowOffset = patchOffset - colOffset * m_colStride; + // Calculate row index in the original input tensor. + const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop; - const Index inputRow = rowIndex + rowOffset - m_rowPaddingTop; if (inputRow < 0 || inputRow >= m_inputRows) { return Scalar(0); } @@ -214,20 +260,24 @@ struct TensorEvaluator, Device> const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - const Index colIndex = patch2DIndex / m_fastInputRows; + const Index colIndex = patch2DIndex / m_fastOutputRows; const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - const Index inputCols[2] = {colIndex + colOffsets[0] - m_colPaddingLeft, colIndex + colOffsets[1] - m_colPaddingLeft}; + // Calculate col indices in the original input tensor. + const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { // all zeros return internal::pset1(Scalar(0)); } if (inputCols[0] == inputCols[1]) { - const Index rowIndex = patch2DIndex - colIndex * m_inputRows; + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; eigen_assert(rowOffsets[0] <= rowOffsets[1]); - const Index inputRows[2] = {rowIndex + rowOffsets[0] - m_rowPaddingTop, rowIndex + rowOffsets[1] - m_rowPaddingTop}; + // Calculate row indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { // all zeros @@ -247,6 +297,43 @@ struct TensorEvaluator, Device> Scalar* data() const { return NULL; } + const TensorEvaluator& impl() const { return m_impl; } + + Index rowPaddingTop() const { return m_rowPaddingTop; } + Index colPaddingLeft() const { return m_colPaddingLeft; } + Index outputRows() const { return m_outputRows; } + Index outputCols() const { return m_outputCols; } + Index userRowStride() const { return m_row_strides; } + Index userColStride() const { return m_col_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const + { + // Location of the first element of the patch.
+ // 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches + const Index patchIndex = coords[3]; + + array inputCoords; + inputCoords[0] = coords[0]; // depth + inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop; + inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft; + inputCoords[3] = coords[4]; // batch + // If the computed coordinates are outside the original image perimeter, return 0. + if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows || + inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) { + return Scalar(0); + } + if (TensorEvaluator::CoordAccess) { + return m_impl.coeff(inputCoords); + } else { + Index inputIndex = + inputCoords[3] * m_patchInputStride + + inputCoords[2] * m_colInputStride + + inputCoords[1] * m_rowInputStride + + inputCoords[0]; + return m_impl.coeff(inputIndex); + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { @@ -264,6 +351,8 @@ struct TensorEvaluator, Device> Index m_otherStride; Index m_patchStride; Index m_colStride; + Index m_row_strides; + Index m_col_strides; internal::TensorIntDivisor m_fastOtherStride; internal::TensorIntDivisor m_fastPatchStride; internal::TensorIntDivisor m_fastColStride; @@ -275,10 +364,13 @@ struct TensorEvaluator, Device> Index m_inputRows; Index m_inputCols; + Index m_outputRows; + Index m_outputCols; + Index m_rowPaddingTop; Index m_colPaddingLeft; - internal::TensorIntDivisor m_fastInputRows; + internal::TensorIntDivisor m_fastOutputRows; internal::TensorIntDivisor m_fastDimZero; TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 33849ed3e..23b595ac3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; }; template @@ -54,8 +57,8 @@ class TensorReshapingOp : public TensorBase::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename internal::remove_const::type PacketReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; @@ -96,11 +99,17 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), 
m_dimensions(op.dimensions()) - { } + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; @@ -109,7 +118,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -127,7 +136,9 @@ struct TensorEvaluator, Device> return m_impl.template packet(index); } - Scalar* data() const { return m_impl.data(); } + CoeffReturnType* data() const { return m_impl.data(); } + + const TensorEvaluator& impl() const { return m_impl; } protected: TensorEvaluator m_impl; @@ -148,6 +159,8 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -183,11 +196,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; }; template @@ -260,6 +276,8 @@ struct TensorEvaluator, Devi // slice offsets and sizes. 
IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -270,22 +288,30 @@ struct TensorEvaluator, Devi } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } else { - m_inputStrides[0] = 1; - } - } - const Sizes& output_dims = op.sizes(); - for (int i = 0; i < NumDims; ++i) { - if (i > 0) { + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + m_outputStrides[0] = 1; + m_fastOutputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } else { - m_outputStrides[0] = 1; - m_fastOutputStrides[0] = 1; + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + m_outputStrides[NumDims-1] = 1; + m_fastOutputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -299,14 +325,23 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); if (internal::is_arithmetic::value && data && m_impl.data()) { Index contiguous_values = 1; - for (int i = 0; i < NumDims; ++i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; + if (Layout == ColMajor) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } } } // Use memcpy if it's going to be faster than using the regular evaluation. 
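The memcpy shortcut above is valid exactly for the leading coefficients of the slice that stay contiguous in the source: whole dimensions (in storage order) multiply into the run length until the first dimension the slice actually narrows. In isolation, for column-major storage and with hypothetical names:

#include <array>
#include <cstddef>

// Count how many coefficients at the start of a column-major slice are
// contiguous in the source: multiply extents until the first narrowed dim.
template <std::size_t N>
std::size_t contiguous_prefix(const std::array<std::size_t, N>& slice_dims,
                              const std::array<std::size_t, N>& input_dims) {
  std::size_t contiguous = 1;
  for (std::size_t i = 0; i < N; ++i) {
    contiguous *= slice_dims[i];
    if (slice_dims[i] != input_dims[i]) break;  // first narrowed dimension
  }
  return contiguous;
}

// e.g. slicing 4x5x6 down to 4x5x2 keeps runs of 4*5*2 = 40 contiguous
// coefficients, so the whole slice can be bulk-copied in one memcpy.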
@@ -340,16 +375,29 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; @@ -366,20 +414,44 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) + { + array inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return m_impl.coeff(inputCoords); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { Scalar* result = m_impl.data(); if (result) { Index offset = 0; - for (int i = 0; i < NumDims; ++i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i+1; j < NumDims; ++j) { - if (m_dimensions[j] > 1) { - return NULL; + if (Layout == ColMajor) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; } - offset += m_offsets[j] * m_inputStrides[j]; + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; } - break; } } return result + offset; @@ -391,12 +463,21 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i >
0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); } - inputIndex += (index + m_offsets[0]); return inputIndex; } @@ -422,6 +503,8 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -445,16 +528,29 @@ struct TensorEvaluator, Device> const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); } - inputIndices[0] += (indices[0] + this->m_offsets[0]); - inputIndices[1] += (indices[1] + this->m_offsets[0]); if (inputIndices[1] - inputIndices[0] == packetSize - 1) { this->m_impl.template writePacket(inputIndices[0], x); } @@ -468,6 +564,15 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array& coords) + { + array inputCoords; + for (int i = 0; i < NumDims; ++i) { + inputCoords[i] = coords[i] + this->m_offsets[i]; + } + return this->m_impl.coeffRef(inputCoords); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index d6347b054..9b14e01f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type
Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -88,6 +91,8 @@ struct TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = true, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -99,13 +104,23 @@ struct TensorEvaluator, Device m_dimensions[i] += m_padding[i].first + m_padding[i].second; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + if (Layout == ColMajor) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -126,23 +141,84 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { return Scalar(0); } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; + inputIndex += (index - m_padding[0].first); + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + return Scalar(0); + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (index < m_padding[NumDims-1].first || + index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + return Scalar(0); + } + inputIndex += (index - m_padding[NumDims-1].first); } - if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { - return Scalar(0); - } - inputIndex += (index - m_padding[0].first); return m_impl.coeff(inputIndex); } template EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    if (Layout == ColMajor) {
+      return packetColMajor(index);
+    }
+    return packetRowMajor(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
+  {
+    Index inputIndex;
+    if (Layout == ColMajor) {
+      const Index idx = coords[0];
+      if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
+        return Scalar(0);
+      }
+      inputIndex = idx - m_padding[0].first;
+      for (int i = 1; i < NumDims; ++i) {
+        const Index idx = coords[i];
+        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+          return Scalar(0);
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+      }
+    } else {
+      const Index idx = coords[NumDims-1];
+      if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
+        return Scalar(0);
+      }
+      inputIndex = idx - m_padding[NumDims-1].first;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        const Index idx = coords[i];
+        if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
+          return Scalar(0);
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+      }
+    }
+    return m_impl.coeff(inputIndex);
+  }
+
+  Scalar* data() const { return NULL; }
+
+ protected:
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
   {
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
@@ -200,9 +276,64 @@ struct TensorEvaluator<TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     return packetWithPossibleZero(initialIndex);
   }

-  Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());

-  protected:
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index first = index;
+      const Index last = index + packetSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
+      const Index lastPaddedRight = m_outputStrides[i];
+
+      if (last < lastPaddedLeft) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(Scalar(0));
+      }
+      else if (first >= firstPaddedRight && last < lastPaddedRight) {
+        // all the coefficients are in the padding zone.
+        return internal::pset1<PacketReturnType>(Scalar(0));
+      }
+      else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+        // all the coefficients are between the two padding zones.
+        const Index idx = index / m_outputStrides[i+1];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i+1];
+      }
+      else {
+        // Every other case.
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index last = index + packetSize - 1;
+    const Index first = index;
+    const Index lastPaddedLeft = m_padding[NumDims-1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims-1];
+
+    if (last < lastPaddedLeft) {
+      // all the coefficients are in the padding zone.
+      return internal::pset1<PacketReturnType>(Scalar(0));
+    }
+    else if (first >= firstPaddedRight && last < lastPaddedRight) {
+      // all the coefficients are in the padding zone.
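+      // To make the zone test above and below concrete: for the innermost
+      // row-major dimension of size d with padding (p_first, p_second), the
+      // residual index decomposes into three intervals:
+      //   [0, p_first)            -> left padding, packet of zeros
+      //   [p_first, d - p_second) -> payload, loaded shifted by p_first
+      //   [d - p_second, d)       -> right padding, packet of zeros
+      // A packet that straddles two intervals falls back to the scalar
+      // path in packetWithPossibleZero().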
+      return internal::pset1<PacketReturnType>(Scalar(0));
+    }
+    else if (first >= lastPaddedLeft && last < firstPaddedRight) {
+      // all the coefficients are between the two padding zones.
+      inputIndex += (index - m_padding[NumDims-1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case.
+    return packetWithPossibleZero(initialIndex);
+  }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index e2fe32d67..1c03d202f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -24,11 +24,14 @@ template<typename PatchDim, typename XprType>
 struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
 {
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::Index Index;
+  typedef traits<XprType> XprTraits;
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions + 1;
+  static const int Layout = XprTraits::Layout;
 };

 template<typename PatchDim, typename XprType>
@@ -89,11 +92,16 @@ struct TensorEvaluator<TensorPatchOp<PatchDim, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-  };
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = true,
+  };

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
+    // Only column major tensors are supported for now.
+    EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
     Index num_patches = 1;
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const PatchDim& patch_dims = op.patch_dims();
@@ -195,6 +203,35 @@ struct TensorEvaluator<TensorPatchOp<PatchDim, ArgType>, Device>
     }
   }

+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
+  {
+    // Location of the first element of the patch.
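+    // The last output coordinate selects the patch; decomposing it against
+    // m_patchStrides yields, for each dimension, the offset of the patch
+    // origin inside the input, to which the intra-patch coordinate is then
+    // added. (Column major only; see the static assertion in the
+    // constructor.)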
+    Index patchIndex = coords[NumDims - 1];
+
+    if (TensorEvaluator<ArgType, Device>::CoordAccess) {
+      array<Index, NumDims-1> inputCoords;
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = coords[i];
+        inputCoords[i] = offsetIdx + patchIdx;
+      }
+      inputCoords[0] = (patchIndex + coords[0]);
+      return m_impl.coeff(inputCoords);
+    }
+    else {
+      Index inputIndex = 0;
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = coords[i];
+        inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+      }
+      inputIndex += (patchIndex + coords[0]);
+      return m_impl.coeff(inputIndex);
+    }
+  }
+
   Scalar* data() const { return NULL; }

 protected:
@@ -206,7 +243,6 @@ struct TensorEvaluator<TensorPatchOp<PatchDim, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
 };

-
 } // end namespace Eigen

 #endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 831a9f005..ab5fc6a69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -24,11 +24,14 @@ template<typename Shuffle, typename XprType>
 struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
 {
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::Index Index;
+  typedef traits<XprType> XprTraits;
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
   typedef typename XprType::Nested Nested;
   typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
 };

 template<typename Shuffle, typename XprType>
@@ -99,6 +102,8 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
   };

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -112,15 +117,22 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
     array<Index, NumDims> inputStrides;

-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-      } else {
-        inputStrides[0] = 1;
-        m_outputStrides[0] = 1;
+    if (Layout == ColMajor) {
+      inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      inputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
       }
     }
+
+    for (int i = 0; i < NumDims; ++i) {
       m_inputStrides[i] = inputStrides[shuffle[i]];
     }
@@ -162,15 +174,23 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   Scalar* data() const { return NULL; }

 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
     Index inputIndex = 0;
-    for (int i = NumDims - 1; i > 0; --i) {
-      const Index idx = index / m_outputStrides[i];
-      inputIndex += idx * m_inputStrides[i];
-      index -= idx * m_outputStrides[i];
+    if 
(Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; } - return inputIndex + index * m_inputStrides[0]; } Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index ecfdb762c..2fbdfadfe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -24,11 +24,14 @@ template struct traits > : public traits { typedef typename XprType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; + typedef traits XprTraits; + typedef typename packet_traits::type Packet; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; }; template @@ -98,6 +101,8 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -109,14 +114,25 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_inputStrides[i-1] *= op.strides()[i-1]; + if (Layout == ColMajor) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; } - m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; } typedef typename XprType::Scalar Scalar; @@ -135,14 +151,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - inputIndex += idx * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += index * m_inputStrides[0]; - return m_impl.coeff(inputIndex); + return m_impl.coeff(srcCoeff(index)); } template @@ -154,16 +163,29 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = 
{index, index + packetSize - 1}; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStrides[i]; - const Index idx1 = indices[1] / m_outputStrides[i]; - inputIndices[0] += idx0 * m_inputStrides[i]; - inputIndices[1] += idx1 * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += indices[1] * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; } - inputIndices[0] += indices[0] * m_inputStrides[0]; - inputIndices[1] += indices[1] * m_inputStrides[0]; if (inputIndices[1] - inputIndices[0] == packetSize - 1) { PacketReturnType rslt = m_impl.template packet(inputIndices[0]); return rslt; @@ -183,6 +205,27 @@ struct TensorEvaluator, Device> Scalar* data() const { return NULL; } protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; @@ -190,6 +233,84 @@ struct TensorEvaluator, Device> }; +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_DEFAULT Scalar values[packetSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 5c0f78489..022d20360 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -50,6 +50,8 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, Flags = compute_tensor_flags::ret | LvalueBit, @@ -63,6 +65,8 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + static const int NumDimensions = array_size::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; enum { Options = Options_, Flags = compute_tensor_flags::ret | LvalueBit, @@ -78,6 +82,8 @@ struct traits > typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; enum { Options = Options_, Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0), @@ -92,6 +98,8 @@ struct traits > typedef typename BaseTraits::Scalar Scalar; typedef typename BaseTraits::StorageKind StorageKind; typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; enum { Options = BaseTraits::Options, Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? 
AlignedBit : 0), @@ -198,6 +206,51 @@ struct nested, 1, typename eval Date: Wed, 14 Jan 2015 15:43:38 -0800 Subject: [PATCH 158/214] Updated the list of include files --- unsupported/Eigen/CXX11/Tensor | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index aa26e5283..34107ae71 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -30,13 +30,20 @@ #include #include +#if __cplusplus > 199711 +#include +#endif + #ifdef EIGEN_USE_THREADS #include #endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#ifdef EIGEN_USE_GPU +#include +#if defined(__CUDACC__) #include #endif +#endif #include "Eigen/Core" @@ -44,6 +51,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" @@ -55,15 +63,17 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" @@ -77,7 +87,6 @@ #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" - #include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" From b5124e7cfda27ed99dcfcec8cb1b674efa1ef4a3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 15:46:04 -0800 Subject: [PATCH 159/214] Created many additional tests --- unsupported/test/CMakeLists.txt | 13 +- unsupported/test/cxx11_tensor_assign.cpp | 73 +++ .../test/cxx11_tensor_broadcasting.cpp | 86 +++- unsupported/test/cxx11_tensor_chipping.cpp | 183 +++++-- .../test/cxx11_tensor_concatenation.cpp | 34 +- .../test/cxx11_tensor_contract_cuda.cpp | 121 +++++ unsupported/test/cxx11_tensor_contraction.cpp | 221 +++++--- unsupported/test/cxx11_tensor_cuda.cpp | 474 ++++++++++++++++++ unsupported/test/cxx11_tensor_device.cpp | 116 ++--- unsupported/test/cxx11_tensor_dimension.cpp | 9 +- unsupported/test/cxx11_tensor_expr.cpp | 40 ++ unsupported/test/cxx11_tensor_forced_eval.cpp | 27 + 
unsupported/test/cxx11_tensor_image_patch.cpp | 206 +++++++- unsupported/test/cxx11_tensor_map.cpp | 7 +- unsupported/test/cxx11_tensor_morphing.cpp | 145 ++++-- unsupported/test/cxx11_tensor_of_strings.cpp | 54 +- unsupported/test/cxx11_tensor_padding.cpp | 23 +- unsupported/test/cxx11_tensor_patch.cpp | 17 + unsupported/test/cxx11_tensor_reduction.cpp | 287 +++++++++-- unsupported/test/cxx11_tensor_shuffling.cpp | 28 +- unsupported/test/cxx11_tensor_simple.cpp | 3 + unsupported/test/cxx11_tensor_striding.cpp | 38 +- unsupported/test/cxx11_tensor_thread_pool.cpp | 72 +-- 23 files changed, 1909 insertions(+), 368 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_contract_cuda.cpp create mode 100644 unsupported/test/cxx11_tensor_cuda.cpp diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 89c651804..9f44e47f9 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -99,7 +99,7 @@ if(EIGEN_TEST_CXX11) # older compiler that don't support cxx11. ei_add_test(cxx11_meta "-std=c++0x") ei_add_test(cxx11_tensor_simple "-std=c++0x") - ei_add_test(cxx11_tensor_symmetry "-std=c++0x") +# ei_add_test(cxx11_tensor_symmetry "-std=c++0x") ei_add_test(cxx11_tensor_assign "-std=c++0x") ei_add_test(cxx11_tensor_dimension "-std=c++0x") ei_add_test(cxx11_tensor_index_list "-std=c++0x") @@ -126,8 +126,17 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_reduction "-std=c++0x") ei_add_test(cxx11_tensor_shuffling "-std=c++0x") ei_add_test(cxx11_tensor_striding "-std=c++0x") -# ei_add_test(cxx11_tensor_device "-std=c++0x") ei_add_test(cxx11_tensor_thread_pool "-std=c++0x") ei_add_test(cxx11_tensor_ref "-std=c++0x") + ei_add_test(cxx11_tensor_random "-std=c++0x") + ei_add_test(cxx11_tensor_casts "-std=c++0x") + ei_add_test(cxx11_tensor_reverse "-std=c++0x") + ei_add_test(cxx11_tensor_layout_swap "-std=c++0x") ei_add_test(cxx11_tensor_io "-std=c++0x") + + # These tests needs nvcc +# ei_add_test(cxx11_tensor_device "-std=c++0x") +# ei_add_test(cxx11_tensor_cuda "-std=c++0x") +# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x") + endif() diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index 0ac3f9bf9..d16aaf847 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -285,6 +285,78 @@ static void test_compound_assign() } } +static void test_std_initializers_tensor() { +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES + Tensor a(3); + a.setValues({0, 1, 2}); + VERIFY_IS_EQUAL(a(0), 0); + VERIFY_IS_EQUAL(a(1), 1); + VERIFY_IS_EQUAL(a(2), 2); + + // It fills the top-left slice. + a.setValues({10, 20}); + VERIFY_IS_EQUAL(a(0), 10); + VERIFY_IS_EQUAL(a(1), 20); + VERIFY_IS_EQUAL(a(2), 2); + + // Chaining. + Tensor a2(3); + a2 = a.setValues({100, 200, 300}); + VERIFY_IS_EQUAL(a(0), 100); + VERIFY_IS_EQUAL(a(1), 200); + VERIFY_IS_EQUAL(a(2), 300); + VERIFY_IS_EQUAL(a2(0), 100); + VERIFY_IS_EQUAL(a2(1), 200); + VERIFY_IS_EQUAL(a2(2), 300); + + Tensor b(2, 3); + b.setValues({{0, 1, 2}, {3, 4, 5}}); + VERIFY_IS_EQUAL(b(0, 0), 0); + VERIFY_IS_EQUAL(b(0, 1), 1); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 3); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + // It fills the top-left slice. 
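+  // A nested initializer list smaller than the tensor only overwrites the
+  // leading coefficients along each dimension; coefficients outside the
+  // list, such as b(0, 2) and b(1, 1) here, keep their previous values.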
+ b.setValues({{10, 20}, {30}}); + VERIFY_IS_EQUAL(b(0, 0), 10); + VERIFY_IS_EQUAL(b(0, 1), 20); + VERIFY_IS_EQUAL(b(0, 2), 2); + VERIFY_IS_EQUAL(b(1, 0), 30); + VERIFY_IS_EQUAL(b(1, 1), 4); + VERIFY_IS_EQUAL(b(1, 2), 5); + + Eigen::Tensor c(3, 2, 4); + c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}}, + {{10, 11, 12, 13}, {14, 15, 16, 17}}, + {{20, 21, 22, 23}, {24, 25, 26, 27}}}); + VERIFY_IS_EQUAL(c(0, 0, 0), 0); + VERIFY_IS_EQUAL(c(0, 0, 1), 1); + VERIFY_IS_EQUAL(c(0, 0, 2), 2); + VERIFY_IS_EQUAL(c(0, 0, 3), 3); + VERIFY_IS_EQUAL(c(0, 1, 0), 4); + VERIFY_IS_EQUAL(c(0, 1, 1), 5); + VERIFY_IS_EQUAL(c(0, 1, 2), 6); + VERIFY_IS_EQUAL(c(0, 1, 3), 7); + VERIFY_IS_EQUAL(c(1, 0, 0), 10); + VERIFY_IS_EQUAL(c(1, 0, 1), 11); + VERIFY_IS_EQUAL(c(1, 0, 2), 12); + VERIFY_IS_EQUAL(c(1, 0, 3), 13); + VERIFY_IS_EQUAL(c(1, 1, 0), 14); + VERIFY_IS_EQUAL(c(1, 1, 1), 15); + VERIFY_IS_EQUAL(c(1, 1, 2), 16); + VERIFY_IS_EQUAL(c(1, 1, 3), 17); + VERIFY_IS_EQUAL(c(2, 0, 0), 20); + VERIFY_IS_EQUAL(c(2, 0, 1), 21); + VERIFY_IS_EQUAL(c(2, 0, 2), 22); + VERIFY_IS_EQUAL(c(2, 0, 3), 23); + VERIFY_IS_EQUAL(c(2, 1, 0), 24); + VERIFY_IS_EQUAL(c(2, 1, 1), 25); + VERIFY_IS_EQUAL(c(2, 1, 2), 26); + VERIFY_IS_EQUAL(c(2, 1, 3), 27); +#endif // EIGEN_HAS_VARIADIC_TEMPLATES +} void test_cxx11_tensor_assign() { @@ -294,4 +366,5 @@ void test_cxx11_tensor_assign() CALL_SUBTEST(test_same_type()); CALL_SUBTEST(test_auto_resize()); CALL_SUBTEST(test_compound_assign()); + CALL_SUBTEST(test_std_initializers_tensor()); } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 9663912a4..f0792bdcf 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_broadcasting() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array broadcasts; broadcasts[0] = 1; @@ -23,7 +24,7 @@ static void test_simple_broadcasting() broadcasts[2] = 1; broadcasts[3] = 1; - Tensor no_broadcast; + Tensor no_broadcast; no_broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2); @@ -45,7 +46,7 @@ static void test_simple_broadcasting() broadcasts[1] = 3; broadcasts[2] = 1; broadcasts[3] = 4; - Tensor broadcast; + Tensor broadcast; broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(broadcast.dimension(0), 4); @@ -65,16 +66,17 @@ static void test_simple_broadcasting() } +template static void test_vectorized_broadcasting() { - Tensor tensor(8,3,5); + Tensor tensor(8,3,5); tensor.setRandom(); array broadcasts; broadcasts[0] = 2; broadcasts[1] = 3; broadcasts[2] = 4; - Tensor broadcast; + Tensor broadcast; broadcast = tensor.broadcast(broadcasts); VERIFY_IS_EQUAL(broadcast.dimension(0), 16); @@ -107,8 +109,78 @@ static void test_vectorized_broadcasting() } +template +static void test_static_broadcasting() +{ + Tensor tensor(8,3,5); + tensor.setRandom(); + Eigen::IndexList, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; + + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 16); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 16; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k)); + } + } + } + + tensor.resize(11,3,5); + tensor.setRandom(); + broadcast = tensor.broadcast(broadcasts); + + 
VERIFY_IS_EQUAL(broadcast.dimension(0), 22); + VERIFY_IS_EQUAL(broadcast.dimension(1), 9); + VERIFY_IS_EQUAL(broadcast.dimension(2), 20); + + for (int i = 0; i < 22; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 20; ++k) { + VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k)); + } + } + } +} + + +template +static void test_fixed_size_broadcasting() +{ + // Need to add a [] operator to the Size class for this to work +#if 0 + Tensor t1(10); + t1.setRandom(); + TensorFixedSize, DataLayout> t2; + t2 = t2.constant(20.0f); + + Tensor t3 = t1 + t2.broadcast(Eigen::array{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } + + TensorMap, DataLayout> > t4(t2.data(), {{1}}); + Tensor t5 = t1 + t4.broadcast(Eigen::array{{10}}); + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t5(i), t1(i) + t2(0)); + } +#endif +} + + void test_cxx11_tensor_broadcasting() { - CALL_SUBTEST(test_simple_broadcasting()); - CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_simple_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_vectorized_broadcasting()); + CALL_SUBTEST(test_static_broadcasting()); + CALL_SUBTEST(test_static_broadcasting()); + CALL_SUBTEST(test_fixed_size_broadcasting()); + CALL_SUBTEST(test_fixed_size_broadcasting()); } diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 0027b2888..0de7bbac6 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -13,18 +13,20 @@ using Eigen::Tensor; - +template static void test_simple_chip() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); - Tensor chip1; - chip1 = tensor.chip<0>(1); + Tensor chip1; + chip1 = tensor.template chip<0>(1); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); VERIFY_IS_EQUAL(chip1.dimension(1), 5); VERIFY_IS_EQUAL(chip1.dimension(2), 7); VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { @@ -35,7 +37,7 @@ static void test_simple_chip() } } - Tensor chip2 = tensor.chip<1>(1); + Tensor chip2 = tensor.template chip<1>(1); VERIFY_IS_EQUAL(chip2.dimension(0), 2); VERIFY_IS_EQUAL(chip2.dimension(1), 5); VERIFY_IS_EQUAL(chip2.dimension(2), 7); @@ -50,7 +52,7 @@ static void test_simple_chip() } } - Tensor chip3 = tensor.chip<2>(2); + Tensor chip3 = tensor.template chip<2>(2); VERIFY_IS_EQUAL(chip3.dimension(0), 2); VERIFY_IS_EQUAL(chip3.dimension(1), 3); VERIFY_IS_EQUAL(chip3.dimension(2), 7); @@ -65,7 +67,7 @@ static void test_simple_chip() } } - Tensor chip4(tensor.chip<3>(5)); + Tensor chip4(tensor.template chip<3>(5)); VERIFY_IS_EQUAL(chip4.dimension(0), 2); VERIFY_IS_EQUAL(chip4.dimension(1), 3); VERIFY_IS_EQUAL(chip4.dimension(2), 5); @@ -80,7 +82,7 @@ static void test_simple_chip() } } - Tensor chip5(tensor.chip<4>(7)); + Tensor chip5(tensor.template chip<4>(7)); VERIFY_IS_EQUAL(chip5.dimension(0), 2); VERIFY_IS_EQUAL(chip5.dimension(1), 3); VERIFY_IS_EQUAL(chip5.dimension(2), 5); @@ -96,14 +98,97 @@ static void test_simple_chip() } } +template +static void test_dynamic_chip() +{ + Tensor tensor(2,3,5,7,11); + tensor.setRandom(); + Tensor chip1; + chip1 = tensor.chip(1, 0); + VERIFY_IS_EQUAL(chip1.dimension(0), 3); + VERIFY_IS_EQUAL(chip1.dimension(1), 5); + VERIFY_IS_EQUAL(chip1.dimension(2), 7); + VERIFY_IS_EQUAL(chip1.dimension(3), 11); + for (int i = 0; i < 3; ++i) { + for (int j = 
0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l)); + } + } + } + } + + Tensor chip2 = tensor.chip(1, 1); + VERIFY_IS_EQUAL(chip2.dimension(0), 2); + VERIFY_IS_EQUAL(chip2.dimension(1), 5); + VERIFY_IS_EQUAL(chip2.dimension(2), 7); + VERIFY_IS_EQUAL(chip2.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); + } + } + } + } + + Tensor chip3 = tensor.chip(2, 2); + VERIFY_IS_EQUAL(chip3.dimension(0), 2); + VERIFY_IS_EQUAL(chip3.dimension(1), 3); + VERIFY_IS_EQUAL(chip3.dimension(2), 7); + VERIFY_IS_EQUAL(chip3.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 11; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l)); + } + } + } + } + + Tensor chip4(tensor.chip(5, 3)); + VERIFY_IS_EQUAL(chip4.dimension(0), 2); + VERIFY_IS_EQUAL(chip4.dimension(1), 3); + VERIFY_IS_EQUAL(chip4.dimension(2), 5); + VERIFY_IS_EQUAL(chip4.dimension(3), 11); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); + } + } + } + } + + Tensor chip5(tensor.chip(7, 4)); + VERIFY_IS_EQUAL(chip5.dimension(0), 2); + VERIFY_IS_EQUAL(chip5.dimension(1), 3); + VERIFY_IS_EQUAL(chip5.dimension(2), 5); + VERIFY_IS_EQUAL(chip5.dimension(3), 7); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7)); + } + } + } + } +} + +template static void test_chip_in_expr() { - Tensor input1(2,3,5,7,11); + Tensor input1(2,3,5,7,11); input1.setRandom(); - Tensor input2(3,5,7,11); + Tensor input2(3,5,7,11); input2.setRandom(); - Tensor result = input1.chip<0>(0) + input2; + Tensor result = input1.template chip<0>(0) + input2; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { @@ -115,9 +200,9 @@ static void test_chip_in_expr() { } } - Tensor input3(3,7,11); + Tensor input3(3,7,11); input3.setRandom(); - Tensor result2 = input1.chip<0>(0).chip<1>(2) + input3; + Tensor result2 = input1.template chip<0>(0).template chip<1>(2) + input3; for (int i = 0; i < 3; ++i) { for (int j = 0; j < 7; ++j) { for (int k = 0; k < 11; ++k) { @@ -128,16 +213,16 @@ static void test_chip_in_expr() { } } - +template static void test_chip_as_lvalue() { - Tensor input1(2,3,5,7,11); + Tensor input1(2,3,5,7,11); input1.setRandom(); - Tensor input2(3,5,7,11); + Tensor input2(3,5,7,11); input2.setRandom(); - Tensor tensor = input1; - tensor.chip<0>(1) = input2; + Tensor tensor = input1; + tensor.template chip<0>(1) = input2; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { @@ -154,10 +239,10 @@ static void test_chip_as_lvalue() } } - Tensor input3(2,5,7,11); + Tensor input3(2,5,7,11); input3.setRandom(); tensor = input1; - tensor.chip<1>(1) = input3; + tensor.template chip<1>(1) = input3; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { @@ -174,10 +259,10 @@ static void test_chip_as_lvalue() } } - Tensor input4(2,3,7,11); + Tensor input4(2,3,7,11); input4.setRandom(); tensor = input1; - tensor.chip<2>(3) = input4; + tensor.template chip<2>(3) = input4; for (int i = 0; i < 2; ++i) { for 
(int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { @@ -194,10 +279,10 @@ static void test_chip_as_lvalue() } } - Tensor input5(2,3,5,11); + Tensor input5(2,3,5,11); input5.setRandom(); tensor = input1; - tensor.chip<3>(4) = input5; + tensor.template chip<3>(4) = input5; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { @@ -214,10 +299,10 @@ static void test_chip_as_lvalue() } } - Tensor input6(2,3,5,7); + Tensor input6(2,3,5,7); input6.setRandom(); tensor = input1; - tensor.chip<4>(5) = input6; + tensor.template chip<4>(5) = input6; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { @@ -235,47 +320,57 @@ static void test_chip_as_lvalue() } } - +template static void test_chip_raw_data() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); - typedef TensorEvaluator(3)), DefaultDevice> Evaluator4; - auto chip = Evaluator4(tensor.chip<4>(3), DefaultDevice()); + typedef TensorEvaluator(3)), DefaultDevice> Evaluator4; + auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice()); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { for (int l = 0; l < 7; ++l) { - int chip_index = i + 2 * (j + 3 * (k + 5 * l)); + int chip_index; + if (DataLayout == ColMajor) { + chip_index = i + 2 * (j + 3 * (k + 5 * l)); + } else { + chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i))); + } VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); } } } } - typedef TensorEvaluator(0)), DefaultDevice> Evaluator0; - auto chip0 = Evaluator0(tensor.chip<0>(0), DefaultDevice()); + typedef TensorEvaluator(0)), DefaultDevice> Evaluator0; + auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice()); VERIFY_IS_EQUAL(chip0.data(), static_cast(0)); - typedef TensorEvaluator(0)), DefaultDevice> Evaluator1; - auto chip1 = Evaluator1(tensor.chip<1>(0), DefaultDevice()); + typedef TensorEvaluator(0)), DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice()); VERIFY_IS_EQUAL(chip1.data(), static_cast(0)); - typedef TensorEvaluator(0)), DefaultDevice> Evaluator2; - auto chip2 = Evaluator2(tensor.chip<2>(0), DefaultDevice()); + typedef TensorEvaluator(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice()); VERIFY_IS_EQUAL(chip2.data(), static_cast(0)); - typedef TensorEvaluator(0)), DefaultDevice> Evaluator3; - auto chip3 = Evaluator3(tensor.chip<3>(0), DefaultDevice()); + typedef TensorEvaluator(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice()); VERIFY_IS_EQUAL(chip3.data(), static_cast(0)); } - void test_cxx11_tensor_chipping() { - CALL_SUBTEST(test_simple_chip()); - CALL_SUBTEST(test_chip_in_expr()); - CALL_SUBTEST(test_chip_as_lvalue()); - CALL_SUBTEST(test_chip_raw_data()); + CALL_SUBTEST(test_simple_chip()); + CALL_SUBTEST(test_simple_chip()); + CALL_SUBTEST(test_dynamic_chip()); + CALL_SUBTEST(test_dynamic_chip()); + CALL_SUBTEST(test_chip_in_expr()); + CALL_SUBTEST(test_chip_in_expr()); + CALL_SUBTEST(test_chip_as_lvalue()); + CALL_SUBTEST(test_chip_as_lvalue()); + CALL_SUBTEST(test_chip_raw_data()); + CALL_SUBTEST(test_chip_raw_data()); } diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp index 8fd4f5f80..9fdf33c16 100644 --- a/unsupported/test/cxx11_tensor_concatenation.cpp +++ b/unsupported/test/cxx11_tensor_concatenation.cpp @@ -13,15 +13,16 @@ using 
Eigen::Tensor; +template static void test_dimension_failures() { - Tensor left(2, 3, 1); - Tensor right(3, 3, 1); + Tensor left(2, 3, 1); + Tensor right(3, 3, 1); left.setRandom(); right.setRandom(); // Okay; other dimensions are equal. - Tensor concatenation = left.concatenate(right, 0); + Tensor concatenation = left.concatenate(right, 0); // Dimension mismatches. VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1)); @@ -32,33 +33,35 @@ static void test_dimension_failures() VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1)); } +template static void test_static_dimension_failure() { - Tensor left(2, 3); - Tensor right(2, 3, 1); + Tensor left(2, 3); + Tensor right(2, 3, 1); #ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE // Technically compatible, but we static assert that the inputs have same // NumDims. - Tensor concatenation = left.concatenate(right, 0); + Tensor concatenation = left.concatenate(right, 0); #endif // This can be worked around in this case. - Tensor concatenation = left + Tensor concatenation = left .reshape(Tensor::Dimensions{{2, 3, 1}}) .concatenate(right, 0); - Tensor alternative = left + Tensor alternative = left .concatenate(right.reshape(Tensor::Dimensions{{2, 3}}), 0); } +template static void test_simple_concatenation() { - Tensor left(2, 3, 1); - Tensor right(2, 3, 1); + Tensor left(2, 3, 1); + Tensor right(2, 3, 1); left.setRandom(); right.setRandom(); - Tensor concatenation = left.concatenate(right, 0); + Tensor concatenation = left.concatenate(right, 0); VERIFY_IS_EQUAL(concatenation.dimension(0), 4); VERIFY_IS_EQUAL(concatenation.dimension(1), 3); VERIFY_IS_EQUAL(concatenation.dimension(2), 1); @@ -103,8 +106,11 @@ static void test_simple_concatenation() void test_cxx11_tensor_concatenation() { - CALL_SUBTEST(test_dimension_failures()); - CALL_SUBTEST(test_static_dimension_failure()); - CALL_SUBTEST(test_simple_concatenation()); + CALL_SUBTEST(test_dimension_failures()); + CALL_SUBTEST(test_dimension_failures()); + CALL_SUBTEST(test_static_dimension_failure()); + CALL_SUBTEST(test_static_dimension_failure()); + CALL_SUBTEST(test_simple_concatenation()); + CALL_SUBTEST(test_simple_concatenation()); // CALL_SUBTEST(test_vectorized_concatenation()); } diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cpp b/unsupported/test/cxx11_tensor_contract_cuda.cpp new file mode 100644 index 000000000..9599607c6 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_cuda.cpp @@ -0,0 +1,121 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2014 Navdeep Jaitly +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
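+//
+// This test evaluates two-dimensional tensor contractions of assorted
+// sizes on the GPU and checks each result against the same contraction
+// computed on the host, requiring agreement to within 1e-4.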
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_contract_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+static void test_cuda_contraction(int m_size, int k_size, int n_size)
+{
+  cout << "Calling with (" << m_size << "," << k_size << "," << n_size << ")" << endl;
+
+  Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
+  Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
+  Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
+  Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
+      cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
+           << " vs " << t_result_gpu.data()[i] << endl;
+      assert(false);
+    }
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_contract_cuda()
+{
+  cout << "Calling contraction tests" << endl;
+  CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
+  }
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
+  }
+  for (int k = 32; k < 256; k++) {
+    CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
+    CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
+  }
+
+  int m_sizes[] = {31,  39,  63,  64,  65,
+                   127, 129, 255, 257, 511,
+                   512, 513, 1023, 1024, 1025};
+  int n_sizes[] = {31,  39,  63,  64,  65,
+                   127, 129, 255, 257, 511,
+                   512, 513, 1023, 1024, 1025};
+
+  int k_sizes[] = {31,  39,  63,  64,  65,
+                   95,  96,  127, 129, 255,
+                   257, 511, 512, 513, 1023,
+                   1024, 1025};
+
+  for (int i = 0; i < 15; i++)
+    for (int j = 0; j < 15; j++)
+      for (int k = 0; k < 17; k++) {
+        CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+        CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
+      }
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 17bd335f7..6124818fd 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -16,18 
+16,18 @@ using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; - +template static void test_evals() { - Tensor mat1(2, 3); - Tensor mat2(2, 3); - Tensor mat3(3, 2); + Tensor mat1(2, 3); + Tensor mat2(2, 3); + Tensor mat3(3, 2); mat1.setRandom(); mat2.setRandom(); mat3.setRandom(); - Tensor mat4(3,3); + Tensor mat4(3,3); mat4.setZero(); Eigen::array dims3({{DimPair(0, 0)}}); typedef TensorEvaluator Evaluator; @@ -47,7 +47,7 @@ static void test_evals() VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1)); VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2)); - Tensor mat5(2,2); + Tensor mat5(2,2); mat5.setZero(); Eigen::array dims4({{DimPair(1, 1)}}); typedef TensorEvaluator Evaluator2; @@ -62,7 +62,7 @@ static void test_evals() VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2)); VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2)); - Tensor mat6(2,2); + Tensor mat6(2,2); mat6.setZero(); Eigen::array dims6({{DimPair(1, 0)}}); typedef TensorEvaluator Evaluator3; @@ -78,16 +78,16 @@ static void test_evals() VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1)); } - +template static void test_scalar() { - Tensor vec1({6}); - Tensor vec2({6}); + Tensor vec1({6}); + Tensor vec2({6}); vec1.setRandom(); vec2.setRandom(); - Tensor scalar(1); + Tensor scalar(1); scalar.setZero(); Eigen::array dims({{DimPair(0, 0)}}); typedef TensorEvaluator Evaluator; @@ -102,16 +102,16 @@ static void test_scalar() VERIFY_IS_APPROX(scalar(0), expected); } - +template static void test_multidims() { - Tensor mat1(2, 2, 2); - Tensor mat2(2, 2, 2, 2); + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2, 2); mat1.setRandom(); mat2.setRandom(); - Tensor mat3(2, 2, 2); + Tensor mat3(2, 2, 2); mat3.setZero(); Eigen::array dims({{DimPair(1, 2), DimPair(2, 3)}}); typedef TensorEvaluator Evaluator; @@ -140,15 +140,15 @@ static void test_multidims() mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1)); } - +template static void test_holes() { - Tensor t1(2, 5, 7, 3); - Tensor t2(2, 7, 11, 13, 3); + Tensor t1(2, 5, 7, 3); + Tensor t2(2, 7, 11, 13, 3); t1.setRandom(); t2.setRandom(); Eigen::array dims({{DimPair(0, 0), DimPair(3, 4)}}); - Tensor result = t1.contract(t2, dims); + Tensor result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 5); VERIFY_IS_EQUAL(result.dimension(1), 7); VERIFY_IS_EQUAL(result.dimension(2), 7); @@ -174,16 +174,16 @@ static void test_holes() { } } - +template static void test_full_redux() { - Tensor t1(2, 2); - Tensor t2(2, 2, 2); + Tensor t1(2, 2); + Tensor t2(2, 2, 2); t1.setRandom(); t2.setRandom(); Eigen::array dims({{DimPair(0, 0), DimPair(1, 1)}}); - Tensor result = t1.contract(t2, dims); + Tensor result = t1.contract(t2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0) + t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0)); @@ -200,13 +200,13 @@ static void test_full_redux() + t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1)); } - +template static void test_contraction_of_contraction() { - Tensor t1(2, 2); - Tensor t2(2, 2); - Tensor t3(2, 2); - Tensor t4(2, 2); + Tensor t1(2, 2); + Tensor t2(2, 2); + Tensor t3(2, 2); + Tensor t4(2, 2); t1.setRandom(); t2.setRandom(); t3.setRandom(); @@ -216,30 +216,32 @@ static void test_contraction_of_contraction() auto contract1 = t1.contract(t2, dims); auto diff = t3 - contract1; auto contract2 = t1.contract(t4, dims); 
- Tensor result = contract2.contract(diff, dims); + Tensor result = contract2.contract(diff, dims); + VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 2); - Eigen::Map m1(t1.data(), 2, 2); - Eigen::Map m2(t2.data(), 2, 2); - Eigen::Map m3(t3.data(), 2, 2); - Eigen::Map m4(t4.data(), 2, 2); - Eigen::MatrixXf expected = (m1 * m4) * (m3 - m1 * m2); + Eigen::Map> + m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2), + m4(t4.data(), 2, 2); + Eigen::Matrix + expected = (m1 * m4) * (m3 - m1 * m2); + VERIFY_IS_APPROX(result(0, 0), expected(0, 0)); VERIFY_IS_APPROX(result(0, 1), expected(0, 1)); VERIFY_IS_APPROX(result(1, 0), expected(1, 0)); VERIFY_IS_APPROX(result(1, 1), expected(1, 1)); } - +template static void test_expr() { - Tensor mat1(2, 3); - Tensor mat2(3, 2); + Tensor mat1(2, 3); + Tensor mat2(3, 2); mat1.setRandom(); mat2.setRandom(); - Tensor mat3(2,2); + Tensor mat3(2,2); Eigen::array dims({{DimPair(1, 0)}}); mat3 = mat1.contract(mat2, dims); @@ -250,16 +252,16 @@ static void test_expr() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } - +template static void test_out_of_order_contraction() { - Tensor mat1(2, 2, 2); - Tensor mat2(2, 2, 2); + Tensor mat1(2, 2, 2); + Tensor mat2(2, 2, 2); mat1.setRandom(); mat2.setRandom(); - Tensor mat3(2, 2); + Tensor mat3(2, 2); Eigen::array dims({{DimPair(2, 0), DimPair(0, 2)}}); mat3 = mat1.contract(mat2, dims); @@ -295,18 +297,18 @@ static void test_out_of_order_contraction() } - +template static void test_consistency() { // this does something like testing (A*B)^T = (B^T * A^T) - Tensor mat1(4, 3, 5); - Tensor mat2(3, 2, 1, 5, 4); + Tensor mat1(4, 3, 5); + Tensor mat2(3, 2, 1, 5, 4); mat1.setRandom(); mat2.setRandom(); - Tensor mat3(5, 2, 1, 5); - Tensor mat4(2, 1, 5, 5); + Tensor mat3(5, 2, 1, 5); + Tensor mat4(2, 1, 5, 5); // contract on dimensions of size 4 and 3 Eigen::array dims1({{DimPair(0, 4), DimPair(1, 0)}}); @@ -316,27 +318,40 @@ static void test_consistency() mat4 = mat2.contract(mat1, dims2); // check that these are equal except for ordering of dimensions - for (size_t i = 0; i < 5; i++) { - for (size_t j = 0; j < 10; j++) { - VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + if (DataLayout == ColMajor) { + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]); + } + } + } else { + // Row major + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 10; j++) { + VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]); + } } } } - +template static void test_large_contraction() { - Tensor t_left(30, 50, 8, 31); - Tensor t_right(8, 31, 7, 20, 10); - Tensor t_result(30, 50, 7, 20, 10); + Tensor t_left(30, 50, 8, 31); + Tensor t_right(8, 31, 7, 20, 10); + Tensor t_result(30, 50, 7, 20, 10); t_left.setRandom(); t_right.setRandom(); - typedef Map MapXf; + // Add a little offset so that the results won't be close to zero. 
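+  // (VERIFY_IS_APPROX applies a relative-error test, which is unreliable
+  // for values near zero; random inputs centered on zero can produce
+  // near-zero sums after contracting 248 terms. Shifting both operands
+  // keeps the results safely away from zero.)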
+ t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map> MapXf; MapXf m_left(t_left.data(), 1500, 248); MapXf m_right(t_right.data(), 248, 1400); - MatrixXf m_result(1500, 1400); + Eigen::Matrix m_result(1500, 1400); // this contraction should be equivalent to a single matrix multiplication Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); @@ -351,20 +366,20 @@ static void test_large_contraction() } } - +template static void test_matrix_vector() { - Tensor t_left(30, 50); - Tensor t_right(50); - Tensor t_result(30); + Tensor t_left(30, 50); + Tensor t_right(50); + Tensor t_result(30); t_left.setRandom(); t_right.setRandom(); - typedef Map> MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 30, 50); MapXf m_right(t_right.data(), 50, 1); - Eigen::Matrix m_result(30, 1); + Eigen::Matrix m_result(30, 1); // this contraction should be equivalent to a single matrix multiplication Eigen::array dims{{DimPair(1, 0)}}; @@ -379,18 +394,19 @@ static void test_matrix_vector() } +template static void test_tensor_vector() { - Tensor t_left(7, 13, 17); - Tensor t_right(1, 7); - typedef typename Tensor::DimensionPair DimensionPair; + Tensor t_left(7, 13, 17); + Tensor t_right(1, 7); + typedef typename Tensor::DimensionPair DimensionPair; Eigen::array dim_pair01{{{0, 1}}}; - Tensor t_result = t_left.contract(t_right, dim_pair01); + Tensor t_result = t_left.contract(t_right, dim_pair01); - typedef Map> MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 7, 13*17); MapXf m_right(t_right.data(), 1, 7); - Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); + Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); @@ -398,18 +414,63 @@ static void test_tensor_vector() } +template +static void test_small_blocking_factors() +{ + Tensor t_left(30, 5, 3, 31); + Tensor t_right(3, 31, 7, 20, 1); + t_left.setRandom(); + t_right.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + // Force the cache sizes, which results in smaller blocking factors. 
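+  // (Sketch of the intent: the contraction kernel derives its blocking
+  // factors from the reported L1/L2/L3 capacities, so pretending the
+  // caches hold only 896/1920/2944 bytes forces tiny panels and exercises
+  // code paths that realistically sized caches would never hit with
+  // matrices this small.)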
+ Eigen::setCpuCacheSizes(896, 1920, 2944); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); + Tensor t_result; + t_result = t_left.contract(t_right, dims); + + // compute result using a simple eigen matrix product + Map> m_left(t_left.data(), 150, 93); + Map> m_right(t_right.data(), 93, 140); + Eigen::Matrix m_result = m_left * m_right; + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + + void test_cxx11_tensor_contraction() { - CALL_SUBTEST(test_evals()); - CALL_SUBTEST(test_scalar()); - CALL_SUBTEST(test_multidims()); - CALL_SUBTEST(test_holes()); - CALL_SUBTEST(test_full_redux()); - CALL_SUBTEST(test_contraction_of_contraction()); - CALL_SUBTEST(test_expr()); - CALL_SUBTEST(test_out_of_order_contraction()); - CALL_SUBTEST(test_consistency()); - CALL_SUBTEST(test_large_contraction()); - CALL_SUBTEST(test_matrix_vector()); - CALL_SUBTEST(test_tensor_vector()); + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_evals()); + CALL_SUBTEST(test_scalar()); + CALL_SUBTEST(test_scalar()); + CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_multidims()); + CALL_SUBTEST(test_holes()); + CALL_SUBTEST(test_holes()); + CALL_SUBTEST(test_full_redux()); + CALL_SUBTEST(test_full_redux()); + CALL_SUBTEST(test_contraction_of_contraction()); + CALL_SUBTEST(test_contraction_of_contraction()); + CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_expr()); + CALL_SUBTEST(test_out_of_order_contraction()); + CALL_SUBTEST(test_out_of_order_contraction()); + CALL_SUBTEST(test_consistency()); + CALL_SUBTEST(test_consistency()); + CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_large_contraction()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_matrix_vector()); + CALL_SUBTEST(test_tensor_vector()); + CALL_SUBTEST(test_tensor_vector()); + CALL_SUBTEST(test_small_blocking_factors()); + CALL_SUBTEST(test_small_blocking_factors()); } diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp new file mode 100644 index 000000000..059d23de1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_cuda.cpp @@ -0,0 +1,474 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// TODO(mdevin): Free the cuda memory. 
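+//
+// The tests below share one pattern (sketched here; names are
+// illustrative):
+//   float* d_in;  cudaMalloc((void**)(&d_in), bytes);
+//   cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+//   cudaStream_t stream;  cudaStreamCreate(&stream);
+//   Eigen::GpuDevice gpu_device(&stream);
+//   Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_in(d_in, size);
+//   gpu_out.device(gpu_device) = f(gpu_in);            // runs on the GPU
+//   cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost,
+//                   gpu_device.stream());
+//   cudaStreamSynchronize(gpu_device.stream());        // then verify on host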
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include + +using Eigen::Tensor; + +void test_cuda_elementwise_small() { + Tensor in1(Eigen::array(2)); + Tensor in2(Eigen::array(2)); + Tensor out(Eigen::array(2)); + in1.setRandom(); + in2.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap, Eigen::Aligned> gpu_in1( + d_in1, Eigen::array(2)); + Eigen::TensorMap, Eigen::Aligned> gpu_in2( + d_in2, Eigen::array(2)); + Eigen::TensorMap, Eigen::Aligned> gpu_out( + d_out, Eigen::array(2)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, + gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 2; ++i) { + VERIFY_IS_APPROX( + out(Eigen::array(i)), + in1(Eigen::array(i)) + in2(Eigen::array(i))); + } +} + +void test_cuda_elementwise() +{ + Tensor in1(Eigen::array(72,53,97)); + Tensor in2(Eigen::array(72,53,97)); + Tensor in3(Eigen::array(72,53,97)); + Tensor out(Eigen::array(72,53,97)); + in1.setRandom(); + in2.setRandom(); + in3.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + std::size_t in2_bytes = in2.size() * sizeof(float); + std::size_t in3_bytes = in3.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_in2; + float* d_in3; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_in2), in2_bytes); + cudaMalloc((void**)(&d_in3), in3_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_in3(d_in3, Eigen::array(72,53,97)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,53,97)); + + gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + for (int k = 0; k < 97; ++k) { + VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * in3(Eigen::array(i,j,k))); + } + } + } +} + + +void test_cuda_reduction() +{ + Tensor in1(Eigen::array(72,53,97,113)); + Tensor out(Eigen::array(72,97)); + in1.setRandom(); + + std::size_t in1_bytes = in1.size() * sizeof(float); + 
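+ // Reducing the 72x53x97x113 input over axes {1, 3} collapses the second
+ // and fourth dimensions, so maximum() yields a 72x97 result, the same
+ // shape the host verification loop below indexes. Host-side sketch of the
+ // equivalent reduction (array template arguments are assumptions):
+ //   Eigen::array<int, 2> axes; axes[0] = 1; axes[1] = 3;
+ //   Eigen::Tensor<float, 2> host_max = in1.maximum(axes);  // 72x97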
std::size_t out_bytes = out.size() * sizeof(float); + + float* d_in1; + float* d_out; + cudaMalloc((void**)(&d_in1), in1_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(72,53,97,113)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(72,97)); + + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; + + gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = 0; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = + std::max(expected, in1(Eigen::array(i, k, j, l))); + } + } + VERIFY_IS_APPROX(out(Eigen::array(i,j)), expected); + } + } +} + +template +static void test_cuda_contraction() +{ + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor t_left(Eigen::array(6, 50, 3, 31)); + Tensor t_right(Eigen::array(3, 31, 7, 20, 1)); + Tensor t_result(Eigen::array(6, 50, 7, 20, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > + gpu_t_left(d_t_left, Eigen::array(6, 50, 3, 31)); + Eigen::TensorMap > + gpu_t_right(d_t_right, Eigen::array(3, 31, 7, 20, 1)); + Eigen::TensorMap > + gpu_t_result(d_t_result, Eigen::array(6, 50, 7, 20, 1)); + + typedef Eigen::Map > MapXf; + MapXf m_left(t_left.data(), 300, 93); + MapXf m_right(t_right.data(), 93, 140); + Eigen::Matrix m_result(300, 140); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(2, 0); + dims[1] = DimPair(3, 1); + + m_result = m_left * m_right; + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + + cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + + for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { + cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl; + assert(false); + } + } +} + +static void test_cuda_convolution_1d() +{ + Tensor input(Eigen::array(74,37,11,137)); + Tensor kernel(Eigen::array(4)); + Tensor out(Eigen::array(74,34,11,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * 
sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, Eigen::array(74,37,11,137)); + Eigen::TensorMap > gpu_kernel(d_kernel, Eigen::array(4)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(74,34,11,137)); + + Eigen::array dims(1); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 34; ++j) { + for (int k = 0; k < 11; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array(i,j,k,l)); + const float expected = input(Eigen::array(i,j+0,k,l)) * kernel(Eigen::array(0)) + + input(Eigen::array(i,j+1,k,l)) * kernel(Eigen::array(1)) + + input(Eigen::array(i,j+2,k,l)) * kernel(Eigen::array(2)) + + input(Eigen::array(i,j+3,k,l)) * kernel(Eigen::array(3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_2d() +{ + Tensor input(Eigen::array(74,37,11,137)); + Tensor kernel(Eigen::array(3,4)); + Tensor out(Eigen::array(74,35,8,137)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, Eigen::array(74,37,11,137)); + Eigen::TensorMap > gpu_kernel(d_kernel, Eigen::array(3,4)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(74,35,8,137)); + + Eigen::array dims(1,2); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 137; ++l) { + const float result = out(Eigen::array(i,j,k,l)); + const float expected = input(Eigen::array(i,j+0,k+0,l)) * kernel(Eigen::array(0,0)) + + input(Eigen::array(i,j+1,k+0,l)) * kernel(Eigen::array(1,0)) + + input(Eigen::array(i,j+2,k+0,l)) * kernel(Eigen::array(2,0)) + + input(Eigen::array(i,j+0,k+1,l)) * kernel(Eigen::array(0,1)) + + input(Eigen::array(i,j+1,k+1,l)) * kernel(Eigen::array(1,1)) + + input(Eigen::array(i,j+2,k+1,l)) * kernel(Eigen::array(2,1)) + + input(Eigen::array(i,j+0,k+2,l)) * 
kernel(Eigen::array(0,2)) + + input(Eigen::array(i,j+1,k+2,l)) * kernel(Eigen::array(1,2)) + + input(Eigen::array(i,j+2,k+2,l)) * kernel(Eigen::array(2,2)) + + input(Eigen::array(i,j+0,k+3,l)) * kernel(Eigen::array(0,3)) + + input(Eigen::array(i,j+1,k+3,l)) * kernel(Eigen::array(1,3)) + + input(Eigen::array(i,j+2,k+3,l)) * kernel(Eigen::array(2,3)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } +} + + +static void test_cuda_convolution_3d() +{ + Tensor input(Eigen::array(74,37,11,137,17)); + Tensor kernel(Eigen::array(3,4,2)); + Tensor out(Eigen::array(74,35,8,136,17)); + input = input.constant(10.0f) + input.random(); + kernel = kernel.constant(7.0f) + kernel.random(); + + std::size_t input_bytes = input.size() * sizeof(float); + std::size_t kernel_bytes = kernel.size() * sizeof(float); + std::size_t out_bytes = out.size() * sizeof(float); + + float* d_input; + float* d_kernel; + float* d_out; + cudaMalloc((void**)(&d_input), input_bytes); + cudaMalloc((void**)(&d_kernel), kernel_bytes); + cudaMalloc((void**)(&d_out), out_bytes); + + cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > gpu_input(d_input, Eigen::array(74,37,11,137,17)); + Eigen::TensorMap > gpu_kernel(d_kernel, Eigen::array(3,4,2)); + Eigen::TensorMap > gpu_out(d_out, Eigen::array(74,35,8,136,17)); + + Eigen::array dims(1,2,3); + gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); + + assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 0; i < 74; ++i) { + for (int j = 0; j < 35; ++j) { + for (int k = 0; k < 8; ++k) { + for (int l = 0; l < 136; ++l) { + for (int m = 0; m < 17; ++m) { + const float result = out(Eigen::array(i,j,k,l,m)); + const float expected = input(Eigen::array(i,j+0,k+0,l+0,m)) * kernel(Eigen::array(0,0,0)) + + input(Eigen::array(i,j+1,k+0,l+0,m)) * kernel(Eigen::array(1,0,0)) + + input(Eigen::array(i,j+2,k+0,l+0,m)) * kernel(Eigen::array(2,0,0)) + + input(Eigen::array(i,j+0,k+1,l+0,m)) * kernel(Eigen::array(0,1,0)) + + input(Eigen::array(i,j+1,k+1,l+0,m)) * kernel(Eigen::array(1,1,0)) + + input(Eigen::array(i,j+2,k+1,l+0,m)) * kernel(Eigen::array(2,1,0)) + + input(Eigen::array(i,j+0,k+2,l+0,m)) * kernel(Eigen::array(0,2,0)) + + input(Eigen::array(i,j+1,k+2,l+0,m)) * kernel(Eigen::array(1,2,0)) + + input(Eigen::array(i,j+2,k+2,l+0,m)) * kernel(Eigen::array(2,2,0)) + + input(Eigen::array(i,j+0,k+3,l+0,m)) * kernel(Eigen::array(0,3,0)) + + input(Eigen::array(i,j+1,k+3,l+0,m)) * kernel(Eigen::array(1,3,0)) + + input(Eigen::array(i,j+2,k+3,l+0,m)) * kernel(Eigen::array(2,3,0)) + + input(Eigen::array(i,j+0,k+0,l+1,m)) * kernel(Eigen::array(0,0,1)) + + input(Eigen::array(i,j+1,k+0,l+1,m)) * kernel(Eigen::array(1,0,1)) + + input(Eigen::array(i,j+2,k+0,l+1,m)) * kernel(Eigen::array(2,0,1)) + + input(Eigen::array(i,j+0,k+1,l+1,m)) * kernel(Eigen::array(0,1,1)) + + input(Eigen::array(i,j+1,k+1,l+1,m)) * kernel(Eigen::array(1,1,1)) + + input(Eigen::array(i,j+2,k+1,l+1,m)) * kernel(Eigen::array(2,1,1)) + + input(Eigen::array(i,j+0,k+2,l+1,m)) * kernel(Eigen::array(0,2,1)) + + input(Eigen::array(i,j+1,k+2,l+1,m)) * kernel(Eigen::array(1,2,1)) + + input(Eigen::array(i,j+2,k+2,l+1,m)) * kernel(Eigen::array(2,2,1)) + + 
input(Eigen::array(i,j+0,k+3,l+1,m)) * kernel(Eigen::array(0,3,1)) + + input(Eigen::array(i,j+1,k+3,l+1,m)) * kernel(Eigen::array(1,3,1)) + + input(Eigen::array(i,j+2,k+3,l+1,m)) * kernel(Eigen::array(2,3,1)); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + } +} + +static float* CudaCopyFloat(float* data, int size) { + const int nbytes = size * sizeof(float); + float* result = NULL; + if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) { + return NULL; + } else { + if (data != NULL) { + cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice); + } + return result; + } +} + +static void test_cuda_constant_broadcast() +{ + cudaStream_t stream; + assert(cudaStreamCreate(&stream) == cudaSuccess); + Eigen::GpuDevice gpu_device(&stream); + + Tensor t1(10); + for (int i = 0; i < 10; ++i) { + t1(i) = 10.0f * i; + } + float* t1_cuda = CudaCopyFloat(t1.data(), t1.size()); + Eigen::TensorMap > t1_gpu(t1_cuda, 10); + + Tensor t2(1); + t2 = t2.constant(20.0f); + float* t2_cuda = CudaCopyFloat(t2.data(), t2.size()); + Eigen::TensorMap > > t2_gpu(t2_cuda, 1); + + float* t3_cuda = CudaCopyFloat(NULL, 10); + Eigen::TensorMap > t3_gpu(t3_cuda, 10); + + t3_gpu.device(gpu_device) = + t1_gpu + t2_gpu.broadcast(Eigen::array(10)); + + Eigen::Tensor t3(10); + cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < 10; ++i) { + VERIFY_IS_APPROX(t3(i), t1(i) + t2(0)); + } +} + +void test_cxx11_tensor_cuda() +{ + CALL_SUBTEST(test_cuda_elementwise_small()); + CALL_SUBTEST(test_cuda_elementwise()); + CALL_SUBTEST(test_cuda_reduction()); + CALL_SUBTEST(test_cuda_contraction()); + CALL_SUBTEST(test_cuda_contraction()); + CALL_SUBTEST(test_cuda_convolution_1d()); + CALL_SUBTEST(test_cuda_convolution_2d()); + CALL_SUBTEST(test_cuda_convolution_3d()); + CALL_SUBTEST(test_cuda_constant_broadcast()); +} diff --git a/unsupported/test/cxx11_tensor_device.cpp b/unsupported/test/cxx11_tensor_device.cpp index 26465ee11..f2d7e4ce6 100644 --- a/unsupported/test/cxx11_tensor_device.cpp +++ b/unsupported/test/cxx11_tensor_device.cpp @@ -22,23 +22,23 @@ using Eigen::RowMajor; // Context for evaluation on cpu struct CPUContext { - CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(Eigen::array(2,2)), kernel_3d_(Eigen::array(2,2,2)) { + CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) { kernel_1d_(0) = 3.14f; kernel_1d_(1) = 2.7f; - kernel_2d_(Eigen::array(0,0)) = 3.14f; - kernel_2d_(Eigen::array(1,0)) = 2.7f; - kernel_2d_(Eigen::array(0,1)) = 0.2f; - kernel_2d_(Eigen::array(1,1)) = 7.0f; + kernel_2d_(0,0) = 3.14f; + kernel_2d_(1,0) = 2.7f; + kernel_2d_(0,1) = 0.2f; + kernel_2d_(1,1) = 7.0f; - kernel_3d_(Eigen::array(0,0,0)) = 3.14f; - kernel_3d_(Eigen::array(0,1,0)) = 2.7f; - kernel_3d_(Eigen::array(0,0,1)) = 0.2f; - kernel_3d_(Eigen::array(0,1,1)) = 7.0f; - kernel_3d_(Eigen::array(1,0,0)) = -1.0f; - kernel_3d_(Eigen::array(1,1,0)) = -0.3f; - kernel_3d_(Eigen::array(1,0,1)) = -0.7f; - kernel_3d_(Eigen::array(1,1,1)) = -0.5f; + kernel_3d_(0,0,0) = 3.14f; + kernel_3d_(0,1,0) = 2.7f; + kernel_3d_(0,0,1) = 0.2f; + kernel_3d_(0,1,1) = 7.0f; + kernel_3d_(1,0,0) = -1.0f; + kernel_3d_(1,1,0) = -0.3f; + kernel_3d_(1,0,1) = -0.7f; + kernel_3d_(1,1,1) = -0.5f; } const Eigen::DefaultDevice& device() const { return cpu_device_; } @@ -93,8 +93,8 @@ struct GPUContext { const 
Eigen::TensorMap >& in2() const { return in2_; } Eigen::TensorMap >& out() { return out_; } Eigen::TensorMap > kernel1d() const { return Eigen::TensorMap >(kernel_1d_, 2); } - Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, Eigen::array(2, 2)); } - Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, Eigen::array(2, 2, 2)); } + Eigen::TensorMap > kernel2d() const { return Eigen::TensorMap >(kernel_2d_, 2, 2); } + Eigen::TensorMap > kernel3d() const { return Eigen::TensorMap >(kernel_3d_, 2, 2, 2); } private: const Eigen::TensorMap >& in1_; @@ -150,8 +150,8 @@ static void test_contraction(Context* context) template static void test_1d_convolution(Context* context) { - Eigen::DSizes indices(Eigen::array(0,0,0)); - Eigen::DSizes sizes(Eigen::array(40,49,70)); + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(40,49,70); Eigen::array dims(1); context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); @@ -160,8 +160,8 @@ static void test_1d_convolution(Context* context) template static void test_2d_convolution(Context* context) { - Eigen::DSizes indices(Eigen::array(0,0,0)); - Eigen::DSizes sizes(Eigen::array(40,49,69)); + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(40,49,69); Eigen::array dims(1,2); context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); @@ -170,8 +170,8 @@ static void test_2d_convolution(Context* context) template static void test_3d_convolution(Context* context) { - Eigen::DSizes indices(Eigen::array(0,0,0)); - Eigen::DSizes sizes(Eigen::array(39,49,69)); + Eigen::DSizes indices(0,0,0); + Eigen::DSizes sizes(39,49,69); Eigen::array dims(0,1,2); context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); @@ -179,9 +179,9 @@ static void test_3d_convolution(Context* context) static void test_cpu() { - Eigen::Tensor in1(Eigen::array(40,50,70)); - Eigen::Tensor in2(Eigen::array(40,50,70)); - Eigen::Tensor out(Eigen::array(40,50,70)); + Eigen::Tensor in1(40,50,70); + Eigen::Tensor in2(40,50,70); + Eigen::Tensor out(40,50,70); in1 = in1.random() + in1.constant(10.0f); in2 = in2.random() + in2.constant(10.0f); @@ -191,7 +191,7 @@ static void test_cpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); } } } @@ -200,7 +200,7 @@ static void test_cpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); } } } @@ -209,7 +209,7 @@ static void test_cpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); } } } @@ -217,11 +217,11 @@ static void test_cpu() { test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { - const float result = out(Eigen::array(i,j,0)); + const float result = out(i,j,0); float expected = 0; for (int k = 0; 
k < 50; ++k) { for (int l = 0; l < 70; ++l) { - expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + expected += in1(i, k, l) * in2(j, k, l); } } VERIFY_IS_APPROX(expected, result); @@ -232,7 +232,7 @@ static void test_cpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); } } } @@ -241,9 +241,9 @@ static void test_cpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { - const float result = out(Eigen::array(i,j,k)); - const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f) + - (in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) + + (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { continue; } @@ -256,11 +256,11 @@ static void test_cpu() { for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { - const float result = out(Eigen::array(i,j,k)); - const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + - in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f) + - (in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + - in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) + + (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) { continue; } @@ -271,9 +271,9 @@ static void test_cpu() { } static void test_gpu() { - Eigen::Tensor in1(Eigen::array(40,50,70)); - Eigen::Tensor in2(Eigen::array(40,50,70)); - Eigen::Tensor out(Eigen::array(40,50,70)); + Eigen::Tensor in1(40,50,70); + Eigen::Tensor in2(40,50,70); + Eigen::Tensor out(40,50,70); in1 = in1.random() + in1.constant(10.0f); in2 = in2.random() + in2.constant(10.0f); @@ -291,9 +291,9 @@ static void test_gpu() { cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - Eigen::TensorMap > gpu_in1(d_in1, Eigen::array(40,50,70)); - Eigen::TensorMap > gpu_in2(d_in2, Eigen::array(40,50,70)); - Eigen::TensorMap > gpu_out(d_out, Eigen::array(40,50,70)); + Eigen::TensorMap > gpu_in1(d_in1, 40,50,70); + Eigen::TensorMap > gpu_in2(d_in2, 40,50,70); + Eigen::TensorMap > gpu_out(d_out, 40,50,70); GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); @@ -301,7 +301,7 @@ static void test_gpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); } } } @@ -311,7 +311,7 @@ static void test_gpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k))) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), 
(in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f); } } } @@ -321,7 +321,7 @@ static void test_gpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), in1(Eigen::array(i,j,k)) + in2(Eigen::array(i,j,k)) * 3.14f + 2.718f); + VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f); } } } @@ -330,11 +330,11 @@ static void test_gpu() { assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { - const float result = out(Eigen::array(i,j,0)); + const float result = out(i,j,0); float expected = 0; for (int k = 0; k < 50; ++k) { for (int l = 0; l < 70; ++l) { - expected += in1(Eigen::array(i, k, l)) * in2(Eigen::array(j, k, l)); + expected += in1(i, k, l) * in2(j, k, l); } } VERIFY_IS_APPROX(expected, result); @@ -347,7 +347,7 @@ static void test_gpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { - VERIFY_IS_APPROX(out(Eigen::array(i,j,k)), (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f)); + VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f)); } } } @@ -358,9 +358,9 @@ static void test_gpu() { for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { - const float result = out(Eigen::array(i,j,k)); - const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + - in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f); + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f); VERIFY_IS_APPROX(expected, result); } } @@ -372,11 +372,11 @@ static void test_gpu() { for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { - const float result = out(Eigen::array(i,j,k)); - const float expected = (in1(Eigen::array(i,j,k)) * 3.14f + in1(Eigen::array(i,j+1,k)) * 2.7f + - in1(Eigen::array(i,j,k+1)) * 0.2f + in1(Eigen::array(i,j+1,k+1)) * 7.0f + - in1(Eigen::array(i+1,j,k)) * -1.0f + in1(Eigen::array(i+1,j+1,k)) * -0.3f + - in1(Eigen::array(i+1,j,k+1)) * -0.7f + in1(Eigen::array(i+1,j+1,k+1)) * -0.5f); + const float result = out(i,j,k); + const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f + + in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f + + in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f + + in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f); VERIFY_IS_APPROX(expected, result); } } diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index c806b623f..0cc4e86f7 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -16,12 +16,15 @@ using Eigen::Tensor; static void test_dynamic_size() { - Eigen::DSizes dimensions(Eigen::array{{2,3,7}}); + Eigen::DSizes dimensions(2,3,7); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7); + VERIFY_IS_EQUAL((int)dimensions[0], 2); + VERIFY_IS_EQUAL((int)dimensions[1], 3); + VERIFY_IS_EQUAL((int)dimensions[2], 7); } static void test_fixed_size() @@ -37,9 +40,9 @@ static void test_fixed_size() static void test_match() { - Eigen::DSizes dyn(Eigen::array{{2,3,7}}); + 
Eigen::DSizes dyn(2,3,7); Eigen::Sizes<2,3,7> stat; - VERIFY_IS_EQUAL(Eigen::internal::dimensions_match(dyn, stat), true); + VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); } diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index e85fcbfa9..792fdeade 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -125,6 +125,12 @@ static void test_3d() mat7 = mat1.cwiseMax(mat5 * 2.0f).exp(); Tensor mat8(2,3,7); mat8 = (-mat2).exp() * 3.14f; + Tensor mat9(2,3,7); + mat9 = mat2 + 3.14f; + Tensor mat10(2,3,7); + mat10 = mat2 - 3.14f; + Tensor mat11(2,3,7); + mat11 = mat2 / 3.14f; val = 1.0; for (int i = 0; i < 2; ++i) { @@ -136,6 +142,9 @@ static void test_3d() VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f); VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f))); VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f); + VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f); + VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f); + VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f); val += 1.0; } } @@ -172,6 +181,36 @@ static void test_constants() } } +static void test_boolean() +{ + Tensor vec(6); + std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data()); + + // Test ||. + Tensor bool1 = vec < vec.constant(1) || vec > vec.constant(4); + VERIFY_IS_EQUAL(bool1[0], true); + VERIFY_IS_EQUAL(bool1[1], false); + VERIFY_IS_EQUAL(bool1[2], false); + VERIFY_IS_EQUAL(bool1[3], false); + VERIFY_IS_EQUAL(bool1[4], false); + VERIFY_IS_EQUAL(bool1[5], true); + + // Test &&, including cast of operand vec. + Tensor bool2 = vec.cast() && vec < vec.constant(4); + VERIFY_IS_EQUAL(bool2[0], false); + VERIFY_IS_EQUAL(bool2[1], true); + VERIFY_IS_EQUAL(bool2[2], true); + VERIFY_IS_EQUAL(bool2[3], true); + VERIFY_IS_EQUAL(bool2[4], false); + VERIFY_IS_EQUAL(bool2[5], false); + + // Compilation tests: + // Test Tensor against results of cast or comparison; verifies that + // CoeffReturnType is set to match Op return type of bool for Unary and Binary + // Ops. 
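+ // Comparisons and logical operators produce expressions whose
+ // CoeffReturnType is bool, so their results can be assigned straight to a
+ // Tensor of bool, mixed with cast<bool>() results, or composed further.
+ // Minimal sketch of the pattern the lines below compile-check (names are
+ // illustrative):
+ //   Tensor<bool, 1> mask = vec > vec.constant(2) && vec < vec.constant(5);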
+ Tensor bool3 = vec.cast() && bool2; + bool3 = vec < vec.constant(4) && bool2; +} static void test_functors() { @@ -258,6 +297,7 @@ void test_cxx11_tensor_expr() CALL_SUBTEST(test_2d()); CALL_SUBTEST(test_3d()); CALL_SUBTEST(test_constants()); + CALL_SUBTEST(test_boolean()); CALL_SUBTEST(test_functors()); CALL_SUBTEST(test_type_casting()); CALL_SUBTEST(test_select()); diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp index 529584a7b..ad9de867d 100644 --- a/unsupported/test/cxx11_tensor_forced_eval.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -45,7 +45,34 @@ static void test_simple() } +static void test_const() +{ + MatrixXf input(3,3); + input.setRandom(); + MatrixXf output = input; + output.rowwise() -= input.colwise().maxCoeff(); + + Eigen::array depth_dim; + depth_dim[0] = 0; + Tensor::Dimensions dims2d; + dims2d[0] = 1; + dims2d[1] = 3; + Eigen::array bcast; + bcast[0] = 3; + bcast[1] = 1; + const TensorMap> input_tensor(input.data(), 3, 3); + Tensor output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + VERIFY_IS_APPROX(output(i, j), output_tensor(i, j)); + } + } +} + + void test_cxx11_tensor_forced_eval() { CALL_SUBTEST(test_simple()); + CALL_SUBTEST(test_const()); } diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 55d35eac0..26854f5a4 100644 --- a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -28,6 +28,9 @@ static void test_simple_patch() VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7); for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); } @@ -51,6 +54,9 @@ static void test_simple_patch() if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { expected = tensor(d, r-1+i, c-2+j, b); } + if (entire_image_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected); } } @@ -68,6 +74,11 @@ static void test_simple_patch() VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); VERIFY_IS_EQUAL(twod_patch.dimension(4), 7); + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
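+ // One common convention for this calculation (the precise rule lives in
+ // TensorTraits.h) is
+ //   out = ceil(in / stride);  pad_total = max(0, (out-1)*stride + ksize - in)
+ // with the smaller half of pad_total on the leading side. For this 3x5
+ // input with 2x2 patches at stride 1, pad_total is 1 in each dimension,
+ // so the leading paddings below both work out to 0.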
+ int row_padding = 0; + int col_padding = 0; + int stride = 1; + for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { int patchId = i+3*j; @@ -76,8 +87,13 @@ static void test_simple_patch() for (int d = 0; d < 2; ++d) { for (int b = 0; b < 7; ++b) { float expected = 0.0f; - if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { - expected = tensor(d, r-1+i, c-1+j, b); + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset, b); + } + if (twod_patch(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; } VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected); } @@ -88,6 +104,156 @@ static void test_simple_patch() } } +// Verifies VALID padding (no padding) with incrementing values. +static void test_patch_padding_valid() +{ + int input_depth = 3; + int input_rows = 3; + int input_cols = 3; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. + int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies VALID padding (no padding) with the same value. +static void test_patch_padding_valid_same_value() +{ + int input_depth = 1; + int input_rows = 5; + int input_cols = 5; + int input_batches = 2; + int ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. 
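+ // With VALID padding the patch count per dimension is
+ //   (in - ksize) / stride + 1,
+ // so the 5x5 input with 3x3 patches at stride 2 yields 2 * 2 = 4 patches,
+ // which is what the dimension(3) check below expects.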
+ Tensor tensor(input_depth, input_rows, input_cols, input_batches); + tensor = tensor.constant(11.0f); + Tensor result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // No padding is carried out. + int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r + i - row_padding; + int col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} + +// Verifies SAME padding. +static void test_patch_padding_same() +{ + int input_depth = 3; + int input_rows = 4; + int input_cols = 2; + int input_batches = 1; + int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + int stride = 2; // Only same stride is supported. + Tensor tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // Based on the calculation described in TensorTraits.h, padding happens to be + // 0. 
+ int row_padding = 0; + int col_padding = 0; + + for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + int patchId = i+input_rows*j; + for (int r = 0; r < ksize; ++r) { // patch rows + for (int c = 0; c < ksize; ++c) { // patch cols + for (int d = 0; d < input_depth; ++d) { // depth + for (int b = 0; b < input_batches; ++b) { // batch + float expected = 0.0f; + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected = tensor(d, row_offset, col_offset, b); + } + if (result(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected); + } + } + } + } + } + } +} static void test_patch_no_extra_dim() { @@ -103,6 +269,9 @@ static void test_patch_no_extra_dim() VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5); for (int i = 0; i < tensor.size(); ++i) { + if (tensor.data()[i] != single_pixel_patch.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl; + } VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]); } @@ -124,6 +293,9 @@ static void test_patch_no_extra_dim() if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { expected = tensor(d, r-1+i, c-2+j); } + if (entire_image_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected); } } @@ -139,6 +311,11 @@ static void test_patch_no_extra_dim() VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5); + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
+ int row_padding = 0; + int col_padding = 0; + int stride = 1; + for (int i = 0; i < 3; ++i) { for (int j = 0; j < 5; ++j) { int patchId = i+3*j; @@ -146,8 +323,13 @@ static void test_patch_no_extra_dim() for (int c = 0; c < 2; ++c) { for (int d = 0; d < 2; ++d) { float expected = 0.0f; - if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 3 && c-1+j < 5) { - expected = tensor(d, r-1+i, c-1+j); + int row_offset = r*stride + i - row_padding; + int col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) { + expected = tensor(d, row_offset, col_offset); + } + if (twod_patch(d, r, c, patchId) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; } VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected); } @@ -181,6 +363,9 @@ static void test_imagenet_patches() if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { expected = l_in(d, r-5+i, c-5+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -208,6 +393,9 @@ static void test_imagenet_patches() if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { expected = l_in(d, r-4+i, c-4+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -235,6 +423,9 @@ static void test_imagenet_patches() if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { expected = l_in(d, r-3+i, c-3+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -262,6 +453,9 @@ static void test_imagenet_patches() if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { expected = l_in(d, r-1+i, c-1+j, b); } + if (l_out(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected); } } @@ -271,10 +465,12 @@ static void test_imagenet_patches() } } - void test_cxx11_tensor_image_patch() { CALL_SUBTEST(test_simple_patch()); CALL_SUBTEST(test_patch_no_extra_dim()); + CALL_SUBTEST(test_patch_padding_valid()); + CALL_SUBTEST(test_patch_padding_valid_same_value()); + CALL_SUBTEST(test_patch_padding_same()); CALL_SUBTEST(test_imagenet_patches()); } diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index 478c20306..9cf2eb150 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -29,6 +29,7 @@ static void test_1d() vec1(4) = 23; vec2(4) = 4; vec1(5) = 42; vec2(5) = 5; + VERIFY_IS_EQUAL(vec1.rank(), 1); VERIFY_IS_EQUAL(vec1.size(), 6); VERIFY_IS_EQUAL(vec1.dimension(0), 6); @@ -69,10 +70,12 @@ static void test_2d() TensorMap> mat3(mat1.data(), 2, 3); TensorMap> mat4(mat2.data(), 2, 3); + VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); VERIFY_IS_EQUAL(mat3.dimension(0), 2); VERIFY_IS_EQUAL(mat3.dimension(1), 3); + 
VERIFY_IS_EQUAL(mat4.rank(), 2); VERIFY_IS_EQUAL(mat4.size(), 6); VERIFY_IS_EQUAL(mat4.dimension(0), 2); VERIFY_IS_EQUAL(mat4.dimension(1), 3); @@ -109,13 +112,15 @@ static void test_3d() } TensorMap> mat3(mat1.data(), 2, 3, 7); - TensorMap> mat4(mat2.data(), 2, 3, 7); + TensorMap> mat4(mat2.data(), array{{2, 3, 7}}); + VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); VERIFY_IS_EQUAL(mat3.dimension(0), 2); VERIFY_IS_EQUAL(mat3.dimension(1), 3); VERIFY_IS_EQUAL(mat3.dimension(2), 7); + VERIFY_IS_EQUAL(mat4.rank(), 3); VERIFY_IS_EQUAL(mat4.size(), 2*3*7); VERIFY_IS_EQUAL(mat4.dimension(0), 2); VERIFY_IS_EQUAL(mat4.dimension(1), 3); diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index 78b0dade0..b4b0a55b6 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -89,19 +89,19 @@ static void test_reshape_as_lvalue() } } - +template static void test_simple_slice() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); - Tensor slice1(1,1,1,1,1); + Tensor slice1(1,1,1,1,1); Eigen::DSizes indices(1,2,3,4,5); Eigen::DSizes sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - Tensor slice2(1,1,2,2,3); + Tensor slice2(1,1,2,2,3); Eigen::DSizes indices2(1,1,3,4,5); Eigen::DSizes sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); @@ -114,7 +114,7 @@ static void test_simple_slice() } } - +// TODO(andydavis) Add RowMajor support when TensorContract supports RowMajor. static void test_slice_in_expr() { MatrixXf m1(7,7); MatrixXf m2(3,3); @@ -141,21 +141,28 @@ static void test_slice_in_expr() { VERIFY_IS_APPROX(res(i,j), m3(i,j)); } } + + // Take an arbitrary slice of an arbitrarily sized tensor. 
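+ // Slicing composes freely with other expressions: the map below is
+ // reshaped into a flat vector, run through exp(), and only its first 35
+ // coefficients are materialized. Standalone sketch with the template
+ // arguments reconstructed (assuming a 7x7 float matrix m1):
+ //   TensorMap<Tensor<float, 2> > t(m1.data(), 7, 7);
+ //   Tensor<float, 1> head = t.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp()
+ //       .slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));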
+ TensorMap> tensor4(m1.data(), 7, 7); + Tensor tensor6 = tensor4.reshape(DSizes(7*7)).exp().slice(DSizes(0), DSizes(35)); + for (int i = 0; i < 35; ++i) { + VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); + } } - +template static void test_slice_as_lvalue() { - Tensor tensor1(2,2,7); + Tensor tensor1(2,2,7); tensor1.setRandom(); - Tensor tensor2(2,2,7); + Tensor tensor2(2,2,7); tensor2.setRandom(); - Tensor tensor3(4,3,5); + Tensor tensor3(4,3,5); tensor3.setRandom(); - Tensor tensor4(4,3,2); + Tensor tensor4(4,3,2); tensor4.setRandom(); - Tensor result(4,5,7); + Tensor result(4,5,7); Eigen::DSizes sizes12(2,2,7); Eigen::DSizes first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; @@ -190,10 +197,10 @@ static void test_slice_as_lvalue() } } - +template static void test_slice_raw_data() { - Tensor tensor(3,5,7,11); + Tensor tensor(3,5,7,11); tensor.setRandom(); Eigen::DSizes offsets(1,2,3,4); @@ -203,40 +210,78 @@ static void test_slice_raw_data() VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1ul); VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4)); - extents = Eigen::DSizes(2,1,1,1); - auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); - VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); - VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + if (DataLayout == ColMajor) { + extents = Eigen::DSizes(2,1,1,1); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4)); + } else { + extents = Eigen::DSizes(1,1,1,2); + auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2ul); + VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4)); + VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5)); + } extents = Eigen::DSizes(1,2,1,1); auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2ul); VERIFY_IS_EQUAL(slice3.data(), static_cast(0)); - offsets = Eigen::DSizes(0,2,3,4); - extents = Eigen::DSizes(3,2,1,1); - auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { - VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes(0,2,3,4); + extents = Eigen::DSizes(3,2,1,1); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4)); + } + } + } else { + offsets = Eigen::DSizes(1,2,3,0); + extents = Eigen::DSizes(1,1,2,11); + auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 2; ++k) { + VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l)); + } } } - offsets = Eigen::DSizes(0,0,0,4); - extents = Eigen::DSizes(3,5,7,2); - auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); - VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 5; ++j) { - for (int k = 
0; k < 7; ++k) { - for (int l = 0; l < 2; ++l) { - int slice_index = i + 3 * (j + 5 * (k + 7 * l)); - VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + if (DataLayout == ColMajor) { + offsets = Eigen::DSizes(0,0,0,4); + extents = Eigen::DSizes(3,5,7,2); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210ul); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + for (int l = 0; l < 2; ++l) { + int slice_index = i + 3 * (j + 5 * (k + 7 * l)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4)); + } } } } + } else { + offsets = Eigen::DSizes(1,0,0,0); + extents = Eigen::DSizes(2,5,7,11); + auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); + VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770ul); + for (int l = 0; l < 11; ++l) { + for (int k = 0; k < 7; ++k) { + for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 2; ++i) { + int slice_index = l + 11 * (k + 7 * (j + 5 * i)); + VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l)); + } + } + } + } + } offsets = Eigen::DSizes(0,0,0,0); @@ -247,14 +292,38 @@ static void test_slice_raw_data() } +static void test_composition() +{ + Eigen::Tensor matrix(7, 11); + matrix.setRandom(); + + const DSizes newDims{{1, 1, 11}}; + Eigen::Tensor tensor = + matrix.slice(DSizes(2, 0), DSizes(1, 11)).reshape(newDims); + + VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11ul); + VERIFY_IS_EQUAL(tensor.dimension(0), 1); + VERIFY_IS_EQUAL(tensor.dimension(1), 1); + VERIFY_IS_EQUAL(tensor.dimension(2), 11); + for (int i = 0; i < 11; ++i) { + VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i)); + } +} + + void test_cxx11_tensor_morphing() { CALL_SUBTEST(test_simple_reshape()); CALL_SUBTEST(test_reshape_in_expr()); CALL_SUBTEST(test_reshape_as_lvalue()); - CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_simple_slice()); + CALL_SUBTEST(test_simple_slice()); CALL_SUBTEST(test_slice_in_expr()); - CALL_SUBTEST(test_slice_as_lvalue()); - CALL_SUBTEST(test_slice_raw_data()); + CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_as_lvalue()); + CALL_SUBTEST(test_slice_raw_data()); + CALL_SUBTEST(test_slice_raw_data()); + + CALL_SUBTEST(test_composition()); } diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp index 0ffa341c4..8d05d154e 100644 --- a/unsupported/test/cxx11_tensor_of_strings.cpp +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -8,19 +8,18 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#include "main.h" -#include + #include -using std::string; using Eigen::Tensor; using Eigen::TensorMap; static void test_assign() { - string data1[6]; - TensorMap> mat1(data1, 2, 3); - string data2[6]; - const TensorMap> mat2(data2, 2, 3); + std::string data1[6]; + TensorMap> mat1(data1, 2, 3); + std::string data2[6]; + const TensorMap> mat2(data2, 2, 3); for (int i = 0; i < 6; ++i) { std::ostringstream s1; @@ -31,16 +30,16 @@ static void test_assign() data2[i] = s2.str(); } - Tensor rslt1; + Tensor rslt1; rslt1 = mat1; - Tensor rslt2; + Tensor rslt2; rslt2 = mat2; - Tensor rslt3 = mat1; - Tensor rslt4 = mat2; + Tensor rslt3 = mat1; + Tensor rslt4 = mat2; - Tensor rslt5(mat1); - Tensor rslt6(mat2); + Tensor rslt5(mat1); + Tensor rslt6(mat2); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -57,8 +56,8 @@ static void test_assign() static void test_concat() { - Tensor t1(2, 3); - Tensor t2(2, 3); + Tensor t1(2, 3); + Tensor t2(2, 3); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -71,7 +70,7 @@ static void test_concat() } } - Tensor result = t1.concatenate(t2, 1); + Tensor result = t1.concatenate(t2, 1); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 6); @@ -86,7 +85,7 @@ static void test_concat() static void test_slices() { - Tensor data(2, 6); + Tensor data(2, 6); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { std::ostringstream s1; @@ -99,8 +98,8 @@ static void test_slices() const Eigen::DSizes first_half{{0, 0}}; const Eigen::DSizes second_half{{0, 3}}; - Tensor t1 = data.slice(first_half, half_size); - Tensor t2 = data.slice(second_half, half_size); + Tensor t1 = data.slice(first_half, half_size); + Tensor t2 = data.slice(second_half, half_size); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { @@ -113,8 +112,8 @@ static void test_slices() static void test_additions() { - Tensor data1(3); - Tensor data2(3); + Tensor data1(3); + Tensor data2(3); for (int i = 0; i < 3; ++i) { data1(i) = "abc"; std::ostringstream s1; @@ -122,16 +121,26 @@ static void test_additions() data2(i) = s1.str(); } - Tensor sum = data1 + data2; + Tensor sum = data1 + data2; for (int i = 0; i < 3; ++i) { std::ostringstream concat; concat << "abc" << i; - string expected = concat.str(); + std::string expected = concat.str(); VERIFY_IS_EQUAL(sum(i), expected); } } +static void test_initialization() +{ + Tensor a(2, 3); + a.setConstant(std::string("foo")); + for (int i = 0; i < 2*3; ++i) { + VERIFY_IS_EQUAL(a(i), std::string("foo")); + } +} + + void test_cxx11_tensor_of_strings() { // Beware: none of this is likely to ever work on a GPU. 
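// (std::string owns host-side heap memory and its member functions are not
// device functions, so none of these expressions can be evaluated on a GPU.)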
@@ -139,4 +148,5 @@ void test_cxx11_tensor_of_strings() CALL_SUBTEST(test_concat()); CALL_SUBTEST(test_slices()); CALL_SUBTEST(test_additions()); + CALL_SUBTEST(test_initialization()); } diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index 6f74216dd..ffa19896e 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_padding() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array, 4> paddings; @@ -24,7 +25,7 @@ static void test_simple_padding() paddings[2] = std::make_pair(3, 4); paddings[3] = std::make_pair(0, 0); - Tensor padded; + Tensor padded; padded = tensor.pad(paddings); VERIFY_IS_EQUAL(padded.dimension(0), 2+0); @@ -47,9 +48,10 @@ static void test_simple_padding() } } +template static void test_padded_expr() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array, 4> paddings; @@ -62,17 +64,19 @@ static void test_padded_expr() reshape_dims[0] = 12; reshape_dims[1] = 84; - Tensor result; + Tensor result; result = tensor.pad(paddings).reshape(reshape_dims); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 6; ++j) { for (int k = 0; k < 12; ++k) { for (int l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); if (j >= 2 && j < 5 && k >= 3 && k < 8) { - VERIFY_IS_EQUAL(result(i+2*j,k+12*l), tensor(i,j-2,k-3,l)); + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); } else { - VERIFY_IS_EQUAL(result(i+2*j,k+12*l), 0.0f); + VERIFY_IS_EQUAL(result_value, 0.0f); } } } @@ -80,9 +84,10 @@ static void test_padded_expr() } } - void test_cxx11_tensor_padding() { - CALL_SUBTEST(test_simple_padding()); - CALL_SUBTEST(test_padded_expr()); + CALL_SUBTEST(test_simple_padding()); + CALL_SUBTEST(test_simple_padding()); + CALL_SUBTEST(test_padded_expr()); + CALL_SUBTEST(test_padded_expr()); } diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp index e2ba5bfd8..0ee7b46d4 100644 --- a/unsupported/test/cxx11_tensor_patch.cpp +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -36,6 +36,23 @@ static void test_simple_patch() VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); } + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + Tensor single_patch; + single_patch = tensor.extract_patches(patch_dims); + + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + patch_dims[0] = 1; patch_dims[1] = 2; patch_dims[2] = 2; diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index da9885166..99e19eba4 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -13,15 +13,15 @@ using Eigen::Tensor; -static void test_simple_reductions() -{ - Tensor tensor(2,3,5,7); +template +static void test_simple_reductions() { + Tensor tensor(2, 3, 5, 7); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; - Tensor result = tensor.sum(reduction_axis); + Tensor result = tensor.sum(reduction_axis); VERIFY_IS_EQUAL(result.dimension(0), 2); 
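// Summing the 2x3x5x7 input over axes {1, 3} removes those dimensions and
// leaves a 2x5 result; each coefficient is the sum of the 3 * 7 = 21
// entries that share the two remaining indices, exactly what the host
// loop below accumulates.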
VERIFY_IS_EQUAL(result.dimension(1), 5); for (int i = 0; i < 2; ++i) { @@ -36,6 +36,53 @@ static void test_simple_reductions() } } + { + Tensor sum1 = tensor.sum(); + VERIFY_IS_EQUAL(sum1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor sum2 = tensor.sum(reduction_axis); + VERIFY_IS_EQUAL(sum2.dimension(0), 1); + + VERIFY_IS_APPROX(sum1(0), sum2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 2; + result = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 3); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 7; ++j) { + float prod = 1.0f; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 5; ++l) { + prod *= tensor(k, i, l, j); + } + } + VERIFY_IS_APPROX(result(i, j), prod); + } + } + + { + Tensor prod1 = tensor.prod(); + VERIFY_IS_EQUAL(prod1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor prod2 = tensor.prod(reduction_axis); + VERIFY_IS_EQUAL(prod2.dimension(0), 1); + + VERIFY_IS_APPROX(prod1(0), prod2(0)); + } + reduction_axis[0] = 0; reduction_axis[1] = 2; result = tensor.maximum(reduction_axis); @@ -53,6 +100,21 @@ static void test_simple_reductions() } } + { + Tensor max1 = tensor.maximum(); + VERIFY_IS_EQUAL(max1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor max2 = tensor.maximum(reduction_axis); + VERIFY_IS_EQUAL(max2.dimension(0), 1); + + VERIFY_IS_APPROX(max1(0), max2(0)); + } + reduction_axis[0] = 0; reduction_axis[1] = 1; result = tensor.minimum(reduction_axis); @@ -63,24 +125,72 @@ static void test_simple_reductions() float min_val = (std::numeric_limits::max)(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { - min_val = (std::min)(min_val, tensor(k, l, i, j)); + min_val = (std::min)(min_val, tensor(k, l, i, j)); } } VERIFY_IS_APPROX(result(i, j), min_val); } } + + { + Tensor min1 = tensor.minimum(); + VERIFY_IS_EQUAL(min1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor min2 = tensor.minimum(reduction_axis); + VERIFY_IS_EQUAL(min2.dimension(0), 1); + + VERIFY_IS_APPROX(min1(0), min2(0)); + } + + reduction_axis[0] = 0; + reduction_axis[1] = 1; + result = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(result.dimension(0), 5); + VERIFY_IS_EQUAL(result.dimension(1), 7); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + float sum = 0.0f; + int count = 0; + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 3; ++l) { + sum += tensor(k, l, i, j); + ++count; + } + } + VERIFY_IS_APPROX(result(i, j), sum / count); + } + } + + { + Tensor mean1 = tensor.mean(); + VERIFY_IS_EQUAL(mean1.dimension(0), 1); + + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; + reduction_axis[2] = 2; + reduction_axis[3] = 3; + Tensor mean2 = tensor.mean(reduction_axis); + VERIFY_IS_EQUAL(mean2.dimension(0), 1); + + VERIFY_IS_APPROX(mean1(0), mean2(0)); + } } - -static void test_full_reductions() -{ - Tensor tensor(2,3); +template +static void test_full_reductions() { + Tensor tensor(2, 3); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; - Tensor result = tensor.sum(reduction_axis); + Tensor result = tensor.sum(reduction_axis); 
VERIFY_IS_EQUAL(result.dimension(0), 1); float sum = 0.0f; @@ -103,30 +213,26 @@ static void test_full_reductions() VERIFY_IS_APPROX(result(0), sqrtf(sum)); } - struct UserReducer { - UserReducer(float offset) : offset_(offset), sum_(0.0f) {} - void reduce(const float val) { - sum_ += val * val; - } - float finalize() const { - return 1.0f / (sum_ + offset_); - } + static const bool PacketAccess = false; + UserReducer(float offset) : offset_(offset) {} + void reduce(const float val, float* accum) { *accum += val * val; } + float initialize() const { return 0; } + float finalize(const float accum) const { return 1.0f / (accum + offset_); } private: - float offset_; - float sum_; + const float offset_; }; -static void test_user_defined_reductions() -{ - Tensor tensor(5,7); +template +static void test_user_defined_reductions() { + Tensor tensor(5, 7); tensor.setRandom(); array reduction_axis; reduction_axis[0] = 1; UserReducer reducer(10.0f); - Tensor result = tensor.reduce(reduction_axis, reducer); + Tensor result = tensor.reduce(reduction_axis, reducer); VERIFY_IS_EQUAL(result.dimension(0), 5); for (int i = 0; i < 5; ++i) { float expected = 10.0f; @@ -138,22 +244,24 @@ static void test_user_defined_reductions() } } - -static void test_tensor_maps() -{ - int inputs[2*3*5*7]; - TensorMap > tensor_map(inputs, 2,3,5,7); - TensorMap > tensor_map_const(inputs, 2,3,5,7); - const TensorMap > tensor_map_const_const(inputs, 2,3,5,7); +template +static void test_tensor_maps() { + int inputs[2 * 3 * 5 * 7]; + TensorMap > tensor_map(inputs, 2, 3, 5, 7); + TensorMap > tensor_map_const(inputs, 2, 3, 5, + 7); + const TensorMap > tensor_map_const_const( + inputs, 2, 3, 5, 7); tensor_map.setRandom(); array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; - Tensor result = tensor_map.sum(reduction_axis); - Tensor result2 = tensor_map_const.sum(reduction_axis); - Tensor result3 = tensor_map_const_const.sum(reduction_axis); + Tensor result = tensor_map.sum(reduction_axis); + Tensor result2 = tensor_map_const.sum(reduction_axis); + Tensor result3 = + tensor_map_const_const.sum(reduction_axis); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 5; ++j) { @@ -170,11 +278,110 @@ static void test_tensor_maps() } } +template +static void test_static_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 97); + in.setRandom(); -void test_cxx11_tensor_reduction() -{ - CALL_SUBTEST(test_simple_reductions()); - CALL_SUBTEST(test_full_reductions()); - CALL_SUBTEST(test_user_defined_reductions()); - CALL_SUBTEST(test_tensor_maps()); +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 3; +#else + Eigen::IndexList, Eigen::type2index<3> > reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 97; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 113; ++l) { + expected = (std::max)(expected, in(i, k, j, l)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +template +static void test_innermost_last_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(97, 113); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 0; + reduction_axis[1] = 1; +#else + // This triggers the use of packets for ColMajor. 
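  // For illustration, mirroring the declaration below: IndexList carries its
  // values in the type itself, e.g. a compile-time {0, 1} list:
  //   Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> axis;
  // Because the indices are known statically, the evaluator can prove that
  // the reduced dimensions are exactly the innermost ones for ColMajor and
  // select the vectorized packet path; the array<...> fallback above only
  // knows the index values at runtime.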
+ Eigen::IndexList, Eigen::type2index<1> > reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 97; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 72; ++l) { + expected = (std::max)(expected, in(l, k, i, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +template +static void test_innermost_first_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 53); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 2; + reduction_axis[1] = 3; +#else + // This triggers the use of packets for RowMajor. + Eigen::IndexList, Eigen::type2index<3>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 53; ++j) { + float expected = -1e10f; + for (int k = 0; k < 97; ++k) { + for (int l = 0; l < 113; ++l) { + expected = (std::max)(expected, in(i, j, k, l)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + +void test_cxx11_tensor_reduction() { + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_simple_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_full_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_user_defined_reductions()); + CALL_SUBTEST(test_tensor_maps()); + CALL_SUBTEST(test_tensor_maps()); + CALL_SUBTEST(test_static_dims()); + CALL_SUBTEST(test_static_dims()); + CALL_SUBTEST(test_innermost_last_dims()); + CALL_SUBTEST(test_innermost_last_dims()); + CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_first_dims()); } diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 39c623499..ec623e1f9 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -14,9 +14,10 @@ using Eigen::Tensor; using Eigen::array; +template static void test_simple_shuffling() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; shuffles[0] = 0; @@ -24,7 +25,7 @@ static void test_simple_shuffling() shuffles[2] = 2; shuffles[3] = 3; - Tensor no_shuffle; + Tensor no_shuffle; no_shuffle = tensor.shuffle(shuffles); VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); @@ -46,7 +47,7 @@ static void test_simple_shuffling() shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - Tensor shuffle; + Tensor shuffle; shuffle = tensor.shuffle(shuffles); VERIFY_IS_EQUAL(shuffle.dimension(0), 5); @@ -66,9 +67,10 @@ static void test_simple_shuffling() } +template static void test_expr_shuffling() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; @@ -76,10 +78,10 @@ static void test_expr_shuffling() shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - Tensor expected; + Tensor expected; expected = tensor.shuffle(shuffles); - Tensor result(5,7,3,2); + Tensor result(5,7,3,2); array src_slice_dim{{2,3,1,7}}; array src_slice_start{{0,0,0,0}}; @@ -128,16 +130,17 @@ static void test_expr_shuffling() } +template static void test_shuffling_as_value() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array shuffles; shuffles[2] = 0; shuffles[3] = 1; shuffles[1] = 2; shuffles[0] = 3; - Tensor shuffle(5,7,3,2); + Tensor shuffle(5,7,3,2); shuffle.shuffle(shuffles) = tensor; VERIFY_IS_EQUAL(shuffle.dimension(0), 5); @@ -158,7 +161,10 @@ static void test_shuffling_as_value() void 
test_cxx11_tensor_shuffling() { - CALL_SUBTEST(test_simple_shuffling()); - CALL_SUBTEST(test_expr_shuffling()); - CALL_SUBTEST(test_shuffling_as_value()); + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_simple_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_expr_shuffling()); + CALL_SUBTEST(test_shuffling_as_value()); + CALL_SUBTEST(test_shuffling_as_value()); } diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index a70591c82..23855fca0 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -32,6 +32,7 @@ static void test_1d() vec1(5) = 42; vec2(5) = 5; vec3(5) = 0; vec4.setZero(); + VERIFY_IS_EQUAL((vec1.rank()), 1); VERIFY_IS_EQUAL((vec1.size()), 6); VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6); @@ -99,10 +100,12 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; + VERIFY_IS_EQUAL((mat1.rank()), 2); VERIFY_IS_EQUAL((mat1.size()), 6); VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3); + VERIFY_IS_EQUAL((mat2.rank()), 2); VERIFY_IS_EQUAL((mat2.size()), 6); VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2); VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3); diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 502569d1d..1feb39dca 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -13,9 +13,10 @@ using Eigen::Tensor; +template static void test_simple_striding() { - Tensor tensor(2,3,5,7); + Tensor tensor(2,3,5,7); tensor.setRandom(); array strides; strides[0] = 1; @@ -23,7 +24,7 @@ static void test_simple_striding() strides[2] = 1; strides[3] = 1; - Tensor no_stride; + Tensor no_stride; no_stride = tensor.stride(strides); VERIFY_IS_EQUAL(no_stride.dimension(0), 2); @@ -45,7 +46,7 @@ static void test_simple_striding() strides[1] = 4; strides[2] = 2; strides[3] = 3; - Tensor stride; + Tensor stride; stride = tensor.stride(strides); VERIFY_IS_EQUAL(stride.dimension(0), 1); @@ -65,7 +66,36 @@ static void test_simple_striding() } +template +static void test_striding_as_lvalue() +{ + Tensor tensor(2,3,5,7); + tensor.setRandom(); + array strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + + Tensor result(3, 12, 10, 21); + result.stride(strides) = tensor; + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l)); + } + } + } + } +} + + void test_cxx11_tensor_striding() { - CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_simple_striding()); + CALL_SUBTEST(test_striding_as_lvalue()); + CALL_SUBTEST(test_striding_as_lvalue()); } diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f0de61f8b..e25912279 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -9,10 +9,10 @@ #define EIGEN_USE_THREADS -#include -#include "main.h" -#include +#include "main.h" +#include +#include using Eigen::Tensor; @@ -60,12 +60,12 @@ static void test_multithread_compound_assignment() } } - +template static void test_multithread_contraction() { - Tensor t_left(30, 50, 37, 31); - Tensor t_right(37, 31, 70, 2, 10); - Tensor t_result(30, 50, 70, 2, 10); + Tensor t_left(30, 50, 37, 31); + Tensor t_right(37, 31, 70, 2, 10); + Tensor 
t_result(30, 50, 70, 2, 10); t_left.setRandom(); t_right.setRandom(); @@ -74,11 +74,10 @@ static void test_multithread_contraction() typedef Tensor::DimensionPair DimPair; Eigen::array dims({{DimPair(2, 0), DimPair(3, 1)}}); - - typedef Map MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 1500, 1147); MapXf m_right(t_right.data(), 1147, 1400); - MatrixXf m_result(1500, 1400); + Matrix m_result(1500, 1400); Eigen::ThreadPoolDevice thread_pool_device(4); @@ -95,12 +94,12 @@ static void test_multithread_contraction() } } - +template static void test_contraction_corner_cases() { - Tensor t_left(32, 500); - Tensor t_right(32, 28*28); - Tensor t_result(500, 28*28); + Tensor t_left(32, 500); + Tensor t_right(32, 28*28); + Tensor t_result(500, 28*28); t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f; t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f; @@ -110,10 +109,10 @@ static void test_contraction_corner_cases() typedef Tensor::DimensionPair DimPair; Eigen::array dims{{DimPair(0, 0)}}; - typedef Map MapXf; + typedef Map> MapXf; MapXf m_left(t_left.data(), 32, 500); MapXf m_right(t_right.data(), 32, 28*28); - MatrixXf m_result(500, 28*28); + Matrix m_result(500, 28*28); Eigen::ThreadPoolDevice thread_pool_device(12); @@ -181,18 +180,18 @@ static void test_contraction_corner_cases() } } - +template static void test_multithread_contraction_agrees_with_singlethread() { int contract_size = internal::random(1, 5000); - Tensor left(internal::random(1, 80), - contract_size, - internal::random(1, 100)); + Tensor left(internal::random(1, 80), + contract_size, + internal::random(1, 100)); - Tensor right(internal::random(1, 25), - internal::random(1, 37), - contract_size, - internal::random(1, 51)); + Tensor right(internal::random(1, 25), + internal::random(1, 37), + contract_size, + internal::random(1, 51)); left.setRandom(); right.setRandom(); @@ -206,13 +205,13 @@ static void test_multithread_contraction_agrees_with_singlethread() { Eigen::ThreadPoolDevice thread_pool_device(internal::random(2, 11)); - Tensor st_result; + Tensor st_result; st_result = left.contract(right, dims); - Tensor tp_result(st_result.dimensions()); + Tensor tp_result(st_result.dimensions()); tp_result.device(thread_pool_device) = left.contract(right, dims); - VERIFY(internal::dimensions_match(st_result.dimensions(), tp_result.dimensions())); + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); for (ptrdiff_t i = 0; i < st_result.size(); i++) { // if both of the values are very small, then do nothing (because the test will fail // due to numerical precision issues when values are small) @@ -241,17 +240,30 @@ static void test_memcpy() { } +static void test_multithread_random() +{ + Eigen::ThreadPoolDevice device(2); + Tensor t(1 << 20); + t.device(device) = t.random>(); +} + + void test_cxx11_tensor_thread_pool() { CALL_SUBTEST(test_multithread_elementwise()); CALL_SUBTEST(test_multithread_compound_assignment()); - CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST(test_multithread_contraction()); + CALL_SUBTEST(test_multithread_contraction()); - CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); + CALL_SUBTEST(test_multithread_contraction_agrees_with_singlethread()); // Exercise various cases that have been problematic in the past. 
- CALL_SUBTEST(test_contraction_corner_cases()); + CALL_SUBTEST(test_contraction_corner_cases()); + CALL_SUBTEST(test_contraction_corner_cases()); CALL_SUBTEST(test_memcpy()); + + CALL_SUBTEST(test_multithread_random()); } From 641e824c56db8fffb2f6091d18f913e040c1ea95 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 15 Jan 2015 11:11:48 -0800 Subject: [PATCH 160/214] Added cube() operation --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6 ++++++ unsupported/test/cxx11_tensor_expr.cpp | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 8860f622b..e08ac6aa1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -83,6 +83,12 @@ class TensorBase return unaryExpr(internal::scalar_square_op()); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> inverse() const { diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 792fdeade..695565e9b 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -32,6 +32,9 @@ static void test_1d() float data4[6]; TensorMap> vec4(data4, 6); vec4 = vec2.square(); + float data5[6]; + TensorMap> vec5(data5, 6); + vec5 = vec2.cube(); VERIFY_IS_APPROX(vec3(0), sqrtf(4.0)); VERIFY_IS_APPROX(vec3(1), sqrtf(8.0)); @@ -47,6 +50,13 @@ static void test_1d() VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f); VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f); + VERIFY_IS_APPROX(vec5(0), 0.0f); + VERIFY_IS_APPROX(vec5(1), 1.0f); + VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f); + VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f); + VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f); + VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f); + vec3 = vec1 + vec2; VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f); VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f); From 14f537c296710173c76379d8efec59bfb1d78eb7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 16 Jan 2015 09:09:23 -0800 Subject: [PATCH 161/214] gcc doesn't consider that template TensorStridingOp& operator = (const OtherDerived& other) provides a valid assignment operator for the striding operation, and therefore refuses to compile code like: result.stride(foo) = source.stride(bar); Added the explicit TensorStridingOp& operator = (const TensorStridingOp& other) as a workaround to get the code to compile, and did the same in all the operations that can be used as lvalues. 
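The language rule behind this workaround: a member template is never used to instantiate a special member function, so a templated operator= neither suppresses nor serves as the copy-assignment operator for same-type assignments. A minimal standalone sketch of the rule, using a hypothetical Op type rather than the actual tensor expression classes:

#include <iostream>

struct Op {
  // Matches almost anything, but is never a copy-assignment operator.
  template <typename Other>
  Op& operator=(const Other&) {
    std::cout << "templated operator=\n";
    return *this;
  }
};

int main()
{
  Op a, b;
  a = 1;  // instantiates the template with Other = int
  a = b;  // uses the implicitly-declared copy assignment, not the template
  return 0;
}

Hence the explicit TensorStridingOp& operator=(const TensorStridingOp&) overload below: it guarantees that assigning one striding expression to another of the same type goes through the evaluating TensorAssignOp path rather than whatever the compiler synthesizes or rejects.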
--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 10 +++++++++ .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 19 +++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 9 ++++++++ .../Eigen/CXX11/src/Tensor/TensorStriding.h | 9 ++++++++ unsupported/test/cxx11_tensor_chipping.cpp | 21 +++++++++++++++++++ unsupported/test/cxx11_tensor_morphing.cpp | 13 ++++++++++++ unsupported/test/cxx11_tensor_shuffling.cpp | 17 +++++++++++++++ unsupported/test/cxx11_tensor_striding.cpp | 18 ++++++++++++++++ 8 files changed, 116 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index bc336e488..503803d23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -101,6 +101,16 @@ class TensorChippingOp : public TensorBase > const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 23b595ac3..87a4b0758 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -73,6 +73,15 @@ class TensorReshapingOp : public TensorBase::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) @@ -257,6 +266,16 @@ class TensorSlicingOp : public TensorBase Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + protected: typename XprType::Nested m_xpr; const StartIndices m_indices; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index ab5fc6a69..620a63ae7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -73,6 +73,15 @@ class TensorShufflingOp : public TensorBase const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 2fbdfadfe..5aa2c8d3b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -73,6 +73,15 @@ class TensorStridingOp : public TensorBase > const typename internal::remove_all::type& expression() const { return m_xpr; } + 
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other) diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 0de7bbac6..d83417872 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -318,8 +318,29 @@ static void test_chip_as_lvalue() } } } + + Tensor input7(2,3,5,7,11); + input7.setRandom(); + tensor = input1; + tensor.chip(0, 0) = input7.chip(0, 0); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + for (int m = 0; m < 11; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } } + template static void test_chip_raw_data() { diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index b4b0a55b6..7fd7a283a 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -161,6 +161,8 @@ static void test_slice_as_lvalue() tensor3.setRandom(); Tensor tensor4(4,3,2); tensor4.setRandom(); + Tensor tensor5(10,13,12); + tensor5.setRandom(); Tensor result(4,5,7); Eigen::DSizes sizes12(2,2,7); @@ -195,6 +197,17 @@ static void test_slice_as_lvalue() } } } + + Eigen::DSizes sizes5(4,5,7); + Eigen::DSizes fifth_slice(0,0,0); + result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5); + for (int i = 0; i < 4; ++i) { + for (int j = 2; j < 5; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k)); + } + } + } } template diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index ec623e1f9..2f7fd9e50 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -157,6 +157,23 @@ static void test_shuffling_as_value() } } } + + array no_shuffle; + no_shuffle[0] = 0; + no_shuffle[1] = 1; + no_shuffle[2] = 2; + no_shuffle[3] = 3; + Tensor shuffle2(5,7,3,2); + shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l)); + } + } + } + } } void test_cxx11_tensor_shuffling() diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 1feb39dca..935b908cc 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -89,6 +89,24 @@ static void test_striding_as_lvalue() } } } + + array no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; + Tensor result2(3, 12, 10, 21); + result2.stride(strides) = tensor.stride(no_strides); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l)); + } + } + } + } } From b9d314ae19b1c857adfe42b7eaecf2695428f0ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 17 Jan 2015 21:55:33 +0100 Subject: [PATCH 162/214] 
bug #329: fix typo --- Eigen/src/Eigenvalues/RealQZ.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index ae10ff91e..128ef9028 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -313,7 +313,7 @@ namespace Eigen { using std::abs; using std::sqrt; const Index dim=m_S.cols(); - if (abs(m_S.coeff(i+1,i)==Scalar(0))) + if (abs(m_S.coeff(i+1,i))==Scalar(0)) return; Index z = findSmallDiagEntry(i,i+1); if (z==i-1) From e1f1091fde660581d64b54ff1019bc494dbbca89 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 24 Jan 2015 10:32:49 +0100 Subject: [PATCH 163/214] Add support for dense ?= diagonal --- Eigen/src/Core/DiagonalMatrix.h | 6 ++++++ test/diagonalmatrices.cpp | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index e3dc71336..49b9b7925 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -326,6 +326,12 @@ struct Assignment dst.setZero(); dst.diagonal() = src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.diagonal() += src.diagonal(); } + + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // namespace internal diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp index 149f1db2f..0227ba577 100644 --- a/test/diagonalmatrices.cpp +++ b/test/diagonalmatrices.cpp @@ -84,6 +84,13 @@ template void diagonalmatrices(const MatrixType& m) VERIFY_IS_APPROX(m1 * (rdm1 * s1), (m1 * rdm1) * s1); VERIFY_IS_APPROX(m1 * (s1 * rdm1), (m1 * rdm1) * s1); + + // Diagonal to dense + sq_m1.setRandom(); + sq_m2 = sq_m1; + VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() ); + VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() ); } void test_diagonalmatrices() From c6eb84aabcf102aaa3ba1c288e890984f4b49277 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 26 Jan 2015 17:09:01 +0100 Subject: [PATCH 164/214] Enable vectorization of transposeInPlace for PacketSize x PacketSize matrices --- Eigen/src/Core/Transpose.h | 27 ++++++++++++++++++++++++--- test/adjoint.cpp | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index a3b95256f..3bab6092c 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -217,18 +217,39 @@ MatrixBase::adjoint() const namespace internal { template + bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic, + bool MatchPacketSize = + (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits::size)) + && (internal::evaluator::Flags&PacketAccessBit) > struct inplace_transpose_selector; template -struct inplace_transpose_selector { // square matrix +struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { m.matrix().template triangularView().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. 
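// In the PacketSize x PacketSize specialization defined below, each column of
// the matrix fits in exactly one SIMD packet: all columns are loaded into a
// PacketBlock, transposed in registers with internal::ptranspose(), and
// written back through writePacket(). The Alignment constant is derived from
// the evaluator flags so that aligned loads and stores are used when legal.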
template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // PacketSize x PacketSize + static void run(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename MatrixType::Index Index; + const Index PacketSize = internal::packet_traits::size; + const Index Alignment = internal::evaluator::Flags&AlignedBit ? Aligned : Unaligned; + PacketBlock A; + for (Index i=0; i(i,0); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]); + } +}; + +template +struct inplace_transpose_selector { // non square matrix static void run(MatrixType& m) { if (m.rows()==m.cols()) m.matrix().template triangularView().swap(m.matrix().transpose()); diff --git a/test/adjoint.cpp b/test/adjoint.cpp index ea36f7841..3b2a53c91 100644 --- a/test/adjoint.cpp +++ b/test/adjoint.cpp @@ -64,6 +64,7 @@ template void adjoint(const MatrixType& m) typedef typename NumTraits::Real RealScalar; typedef Matrix VectorType; typedef Matrix SquareMatrixType; + const Index PacketSize = internal::packet_traits::size; Index rows = m.rows(); Index cols = m.cols(); @@ -108,6 +109,17 @@ template void adjoint(const MatrixType& m) VERIFY_IS_APPROX(m3,m1.transpose()); m3.transposeInPlace(); VERIFY_IS_APPROX(m3,m1); + + if(PacketSize(0,m3.rows()-PacketSize); + Index j = internal::random(0,m3.cols()-PacketSize); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX( (m3.template block(i,j)), (m1.template block(i,j).transpose()) ); + m3.template block(i,j).transposeInPlace(); + VERIFY_IS_APPROX(m3,m1); + } // check inplace adjoint m3 = m1; @@ -129,9 +141,19 @@ void test_adjoint() CALL_SUBTEST_1( adjoint(Matrix()) ); CALL_SUBTEST_2( adjoint(Matrix3d()) ); CALL_SUBTEST_3( adjoint(Matrix4f()) ); + CALL_SUBTEST_4( adjoint(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( adjoint(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( adjoint(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + + // Complement for 128 bits vectorization: + CALL_SUBTEST_8( adjoint(Matrix2d()) ); + CALL_SUBTEST_9( adjoint(Matrix()) ); + + // 256 bits vectorization: + CALL_SUBTEST_10( adjoint(Matrix()) ); + CALL_SUBTEST_11( adjoint(Matrix()) ); + CALL_SUBTEST_12( adjoint(Matrix()) ); } // test a large static matrix only once CALL_SUBTEST_7( adjoint(Matrix()) ); From 46fc881e4ae23ef577ee20dcd61a5a74cba8b874 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 26 Jan 2015 17:46:40 -0800 Subject: [PATCH 165/214] Added a few benchmarks for the tensor code --- bench/tensors/tensor_benchmarks.h | 305 +++++++++++++++++++++++++ bench/tensors/tensor_benchmarks_cpu.cc | 156 +++++++++++++ bench/tensors/tensor_benchmarks_gpu.cc | 75 ++++++ 3 files changed, 536 insertions(+) create mode 100644 bench/tensors/tensor_benchmarks.h create mode 100644 bench/tensors/tensor_benchmarks_cpu.cc create mode 100644 bench/tensors/tensor_benchmarks_gpu.cc diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h new file mode 100644 index 000000000..525b9acda --- /dev/null +++ b/bench/tensors/tensor_benchmarks.h @@ -0,0 +1,305 @@ +#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ +#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ + +typedef int TensorIndex; +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#include 
"third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "testing/base/public/benchmark.h" + +using Eigen::Tensor; +using Eigen::TensorMap; + + +// TODO(bsteiner): also templatize on the input type since we have users +// for int8 as well as floats. +template class BenchmarkSuite { + public: + BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n) + : m_(m), k_(k), n_(n), device_(device) { + initialize(); + } + + BenchmarkSuite(const Device& device, size_t m) + : m_(m), k_(m), n_(m), device_(device) { + initialize(); + } + + ~BenchmarkSuite() { + device_.deallocate(a_); + device_.deallocate(b_); + device_.deallocate(c_); + } + + void memcpy(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(float)); + } + // Record the number of values copied per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void random(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + TensorMap, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = C.random(); + } + // Record the number of random numbers generated per second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void slicing(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); + + const Eigen::DSizes quarter_sizes(Eigen::array(m_/2, m_/2)); + const Eigen::DSizes first_quadrant(Eigen::array(0, 0)); + const Eigen::DSizes second_quadrant(Eigen::array(0, m_/2)); + const Eigen::DSizes third_quadrant(Eigen::array(m_/2, 0)); + const Eigen::DSizes fourth_quadrant(Eigen::array(m_/2, m_/2)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } + // Record the number of values copied from the rhs slice to the lhs slice + // each second + finalizeBenchmark(m_ * m_ * num_iters); + } + + void shuffling(int num_iters) { + eigen_assert(m_ == n_); + const Eigen::array size_a(m_, k_); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(k_, m_); + TensorMap, Eigen::Aligned> B(b_, size_b); + + const Eigen::array shuffle(1, 0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } + // Record the number of values shuffled from A and copied to B each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void padding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array size_a(m_, k_-3); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(k_, m_); + TensorMap, Eigen::Aligned> B(b_, size_b); + + Eigen::array, 2> paddings; + paddings[0] = Eigen::IndexPair(0, 0); + paddings[1] = Eigen::IndexPair(2, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.pad(paddings); + } + // Record the number of values copied from the padded tensor A each 
second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void striding(int num_iters) { + eigen_assert(m_ == k_); + const Eigen::array size_a(m_, k_); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_b(m_, k_ / 2); + TensorMap, Eigen::Aligned> B(b_, size_b); + + const Eigen::array strides(1, 2); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + B.device(device_) = A.stride(strides); + } + // Record the number of values copied from the padded tensor A each second + finalizeBenchmark(m_ * k_ * num_iters); + } + + void broadcasting(int num_iters) { + const Eigen::array size_a(m_, 1); + const TensorMap, Eigen::Aligned> A(a_, size_a); + const Eigen::array size_c(m_, n_); + TensorMap, Eigen::Aligned> C(c_, size_c); + +#if defined(__CUDACC__) + // nvcc doesn't support cxx11 + const Eigen::array broadcast(1, n_); +#else + // Take advantage of cxx11 to give the compiler information it can use to + // optimize the code. + Eigen::IndexList, int> broadcast; + broadcast.set(1, n_); +#endif + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } + // Record the number of values broadcasted from A and copied to C each second + finalizeBenchmark(m_ * n_ * num_iters); + } + + void coeffWiseOp(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7); + } + // Record the number of FLOP executed per second (2 multiplications and + // 1 addition per value) + finalizeBenchmark(3 * m_ * m_ * num_iters); + } + + void algebraicFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + void transcendentalFunc(int num_iters) { + eigen_assert(m_ == k_ && k_ == n_); + const Eigen::array sizes(m_, m_); + const TensorMap, Eigen::Aligned> A(a_, sizes); + const TensorMap, Eigen::Aligned> B(b_, sizes); + TensorMap, Eigen::Aligned> C(c_, sizes); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.exp() + B.log(); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // Simple reduction + void reduction(int num_iters) { + const Eigen::array input_size(k_, n_); + const TensorMap, Eigen::Aligned> B(b_, input_size); + const Eigen::array output_size(n_); + TensorMap, Eigen::Aligned> C(c_, output_size); + + const Eigen::array sum_along_dim(0); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } + // Record the number of FLOP executed per second (assuming one operation + // per value) + finalizeBenchmark(m_ * m_ * num_iters); + } + + // do a contraction which is equivalent to a matrix multiplication + void contraction(int num_iters) { + 
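    // The contraction below pairs dimension 1 of A (m_ x k_) with dimension 0
    // of B (k_ x n_) via DimPair(1, 0); this is exactly the matrix product
    // C = A * B, hence the 2 * m_ * n_ * k_ FLOP recorded per iteration.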
const Eigen::array sizeA(m_, k_); + const Eigen::array sizeB(k_, n_); + const Eigen::array sizeC(m_, n_); + + const TensorMap, Eigen::Aligned> A(a_, sizeA); + const TensorMap, Eigen::Aligned> B(b_, sizeB); + TensorMap, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor::DimensionPair DimPair; + const Eigen::array dims(DimPair(1, 0)); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast(2) * m_ * n_ * k_ * num_iters); + } + + void convolution(int num_iters, int kernel_x, int kernel_y) { + const Eigen::array input_sizes(m_, n_); + TensorMap, Eigen::Aligned> A(a_, input_sizes); + const Eigen::array kernel_sizes(kernel_x, kernel_y); + TensorMap, Eigen::Aligned> B(b_, kernel_sizes); + const Eigen::array result_sizes( + m_ - kernel_x + 1, n_ - kernel_y + 1); + TensorMap, Eigen::Aligned> C(c_, result_sizes); + Eigen::array::Index, 2> dims(0, 1); + + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.convolve(B, dims); + } + // Record the number of FLOP executed per second (kernel_size + // multiplications and additions for each value in the resulting tensor) + finalizeBenchmark( + (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters); + } + + private: + void initialize() { + a_ = (float *) device_.allocate(m_ * k_ * sizeof(float)); + b_ = (float *) device_.allocate(k_ * n_ * sizeof(float)); + c_ = (float *) device_.allocate(m_ * n_ * sizeof(float)); + + // Initialize the content of the memory pools to prevent asan from + // complaining. + device_.memset(a_, 12, m_ * k_ * sizeof(float)); + device_.memset(b_, 23, k_ * n_ * sizeof(float)); + device_.memset(c_, 31, m_ * n_ * sizeof(float)); + + BenchmarkUseRealTime(); + } + + inline void finalizeBenchmark(int64 num_items) { +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + if (Eigen::internal::is_same::value) { + device_.synchronize(); + } +#endif + StopBenchmarkTiming(); + SetBenchmarkItemsProcessed(num_items); + } + + + size_t m_; + size_t k_; + size_t n_; + float* a_; + float* b_; + float* c_; + Device device_; +}; +#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_ diff --git a/bench/tensors/tensor_benchmarks_cpu.cc b/bench/tensors/tensor_benchmarks_cpu.cc new file mode 100644 index 000000000..68653ba15 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_cpu.cc @@ -0,0 +1,156 @@ +#define EIGEN_USE_THREADS + +#include "base/sysinfo.h" +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" +#include "thread/threadpool.h" + +#ifdef __ANDROID__ +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPoolDevice device(threads); +#else +#define CREATE_THREAD_POOL(threads) \ +ThreadPool tp(threads); \ +tp.StartWorkers(); \ +Eigen::ThreadPoolDevice device(&tp, threads); +#endif + +// Simple functions +#define BM_FuncCPU(FUNC, THREADS) \ + static void BM_##FUNC##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000); + +BM_FuncCPU(memcpy, 4); +BM_FuncCPU(memcpy, 8); +BM_FuncCPU(memcpy, 12); + +BM_FuncCPU(random, 4); +BM_FuncCPU(random, 8); +BM_FuncCPU(random, 12); + +BM_FuncCPU(slicing, 4); 
+BM_FuncCPU(slicing, 8); +BM_FuncCPU(slicing, 12); + +BM_FuncCPU(shuffling, 4); +BM_FuncCPU(shuffling, 8); +BM_FuncCPU(shuffling, 12); + +BM_FuncCPU(padding, 4); +BM_FuncCPU(padding, 8); +BM_FuncCPU(padding, 12); + +BM_FuncCPU(striding, 4); +BM_FuncCPU(striding, 8); +BM_FuncCPU(striding, 12); + +BM_FuncCPU(broadcasting, 4); +BM_FuncCPU(broadcasting, 8); +BM_FuncCPU(broadcasting, 12); + +BM_FuncCPU(coeffWiseOp, 4); +BM_FuncCPU(coeffWiseOp, 8); +BM_FuncCPU(coeffWiseOp, 12); + +BM_FuncCPU(algebraicFunc, 4); +BM_FuncCPU(algebraicFunc, 8); +BM_FuncCPU(algebraicFunc, 12); + +BM_FuncCPU(transcendentalFunc, 4); +BM_FuncCPU(transcendentalFunc, 8); +BM_FuncCPU(transcendentalFunc, 12); + +BM_FuncCPU(reduction, 4); +BM_FuncCPU(reduction, 8); +BM_FuncCPU(reduction, 12); + + +// Contractions +#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \ + static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\ + StopBenchmarkTiming(); \ + if (THREADS == 1) { \ + Eigen::DefaultDevice device; \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } else { \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000); + + +BM_FuncWithInputDimsCPU(contraction, N, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12); +BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16); + +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12); +BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16); + +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12); +BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16); + + +// Convolutions +#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \ + static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(THREADS); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000); + +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12); + +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12); + 
+BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12); + +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8); +BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12); + +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8); +BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12); diff --git a/bench/tensors/tensor_benchmarks_gpu.cc b/bench/tensors/tensor_benchmarks_gpu.cc new file mode 100644 index 000000000..adea754ad --- /dev/null +++ b/bench/tensors/tensor_benchmarks_gpu.cc @@ -0,0 +1,75 @@ +#define EIGEN_USE_GPU + +#include +#include +#include +#include "strings/strcat.h" +#include "third_party/eigen3/tensor_benchmarks.h" + + + +// Simple functions +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(memcpy); +BM_FuncGPU(random); +BM_FuncGPU(slicing); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(reduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); +BM_FuncWithInputDimsGPU(contraction, N, 64, N); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cudaStream_t stream; \ + cudaStreamCreate(&stream); \ + Eigen::GpuDevice device(&stream); \ + BenchmarkSuite suite(device, N); \ + cudaDeviceSynchronize(); \ + suite.FUNC(iters, DIM1, DIM2); \ + cudaStreamDestroy(stream); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); From 9dfdbd7e568bd3aa9a4610986dcfc679b9ea425d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 27 Jan 2015 14:15:31 -0800 Subject: [PATCH 166/214] mproved the performance of tensor reductions that preserve the inner most dimension(s). 
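The idea: when the reduced dimensions leave the innermost (contiguous) dimension intact, PacketSize consecutive output coefficients are computed from PacketSize consecutive input coefficients, so whole SIMD packets can be accumulated through the new reducePacket()/finalizePacket() hooks instead of one coefficient at a time. A simplified standalone sketch of that access pattern, written with Eigen's internal packet primitives (it mirrors, but is not, the patch code): summing over the columns of a column-major m x n matrix while preserving dimension 0:

#include <Eigen/Core>
using namespace Eigen::internal;

// Reduce dimension 1, keep the contiguous dimension 0 (illustrative only;
// the real implementation is generic over rank, reducer and layout).
void sum_over_columns(const float* in, float* out, int m, int n) {
  typedef packet_traits<float>::type Packet;
  const int PacketSize = packet_traits<float>::size;
  int i = 0;
  for (; i + PacketSize <= m; i += PacketSize) {
    Packet acc = pset1<Packet>(0.0f);                  // initializePacket()
    for (int j = 0; j < n; ++j)
      acc = padd(acc, ploadu<Packet>(in + j * m + i)); // reducePacket()
    pstoreu(out + i, acc);                             // finalizePacket()
  }
  for (; i < m; ++i) {  // scalar tail when m is not a multiple of PacketSize
    float s = 0.0f;
    for (int j = 0; j < n; ++j) s += in[j * m + i];
    out[i] = s;
  }
}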
--- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eebcc4850..c6a8ecb5d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -48,6 +48,11 @@ template struct are_inner_most_dims { static const bool value = false; }; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + #if __cplusplus > 199711L template struct are_inner_most_dims{ @@ -61,6 +66,16 @@ struct are_inner_most_dims{ index_statically_eq()(0, NumTensorDims - array_size::value) && index_statically_eq()(array_size::value - 1, NumTensorDims - 1); }; +template +struct preserve_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_gt()(0, 0); +}; +template +struct preserve_inner_most_dims{ + static const bool value = indices_statically_known_to_increase()() && + index_statically_lt()(array_size::value - 1, NumTensorDims - 1); +}; #endif @@ -108,7 +123,35 @@ struct InnerMostDimReducer { for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizePacket(accum, p); + return reducer.finalizeBoth(accum, p); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet(input), accum); + } } }; @@ -168,11 +211,14 @@ struct TensorEvaluator, Device> }; static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()) { EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); // Bitmap indicating if an input dimension is reduced or not. 
array reduced; @@ -291,6 +337,20 @@ struct TensorEvaluator, Device> values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, num_values_to_reduce, reducer); } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + // TBD: extend this the the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index + i); + } + } } else { for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index + i); @@ -305,6 +365,7 @@ struct TensorEvaluator, Device> private: template friend struct internal::GenericDimReducer; template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimPreserver; // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". @@ -316,6 +377,7 @@ struct TensorEvaluator, Device> return index * m_preservedStrides[NumOutputDims - 1]; } } + // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; if (Layout == ColMajor) { for (int i = NumOutputDims - 1; i > 0; --i) { From a727a2c4edfa85303ad1ad406e65415187fbb770 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 28 Jan 2015 16:07:51 +0100 Subject: [PATCH 167/214] bug #933: RealSchur, do not consider the input matrix norm to check negligible sub-diag entries. This also makes this test consistent with the complex and self-adjoint cases. 
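Concretely, a sub-diagonal entry is now judged negligible relative to its two diagonal neighbours only; the old test fell back to the norm of the whole matrix when those neighbours were zero, so deflation could depend on unrelated parts of T. Note the comparison also changes from < to <=, so a sub-diagonal entry that is exactly zero still deflates even when both neighbouring diagonal entries vanish. The criterion in the diff below reduces to:

// Deflation test applied at row 'res' (sketch of the criterion):
Scalar s = abs(m_matT.coeff(res-1, res-1)) + abs(m_matT.coeff(res, res));
bool negligible = abs(m_matT.coeff(res, res-1)) <= NumTraits<Scalar>::epsilon() * s;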
--- Eigen/src/Eigenvalues/RealSchur.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 10f5fb174..51e61ba38 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -234,7 +234,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu, const Scalar& norm); + Index findSmallSubdiagEntry(Index iu); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -286,7 +286,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu, norm); + Index il = findSmallSubdiagEntry(iu); // Check for convergence if (il == iu) // One root found @@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& norm) +inline typename MatrixType::Index RealSchur::findSmallSubdiagEntry(Index iu) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (s == 0.0) - s = norm; - if (abs(m_matT.coeff(res,res-1)) < NumTraits::epsilon() * s) + if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) break; res--; } @@ -457,9 +455,7 @@ inline void RealSchur::initFrancisQRStep(Index il, Index iu, const V const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2))); const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1))); if (abs(lhs) < NumTraits::epsilon() * rhs) - { break; - } } } From 5a6ea4edf61b5626a781070c6342fc16606b490a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 28 Jan 2015 10:02:47 -0800 Subject: [PATCH 168/214] Added more tests to cover tensor reductions --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 43 +++++++++---- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 62 ++++++++++++++++++- unsupported/test/cxx11_tensor_reduction.cpp | 37 ++++++++++- 3 files changed, 128 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 7b8d34321..38586d067 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -37,7 +37,11 @@ template struct SumReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return saccum + predux(vaccum); } }; @@ -45,16 +49,16 @@ template struct SumReducer template struct MeanReducer { static const bool PacketAccess = true; - MeanReducer() : count_(0) { } + MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { (*accum) += t; - count_++; + scalarCount_++; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* 
accum) { (*accum) = padd(*accum, p); - count_ += packet_traits::size; + packetCount_++; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { @@ -65,15 +69,20 @@ template struct MeanReducer return pset1(0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / count_; + return accum / scalarCount_; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { - return (saccum + predux(vaccum)) / count_; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(packetCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * packet_traits::size); } protected: - int count_; + int scalarCount_; + int packetCount_; }; template struct MaxReducer @@ -99,7 +108,11 @@ template struct MaxReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return (std::max)(saccum, predux_max(vaccum)); } }; @@ -127,7 +140,11 @@ template struct MinReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return (std::min)(saccum, predux_min(vaccum)); } }; @@ -156,7 +173,11 @@ template struct ProdReducer return accum; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizePacket(const T saccum, const Packet& vaccum) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { return saccum * predux_mul(vaccum); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 209749042..7ff47673d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -181,7 +181,7 @@ template size_t array_prod(const Ind result *= sizes[i]; } return result; -} +}; template struct array_size > { static const size_t value = std::tuple_size >::value; @@ -307,6 +307,52 @@ struct index_statically_ne > { }; +template +struct index_statically_gt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] > value; + } +}; + +template +struct index_statically_gt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] > value; + } +}; + +template +struct index_statically_lt { + constexpr bool operator() (DenseIndex, DenseIndex) const { + return false; + } +}; + +template +struct index_statically_lt > { + constexpr bool operator() (const 
DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] < value; + } +}; + +template +struct index_statically_lt > { + constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { + return IndexList().value_known_statically(i) & + IndexList()[i] < value; + } +}; + } // end namespace internal } // end namespace Eigen @@ -351,6 +397,20 @@ struct index_statically_ne { } }; +template +struct index_statically_gt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + +template +struct index_statically_lt { + EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{ + return false; + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 99e19eba4..5c3184833 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -369,6 +369,37 @@ static void test_innermost_first_dims() { } } +template +static void test_reduce_middle_dims() { + Tensor in(72, 53, 97, 113); + Tensor out(72, 53); + in.setRandom(); + +// Reduce on the innermost dimensions. +#if __cplusplus <= 199711L + array reduction_axis; + reduction_axis[0] = 1; + reduction_axis[1] = 2; +#else + // This triggers the use of packets for RowMajor. + Eigen::IndexList, Eigen::type2index<2>> reduction_axis; +#endif + + out = in.maximum(reduction_axis); + + for (int i = 0; i < 72; ++i) { + for (int j = 0; j < 113; ++j) { + float expected = -1e10f; + for (int k = 0; k < 53; ++k) { + for (int l = 0; l < 97; ++l) { + expected = (std::max)(expected, in(i, k, l, j)); + } + } + VERIFY_IS_APPROX(out(i, j), expected); + } + } +} + void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_simple_reductions()); CALL_SUBTEST(test_simple_reductions()); @@ -380,8 +411,10 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_tensor_maps()); CALL_SUBTEST(test_static_dims()); CALL_SUBTEST(test_static_dims()); - CALL_SUBTEST(test_innermost_last_dims()); CALL_SUBTEST(test_innermost_last_dims()); - CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_last_dims()); CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_innermost_first_dims()); + CALL_SUBTEST(test_reduce_middle_dims()); + CALL_SUBTEST(test_reduce_middle_dims()); } From e896c0ade7c77a18acb1b3ef01f22ef698c1a2a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 29 Jan 2015 10:29:47 -0800 Subject: [PATCH 169/214] Marked the contraction operation as read-only, since its result can't be assigned. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a02a273e7..af843654c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -423,7 +423,7 @@ struct traits -class TensorContractionOp : public TensorBase > +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; From 9d82f7e30d53086d090ae13d69dffef771eb6263 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 17:24:40 +0100 Subject: [PATCH 170/214] Supernodes were disabled.
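For context, the one-line hunk below restores m_perfv.panel_size to its intended default of 16; it had been left at 1, which the commit message ties to supernodes being disabled. A minimal, hypothetical usage sketch of the solver this tuning affects (matrix and right-hand side assumed filled elsewhere):

\code
#include <Eigen/SparseLU>

Eigen::SparseMatrix<double> A;   // assumed filled and compressed
Eigen::VectorXd b;               // assumed filled
Eigen::SparseLU<Eigen::SparseMatrix<double> > solver;
solver.analyzePattern(A);        // fill-reducing ordering
solver.factorize(A);             // supernodal factorization, where panel_size matters
Eigen::VectorXd x = solver.solve(b);
\endcode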
--- Eigen/src/SparseLU/SparseLU.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index d72d7f150..79b78da99 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase >, // Functions void initperfvalues() { - m_perfv.panel_size = 1; + m_perfv.panel_size = 16; m_perfv.relax = 1; m_perfv.maxsuper = 128; m_perfv.rowblk = 16; From f1092d2f736bbe541b3d8b5cca893f668f9c6b5f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 30 Jan 2015 19:04:04 +0100 Subject: [PATCH 171/214] bug #941: fix accuracy issue in ColPivHouseholderQR, do not stop decomposition on a small pivot --- Eigen/src/QR/ColPivHouseholderQR.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h index de77e8411..370cb69e3 100644 --- a/Eigen/src/QR/ColPivHouseholderQR.h +++ b/Eigen/src/QR/ColPivHouseholderQR.h @@ -476,20 +476,10 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const // we store that back into our table: it can't hurt to correct our table. m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm; - // if the current biggest column is smaller than epsilon times the initial biggest column, - // terminate to avoid generating nan/inf values. - // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so) - // repetitions of the unit test, with the result of solve() filled with large values of the order - // of 1/(size*epsilon). - if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) - { + // Track the number of meaningful pivots but do not stop the decomposition to make + // sure that the initial matrix is properly reproduced. See bug 941. + if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k)) m_nonzero_pivots = k; - m_hCoeffs.tail(size-k).setZero(); - m_qr.bottomRightCorner(rows-k,cols-k) - .template triangularView() - .setZero(); - break; - } // apply the transposition to the columns m_colsTranspositions.coeffRef(k) = biggest_col_index; @@ -518,7 +508,7 @@ ColPivHouseholderQR& ColPivHouseholderQR::compute(const } m_colsPermutation.setIdentity(PermIndexType(cols)); - for(PermIndexType k = 0; k < m_nonzero_pivots; ++k) + for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k) m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k))); m_det_pq = (number_of_transpositions%2) ? -1 : 1; @@ -574,13 +564,15 @@ struct Assignment >, interna } // end namespace internal -/** \returns the matrix Q as a sequence of householder transformations */ +/** \returns the matrix Q as a sequence of householder transformations. 
+ * You can extract the meaningful part only by using: + * \code qr.householderQ().setLength(qr.nonzeroPivots()) */ template typename ColPivHouseholderQR::HouseholderSequenceType ColPivHouseholderQR ::householderQ() const { eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots); + return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()); } #ifndef __CUDACC__ From 759bd92a85393617a56405ec0372e87416cfaebb Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:27:56 -0500 Subject: [PATCH 172/214] bug #935: Add asm comments in GEBP kernels to work around a bug in both GCC and Clang on ARM/NEON, whereby they spill registers, severely harming performance. The reason why the asm comments make a difference is that they prevent the compiler from reordering code across these boundaries, which has the effect of extending the lifetime of local variables and increasing register pressure on this register-tight code. --- .../Core/products/GeneralBlockPanelKernel.h | 160 +++++++++++------- Eigen/src/Core/util/Macros.h | 8 +- 2 files changed, 108 insertions(+), 60 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 7b2ed6728..1b39642fb 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -760,31 +760,36 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } // process remaining peeled loop @@ -963,21 +977,26 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*2*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop @@ -1137,16 +1165,21 @@ void gebp_kernel for(Index k=0; k blB += pk*4*RhsProgress; blA += pk*1*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); } // process remaining peeled loop for(Index k=peeled_kc; k for(Index k=0; k blB += pk*RhsProgress; blA += pk*1*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } // process remaining peeled loop diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 687ba41dd..13f8fdd4e 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -160,6 +160,12 @@ #define EIGEN_ARCH_ARM64 0 #endif +#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 + #define EIGEN_ARCH_ARM_OR_ARM64 1 +#else + #define EIGEN_ARCH_ARM_OR_ARM64 0 +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -526,7 +532,7 @@ namespace Eigen { #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) - #if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) From 9f99f61e69a70e0a209d5f93e78a9257685cd70d Mon Sep 17 00:00:00 2001 
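A standalone sketch of the asm-comment idiom introduced above (macro and function names hypothetical): the comment emits no instructions, yet per the commit message GCC and Clang will not reorder code across it, which is what keeps the micro-kernels' live ranges short.

\code
#define MY_ASM_COMMENT(X) __asm__("#" X)   // same expansion the patch enables for ARM

void axpy(float* y, const float* x, float a, int n) {
  for (int i = 0; i < n; ++i) {
    MY_ASM_COMMENT("begin axpy body");     // scheduling fence, no instruction emitted
    y[i] += a * x[i];
    MY_ASM_COMMENT("end axpy body");
  }
}
\endcode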
From: Benoit Jacob Date: Fri, 30 Jan 2015 17:43:56 -0500 Subject: [PATCH 173/214] bug #936, patch 1/3: some cleanup and renaming for consistency. --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 ++-- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- .../Core/products/GeneralBlockPanelKernel.h | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fa02f57a1..27df5a025 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,8 +22,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f83f8db0e..5a6eb8c1d 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,8 +24,8 @@ namespace internal { #define EIGEN_HAS_FUSED_MADD 1 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_FUSED_CJMADD +#define EIGEN_HAS_FUSED_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1b39642fb..ae2fd9006 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,8 +120,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSE_CJMADD - #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#ifdef EIGEN_HAS_FUSED_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) @@ -146,8 +146,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) gebp_madd_selector::run(cj,a,b,c,t); } - #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic @@ -1402,13 +1402,13 @@ void gebp_kernel B_0 = blB[0]; B_1 = blB[1]; - MADD(cj,A0,B_0,C0, B_0); - MADD(cj,A0,B_1,C1, B_1); + CJMADD(cj,A0,B_0,C0, B_0); + CJMADD(cj,A0,B_1,C1, B_1); B_0 = blB[2]; B_1 = blB[3]; - MADD(cj,A0,B_0,C2, B_0); - MADD(cj,A0,B_1,C3, B_1); + CJMADD(cj,A0,B_0,C2, B_0); + CJMADD(cj,A0,B_1,C3, B_1); blB += 4; } @@ -1434,7 +1434,7 @@ void gebp_kernel { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + CJMADD(cj, A0, B_0, C0, B_0); } res[(j2+0)*resStride + i] += alpha*C0; } From 340b8afb14bb06788570ba22ba4ccba674402f09 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 31 Jan 2015 14:15:57 -0500 Subject: [PATCH 174/214] bug #936, patch 1.5/3: rename _FUSED_ macros to _SINGLE_INSTRUCTION_, because this is what they are about. "Fused" means "no intermediate rounding between the mul and the add, only one rounding at the end". Instead, what we are concerned about here is whether a temporary register is needed, i.e. whether the MUL and ADD are separate instructions. Concretely, on ARM NEON, a single-instruction mul-add is always available: VMLA. 
But a true fused mul-add is only available on VFPv4: VFMA. --- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++-- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 12 ++++++------ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index e2376bd1f..1d8c674a6 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 27df5a025..578b303a0 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -18,12 +18,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5a6eb8c1d..9cfb9c358 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -20,12 +20,12 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif -#ifndef EIGEN_HAS_FUSED_CJMADD -#define EIGEN_HAS_FUSED_CJMADD +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // FIXME NEON has 16 quad registers, but since the current register allocator diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 28427c308..202aaa72f 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -23,8 +23,8 @@ namespace internal { #endif #ifdef EIGEN_VECTORIZE_FMA -#ifndef EIGEN_HAS_FUSED_MADD -#define EIGEN_HAS_FUSED_MADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index ae2fd9006..b5f06d831 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -120,7 +120,7 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) computeProductBlockingSizes(k, m, n); } -#ifdef EIGEN_HAS_FUSED_CJMADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else @@ -182,7 +182,7 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && 
!defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -248,7 +248,7 @@ public: // let gcc allocate the register in which to store the result of the pmul // (in the case where there is no FMA) gcc fails to figure out how to avoid // spilling register. -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c = pmadd(a,b,c); #else @@ -290,7 +290,7 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, -#if defined(EIGEN_HAS_FUSED_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers mr = 3*LhsPacketSize, #else @@ -353,7 +353,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a.v,b,c.v); #else @@ -637,7 +637,7 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { -#ifdef EIGEN_HAS_FUSED_MADD +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); c.v = pmadd(a,b.v,c.v); #else From 0f216136980503c3792a90e382b4d6bbdbb870c0 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:44:26 -0500 Subject: [PATCH 175/214] bug #936, patch 2/3: Remove EIGEN_VECTORIZE_FMA, was redundant with EIGEN_HAS_SINGLE_INSTRUCTION_MADD --- Eigen/Core | 4 +--- Eigen/src/Core/arch/AVX/PacketMath.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 2 +- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index dcb20bfd0..b5af63623 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -125,9 +125,7 @@ #define EIGEN_VECTORIZE_SSE4_1 #define EIGEN_VECTORIZE_SSE4_2 #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif + // include files // This extern "C" works around a MINGW-w64 compilation issue diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 1d8c674a6..485bac10b 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -22,9 +22,9 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 578b303a0..6b68fc7a5 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -19,7 +19,7 @@ namespace internal { #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cfb9c358..71255ac85 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -21,7 +21,7 @@ namespace internal { #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif 
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 202aaa72f..3f6fb0254 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,7 +22,7 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif From 5ef95fabee5e9a9357c082cd32ae3b4affb2eff6 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 30 Jan 2015 17:45:03 -0500 Subject: [PATCH 176/214] bug #936, patch 3/3: Properly detect FMA support on ARM (requires VFPv4) and use it instead of MLA when available, because it's both more accurate, and faster. --- Eigen/src/Core/arch/NEON/PacketMath.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 71255ac85..9afd86bec 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co return pset1(0); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +#endif + +// No FMA instruction for int, so use MLA unconditionally. template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -551,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -// for some weird raisons, it has to be overloaded for packet of integers +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
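The accuracy claim above is easy to verify in isolation. A self-contained sketch, using std::fma as a portable stand-in for VFMA (build with FP contraction disabled, e.g. -ffp-contract=off, so the compiler does not fuse a*b + c on its own):

\code
#include <cmath>
#include <cstdio>

int main() {
  const double e = std::ldexp(1.0, -27);          // 2^-27
  const double a = 1.0 + e, b = 1.0 - e, c = -1.0;
  // a*b = 1 - 2^-54 exactly, which rounds to 1.0 in double: the two-rounding
  // path loses the low-order term, the fused path keeps it.
  std::printf("fused: %g\n", std::fma(a, b, c)); // -5.55112e-17 (= -2^-54)
  std::printf("split: %g\n", a * b + c);         // 0
  return 0;
}
\endcode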
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } From 590f4b0aa3583c98fe9a0682e26c24ebfaffeaa6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 30 Jan 2015 19:46:30 -0800 Subject: [PATCH 177/214] Silenced some compilation warnings --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 22 +++++++++---------- .../CXX11/src/Tensor/TensorInitializer.h | 12 ---------- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 7ff47673d..c94ed977e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -124,18 +124,18 @@ struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } template - static constexpr bool values_up_to_known_statically(const std::tuple& t) { + static constexpr bool values_up_to_known_statically(const std::tuple&) { return is_compile_time_constant >::type>::value; } template - static constexpr bool values_up_to_statically_known_to_increase(const std::tuple& t) { + static constexpr bool values_up_to_statically_known_to_increase(const std::tuple&) { return true; } }; @@ -271,7 +271,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -279,7 +279,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -294,7 +294,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -302,7 +302,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -318,7 +318,7 @@ template struct index_statically_gt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] > value; + (IndexList()[i] > value); } }; @@ -326,7 +326,7 @@ template struct index_statically_gt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] > value; + (IndexList()[i] > value); } }; @@ -341,7 +341,7 @@ template struct index_statically_lt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return 
IndexList().value_known_statically(i) & - IndexList()[i] < value; + (IndexList()[i] < value); } }; @@ -349,7 +349,7 @@ template struct index_statically_lt > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] < value; + (IndexList()[i] < value); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 6afef0fbb..4303e3536 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -55,18 +55,6 @@ struct Initializer { } }; -template -struct Initializer { - typedef std::initializer_list::Scalar> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - // Static initialization not implemented for VarDims tensors. - eigen_assert(false); - } -}; - template void initialize_tensor(TensorEvaluator& tensor, const typename Initializer::NumDimensions>::InitList& vals) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index c6a8ecb5d..83ba1df71 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -129,7 +129,7 @@ struct InnerMostDimReducer { template struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); } }; From f64045a060ae22c6445b78ecea3783cef7c1ca3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 30 Jan 2015 19:52:01 -0800 Subject: [PATCH 178/214] Silenced a few more compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index e125ca799..0e8a4b8d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -369,7 +369,7 @@ class Tensor : public TensorBase > void resize(const DSizes& dimensions) { array dims; - for (int i = 0; i < NumIndices; ++i) { + for (std::size_t i = 0; i < NumIndices; ++i) { dims[i] = dimensions[i]; } resize(dims); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 6c9a67c58..d81197e6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -93,7 +93,7 @@ struct Sizes : internal::numeric_list { // todo: add assertion } #ifdef EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex... indices) { } + template Sizes(DenseIndex...) 
{ } explicit Sizes(std::initializer_list /*l*/) { // todo: add assertion } @@ -333,7 +333,7 @@ static const size_t value = Sizes::count; template struct array_size > { static const size_t value = Sizes::count; }; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { return get >::value; } #else From ebdf6a2dbbc66d0f6fb045a3c5b0023bc2e89851 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 22:32:34 +0100 Subject: [PATCH 179/214] SPQR: fix default threshold value --- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 51 +++++++++++++++----- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 44f6a1acb..54a1b21b8 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase > typedef Map > PermutationType; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase > if(m_isInitialized) SPQR_free(); MatrixType mat(matrix); + + /* Compute the default threshold as in MatLab, see: + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 + */ + RealScalar pivotThreshold = m_tolerance; + if(m_useDefaultThreshold) + { + RealScalar max2Norm = 0.0; + for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm()); + if(max2Norm==RealScalar(0)) + max2Norm = RealScalar(1); + pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits::epsilon(); + } + cholmod_sparse A; A = viewAsCholmod(mat); Index col = matrix.cols(); - m_rank = SuiteSparseQR(m_ordering, m_tolerance, col, &A, + m_rank = SuiteSparseQR(m_ordering, pivotThreshold, col, &A, &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc); if (!m_cR) @@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase > /** * Get the number of rows of the input matrix and the Q matrix */ - inline Index rows() const {return m_H->nrow; } + inline Index rows() const {return m_cR->nrow; } /** * Get the number of columns of the input matrix. 
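The default computed above follows the SuiteSparseQR/MATLAB rule, pivotThreshold = 20 * (rows + cols) * max-column-2-norm * epsilon. Users who need an explicit tolerance instead go through setPivotThreshold(), which a later hunk below updates to switch the default off. A hypothetical usage sketch (requires SuiteSparseQR to be installed; matrix setup assumed):

\code
#include <Eigen/SPQRSupport>

Eigen::SparseMatrix<double> A;                 // assumed filled
Eigen::SPQR<Eigen::SparseMatrix<double> > qr;
qr.setPivotThreshold(1e-12);                   // columns with 2-norm <= 1e-12 treated as zero
qr.compute(A);                                 // bypasses the default-threshold computation
\endcode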
@@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase > { eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()"); eigen_assert(b.cols()==1 && "This method is for vectors only"); - + //Compute Q^T * b - typename Dest::PlainObject y; + typename Dest::PlainObject y, y2; y = matrixQ().transpose() * b; - // Solves with the triangular matrix R + + // Solves with the triangular matrix R Index rk = this->rank(); - y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y.topRows(rk)); - y.bottomRows(cols()-rk).setZero(); + y2 = y; + y.resize((std::max)(cols(),Index(y.rows())),y.cols()); + y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView().solve(y2.topRows(rk)); + // Apply the column permutation - dest.topRows(cols()) = colsPermutation() * y.topRows(cols()); + // colsPermutation() performs a copy of the permutation, + // so let's apply it manually: + for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i); + for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero(); + +// y.bottomRows(y.rows()-rk).setZero(); +// dest = colsPermutation() * y.topRows(cols()); m_info = Success; } @@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase > /// Set the fill-reducing ordering method to be used void setSPQROrdering(int ord) { m_ordering = ord;} /// Set the tolerance tol to treat columns with 2-norm < =tol as zero - void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; } + void setPivotThreshold(const RealScalar& tol) + { + m_useDefaultThreshold = false; + m_tolerance = tol; + } /** \returns a pointer to the SPQR workspace */ cholmod_common *cholmodCommon() const { return &m_cc; } @@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase > mutable cholmod_dense *m_HTau; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters + bool m_useDefaultThreshold; // Use default threshold template friend struct SPQR_QProduct; }; From b1eca55328436f9778254c6bbc8852910a0d3d2a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 3 Feb 2015 23:46:05 +0100 Subject: [PATCH 180/214] Use Ref<> to ensure that both x and b in Ax=b are compatible with Umfpack/SuperLU expectations --- Eigen/src/SuperLUSupport/SuperLUSupport.h | 23 +++++++++++++++++++---- Eigen/src/UmfPackSupport/UmfPackSupport.h | 13 ++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index ef73587a7..6de5b3dc5 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -627,8 +627,12 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase m_sluFerr.resize(rhsCols); m_sluBerr.resize(rhsCols); - m_sluB = SluMatrix::Map(b.const_cast_derived()); - m_sluX = SluMatrix::Map(x.derived()); + + Ref > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -651,6 +655,10 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar()); StatFree(&m_sluStat); + + if(&x.coeffRef(0) != x_ref.data()) + x = x_ref; + m_info = info==0 ? 
Success : NumericalIssue; } @@ -938,8 +946,12 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase > b_ref(b); + Ref > x_ref(x); + + m_sluB = SluMatrix::Map(b_ref.const_cast_derived()); + m_sluX = SluMatrix::Map(x_ref.const_cast_derived()); typename Rhs::PlainObject b_cpy; if(m_sluEqued!='N') @@ -962,6 +974,9 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); int errorCode; + Scalar* x_ptr = 0; + Matrix x_tmp; + if(x.innerStride()!=1) + { + x_tmp.resize(x.rows()); + x_ptr = x_tmp.data(); + } for (int j=0; j Date: Wed, 4 Feb 2015 18:37:51 +0000 Subject: [PATCH 181/214] Using numext::pow instead of std::pow in poly_eval function. --- unsupported/Eigen/src/Polynomials/PolynomialUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h index 2bb8bc84a..40ba65b7e 100644 --- a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h +++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h @@ -56,7 +56,7 @@ T poly_eval( const Polynomials& poly, const T& x ) for( DenseIndex i=1; i Date: Fri, 6 Feb 2015 02:51:59 -0800 Subject: [PATCH 182/214] Added the EIGEN_HAS_CONSTEXPR define Gate the tensor index list code based on the value of EIGEN_HAS_CONSTEXPR --- Eigen/src/Core/util/Macros.h | 6 ++++++ unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- unsupported/test/cxx11_tensor_index_list.cpp | 4 ++++ unsupported/test/cxx11_tensor_reduction.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 001907a0b..40a28d4d6 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -133,6 +133,12 @@ #define EIGEN_HAS_VARIADIC_TEMPLATES 1 #endif +// Does the compiler support const expressions? +#if (defined(__plusplus) && __cplusplus >= 201402L) || \ + EIGEN_GNUC_AT_LEAST(4,9) +#define EIGEN_HAS_CONSTEXPR 1 +#endif + /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. 
* They currently include: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index c94ed977e..eed0a9f05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 83ba1df71..21416afe0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -53,7 +53,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if __cplusplus > 199711L +#ifdef EIGEN_HAS_CONSTEXPR template struct are_inner_most_dims{ static const bool value = indices_statically_known_to_increase()() && diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index d79a3ed45..c4d4f244f 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -11,6 +11,7 @@ #include +#ifdef EIGEN_HAS_CONSTEXPR static void test_static_index_list() { @@ -254,11 +255,14 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } +#endif void test_cxx11_tensor_index_list() { +#ifdef EIGEN_HAS_CONSTEXPR CALL_SUBTEST(test_static_index_list()); CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); +#endif } diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index 5c3184833..0269853a9 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -284,7 +284,7 @@ static void test_static_dims() { Tensor out(72, 97); in.setRandom(); -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -314,7 +314,7 @@ static void test_innermost_last_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 0; reduction_axis[1] = 1; @@ -345,7 +345,7 @@ static void test_innermost_first_dims() { in.setRandom(); // Reduce on the innermost dimensions. -#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 2; reduction_axis[1] = 3; @@ -376,7 +376,7 @@ static void test_reduce_middle_dims() { in.setRandom(); // Reduce on the innermost dimensions. 
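The hunk continuing below swaps the last of the tests' version gates over to the new macro. Assembled in one place, the gated pattern the tests rely on looks like this (a sketch only; scalar type, layout, and shapes taken from the new test):

\code
Eigen::Tensor<float, 4, Eigen::RowMajor> in(72, 53, 97, 113);
Eigen::Tensor<float, 2, Eigen::RowMajor> out(72, 113);
in.setRandom();
#ifdef EIGEN_HAS_CONSTEXPR
  // Compile-time axes: enables the static-dimension fast paths.
  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > reduction_axis;
#else
  // Runtime axes for compilers without constexpr support.
  Eigen::array<int, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 2;
#endif
out = in.maximum(reduction_axis);
\endcode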
-#if __cplusplus <= 199711L +#ifndef EIGEN_HAS_CONSTEXPR array reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 2; From 2559fa9b0f20ea138cfb019d441ad1757221568d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 6 Feb 2015 02:55:18 -0800 Subject: [PATCH 183/214] Fixed compilation error in the tensor broadcasting test --- unsupported/test/cxx11_tensor_broadcasting.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index f0792bdcf..2ddf47234 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -114,7 +114,15 @@ static void test_static_broadcasting() { Tensor tensor(8,3,5); tensor.setRandom(); + +#ifdef EIGEN_HAS_CONSTEXPR Eigen::IndexList, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; +#else + Eigen::array broadcasts; + broadcasts[0] = 2; + broadcasts[1] = 3; + broadcasts[2] = 4; +#endif Tensor broadcast; broadcast = tensor.broadcast(broadcasts); From 668518aed69c3d20efb480acd5944a79df7e5410 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:25:41 +0100 Subject: [PATCH 184/214] Fix uninitialized entries and comparison of very small numbers --- unsupported/test/cxx11_tensor_contraction.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index 6124818fd..2bcae90b8 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -389,7 +389,7 @@ static void test_matrix_vector() m_result = m_left * m_right; for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); } } @@ -399,6 +399,10 @@ static void test_tensor_vector() { Tensor t_left(7, 13, 17); Tensor t_right(1, 7); + + t_left.setRandom(); + t_right.setRandom(); + typedef typename Tensor::DimensionPair DimensionPair; Eigen::array dim_pair01{{{0, 1}}}; Tensor t_result = t_left.contract(t_right, dim_pair01); @@ -409,7 +413,7 @@ static void test_tensor_vector() Eigen::Matrix m_result = m_left.transpose() * m_right.transpose(); for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) { - VERIFY_IS_APPROX(t_result(i), m_result(i, 0)); + VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1)); } } From c03c73c9b7032f984bcd6c52d9ca3a430ce19c69 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:12 +0100 Subject: [PATCH 185/214] Fix clang compilation --- unsupported/test/cxx11_tensor_thread_pool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e25912279..f49523683 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -15,6 +15,7 @@ #include using Eigen::Tensor; +using std::isnan; static void test_multithread_elementwise() { From 74e460b9950503ef5a306337a136e1d37795deae Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Feb 2015 14:26:24 +0100 Subject: [PATCH 186/214] Fix symmetric product --- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 21f8175d2..860e233b9 100644 ---
a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -374,7 +374,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Date: Fri, 6 Feb 2015 06:00:59 -0800 Subject: [PATCH 187/214] Fixed the cxx11_meta test --- unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h index 36d91e780..3a08628be 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Meta.h @@ -42,14 +42,14 @@ struct numeric_list { constexpr static std::size_t count = sizeof.. * typename gen_numeric_list_repeated::type numeric_list */ -template struct gen_numeric_list : gen_numeric_list {}; -template struct gen_numeric_list { typedef numeric_list type; }; +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; -template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; -template struct gen_numeric_list_reversed { typedef numeric_list type; }; +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; -template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; -template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; template struct gen_numeric_list_repeated { typedef numeric_list type; }; @@ -112,7 +112,7 @@ template struct get<0, type_lis template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; template struct get> : get> {}; -template struct get<0, numeric_list> { constexpr static T value = a; }; +template struct get<0, numeric_list> { constexpr static int value = a; }; template struct get> { static_assert((n - n) < 0, "meta-template get: The element to extract from a list must be smaller than the size of the list."); }; /* always get type, regardless of dummy; good for parameter pack expansion */ From 7838fda82cfc49b40e8d3a615bb05711815b50e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:00:46 +0100 Subject: [PATCH 188/214] Add a SparseCompressedBase class providing (un)compressed accessors (like data()/*Stride() for dense matrices), and a CompressedAccessBit flag (similar to DirectAccessBit for dense matrices). 
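Since the new base class is about exposing the compressed layout directly, here is a short sketch of what the CompressedAccessBit contract provides for a column-major matrix (index types simplified to int; A assumed filled and makeCompressed()'d):

\code
#include <cstdio>
#include <Eigen/SparseCore>

void dump(const Eigen::SparseMatrix<double>& A) {
  const double* vals  = A.valuePtr();        // stored coefficients
  const int*    inner = A.innerIndexPtr();   // row index of each stored coefficient
  const int*    outer = A.outerIndexPtr();   // start of each column in vals/inner
  for (int j = 0; j < A.outerSize(); ++j)
    for (int p = outer[j]; p < outer[j + 1]; ++p)
      std::printf("(%d, %d) = %g\n", inner[p], j, vals[p]);
}
\endcode

The canonical InnerIterator loop keeps working unchanged, since the iterators move into this base class as well.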
--- Eigen/SparseCore | 1 + Eigen/src/Core/util/Constants.h | 13 ++ Eigen/src/SparseCore/SparseCompressedBase.h | 199 ++++++++++++++++++++ Eigen/src/SparseCore/SparseMatrix.h | 131 ++----------- Eigen/src/SparseCore/SparseUtil.h | 6 +- 5 files changed, 231 insertions(+), 119 deletions(-) create mode 100644 Eigen/src/SparseCore/SparseCompressedBase.h diff --git a/Eigen/SparseCore b/Eigen/SparseCore index d5c0f6271..e2071519b 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -31,6 +31,7 @@ #include "src/SparseCore/SparseAssign.h" #include "src/SparseCore/CompressedStorage.h" #include "src/SparseCore/AmbiVector.h" +#include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 9b40093f0..d1855b50b 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100; * \sa \ref RowMajorBit, \ref TopicStorageOrders */ const unsigned int NoPreferredStorageOrderBit = 0x200; +/** \ingroup flags + * + * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format, + * that is, the expression provides: + * \code + inline const Scalar* valuePtr() const; + inline const Index* innerIndexPtr() const; + inline const Index* outerIndexPtr() const; + inline const Index* innerNonZeroPtr() const; + \endcode + */ +const unsigned int CompressedAccessBit = 0x400; + // list of flags that are inherited by default const unsigned int HereditaryBits = RowMajorBit diff --git a/Eigen/src/SparseCore/SparseCompressedBase.h b/Eigen/src/SparseCore/SparseCompressedBase.h new file mode 100644 index 000000000..00658181e --- /dev/null +++ b/Eigen/src/SparseCore/SparseCompressedBase.h @@ -0,0 +1,199 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H +#define EIGEN_SPARSE_COMPRESSED_BASE_H + +namespace Eigen { + +template class SparseCompressedBase; + +namespace internal { + +template +struct traits > : traits +{}; + +} // end namespace internal + +template +class SparseCompressedBase + : public SparseMatrixBase +{ + public: + typedef SparseMatrixBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase) + using Base::operator=; + using Base::IsRowMajor; + + class InnerIterator; + class ReverseInnerIterator; + + /** \returns a const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline const Scalar* valuePtr() const { return derived().valuePtr(); } + /** \returns a non-const pointer to the array of values. + * This function is aimed at interoperability with other libraries. + * \sa innerIndexPtr(), outerIndexPtr() */ + inline Scalar* valuePtr() { return derived().valuePtr(); } + + /** \returns a const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. 
+ * \sa valuePtr(), outerIndexPtr() */ + inline const Index* innerIndexPtr() const { return derived().innerIndexPtr(); } + /** \returns a non-const pointer to the array of inner indices. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), outerIndexPtr() */ + inline Index* innerIndexPtr() { return derived().innerIndexPtr(); } + + /** \returns a const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline const Index* outerIndexPtr() const { return derived().outerIndexPtr(); } + /** \returns a non-const pointer to the array of the starting positions of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \sa valuePtr(), innerIndexPtr() */ + inline Index* outerIndexPtr() { return derived().outerIndexPtr(); } + + /** \returns a const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline const Index* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); } + /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors. + * This function is aimed at interoperability with other libraries. + * \warning it returns the null pointer 0 in compressed mode */ + inline Index* innerNonZeroPtr() { return derived().innerNonZeroPtr(); } + + /** \returns whether \c *this is in compressed form. */ + inline bool isCompressed() const { return innerNonZeroPtr()==0; } + +}; + +template +class SparseCompressedBase::InnerIterator +{ + public: + InnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_end = mat.outerIndexPtr()[outer+1]; + else + m_end = m_id + mat.innerNonZeroPtr()[outer]; + } + + inline InnerIterator& operator++() { m_id++; return *this; } + + inline const Scalar& value() const { return m_values[m_id]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id]); } + + inline Index index() const { return m_indices[m_id]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? 
index() : m_outer; } + + inline operator bool() const { return (m_id < m_end); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + Index m_end; + private: + // If you get here, then you're not using the right InnerIterator type, e.g.: + // SparseMatrix A; + // SparseMatrix::InnerIterator it(A,0); + template InnerIterator(const SparseMatrixBase&,Index outer); +}; + +template +class SparseCompressedBase::ReverseInnerIterator +{ + public: + ReverseInnerIterator(const SparseCompressedBase& mat, Index outer) + : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer]) + { + if(mat.isCompressed()) + m_id = mat.outerIndexPtr()[outer+1]; + else + m_id = m_start + mat.innerNonZeroPtr()[outer]; + } + + inline ReverseInnerIterator& operator--() { --m_id; return *this; } + + inline const Scalar& value() const { return m_values[m_id-1]; } + inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } + + inline Index index() const { return m_indices[m_id-1]; } + inline Index outer() const { return m_outer; } + inline Index row() const { return IsRowMajor ? m_outer : index(); } + inline Index col() const { return IsRowMajor ? index() : m_outer; } + + inline operator bool() const { return (m_id > m_start); } + + protected: + const Scalar* m_values; + const Index* m_indices; + const Index m_outer; + Index m_id; + const Index m_start; +}; + +namespace internal { + +template +struct evaluator > + : evaluator_base +{ + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Index Index; + typedef typename Derived::InnerIterator InnerIterator; + typedef typename Derived::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits::ReadCost, + Flags = Derived::Flags + }; + + evaluator() : m_matrix(0) {} + explicit evaluator(const Derived &mat) : m_matrix(&mat) {} + + operator Derived&() { return m_matrix->const_cast_derived(); } + operator const Derived&() const { return *m_matrix; } + + typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; + Scalar coeff(Index row, Index col) const + { return m_matrix->coeff(row,col); } + + Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) + - m_matrix->innerIndexPtr(); + eigen_assert((pinnerIndexPtr()[p]==inner) && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + + const Derived *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_COMPRESSED_BASE_H diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 93677c786..74b4c6a9d 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -51,7 +51,7 @@ struct traits > ColsAtCompileTime = Dynamic, MaxRowsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic, - Flags = _Options | NestByRefBit | LvalueBit, + Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit, SupportedAccessPatterns = InnerRandomAccessPattern }; }; @@ -90,16 +90,20 @@ struct traits, DiagIndex> template class SparseMatrix - : public SparseMatrixBase > + : public SparseCompressedBase > { public: - EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) + typedef SparseCompressedBase Base; + using Base::isCompressed; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=) typedef MappedSparseMatrix Map; typedef Diagonal DiagonalReturnType; typedef Diagonal ConstDiagonalReturnType; + typedef typename Base::InnerIterator InnerIterator; + typedef typename Base::ReverseInnerIterator ReverseInnerIterator; using Base::IsRowMajor; @@ -123,9 +127,6 @@ class SparseMatrix public: - /** \returns whether \c *this is in compressed form. */ - inline bool isCompressed() const { return m_innerNonZeros==0; } - /** \returns the number of rows of the matrix */ inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } /** \returns the number of columns of the matrix */ @@ -241,9 +242,6 @@ class SparseMatrix public: - class InnerIterator; - class ReverseInnerIterator; - /** Removes all non zeros but keep allocated memory */ inline void setZero() { @@ -875,76 +873,6 @@ private: }; }; -template -class SparseMatrix::InnerIterator -{ - public: - InnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_end = mat.m_outerIndex[outer+1]; - else - m_end = m_id + mat.m_innerNonZeros[outer]; - } - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline const Scalar& value() const { return m_values[m_id]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id]); } - - inline Index index() const { return m_indices[m_id]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - Index m_end; - private: - // If you get here, then you're not using the right InnerIterator type, e.g.: - // SparseMatrix A; - // SparseMatrix::InnerIterator it(A,0); - template InnerIterator(const SparseMatrixBase&,Index outer); -}; - -template -class SparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const SparseMatrix& mat, Index outer) - : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer]) - { - if(mat.isCompressed()) - m_id = mat.m_outerIndex[outer+1]; - else - m_id = m_start + mat.m_innerNonZeros[outer]; - } - - inline ReverseInnerIterator& operator--() { --m_id; return *this; } - - inline const Scalar& value() const { return m_values[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } - - inline Index index() const { return m_indices[m_id-1]; } - inline Index outer() const { return m_outer; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id > m_start); } - - protected: - const Scalar* m_values; - const Index* m_indices; - const Index m_outer; - Index m_id; - const Index m_start; -}; namespace internal { @@ -1075,6 +1003,10 @@ EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator::Flags & RowMajorBit); if (needToTranspose) { @@ -1277,45 +1209,12 @@ namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef _Scalar Scalar; - typedef _Index Index; + typedef evaluator > > Base; typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType; - typedef typename SparseMatrixType::InnerIterator InnerIterator; - typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; - - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = SparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} - - operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const SparseMatrixType&() const { return *m_matrix; } - - typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - Scalar coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - - Scalar& coeffRef(Index row, Index col) - { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = SparseMatrixType::IsRowMajor ? row : col; - const Index inner = SparseMatrixType::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner); - eigen_assert((pdata().index(p)==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().data().value(p); - } - - const SparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const SparseMatrixType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseUtil.h b/Eigen/src/SparseCore/SparseUtil.h index 8de227b88..ba4803646 100644 --- a/Eigen/src/SparseCore/SparseUtil.h +++ b/Eigen/src/SparseCore/SparseUtil.h @@ -43,8 +43,7 @@ EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) -#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \ - typedef BaseClass Base; \ +#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ typedef typename Eigen::internal::traits::Scalar Scalar; \ typedef typename Eigen::NumTraits::Real RealScalar; \ typedef typename Eigen::internal::nested::type Nested; \ @@ -59,7 +58,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) using Base::const_cast_derived; #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \ - _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase) + typedef Eigen::SparseMatrixBase Base; \ + _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) const int CoherentAccessPattern = 0x1; const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern; From 08081f829397de11651d8d779b9eed916ccc88d7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:02:14 +0100 Subject: [PATCH 189/214] Make SparseTranspose inherit SparseCompressBase when possible --- Eigen/src/SparseCore/SparseTranspose.h | 34 +++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SparseCore/SparseTranspose.h b/Eigen/src/SparseCore/SparseTranspose.h index c3d2d1a16..37ce7b0d5 100644 --- a/Eigen/src/SparseCore/SparseTranspose.h +++ b/Eigen/src/SparseCore/SparseTranspose.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2014 Gael Guennebaud +// Copyright (C) 2008-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -12,13 +12,41 @@ namespace Eigen { +namespace internal { + template + class SparseTransposeImpl + : public SparseMatrixBase > + {}; + + template + class SparseTransposeImpl + : public SparseCompressedBase > + { + typedef SparseCompressedBase > Base; + public: + using Base::derived; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + + inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); } + inline const Index* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); } + inline const Index* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); } + inline const Index* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); } + + inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); } + inline Index* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); } + inline Index* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); } + inline Index* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); } + }; +} + // Implement nonZeros() for transpose. I'm not sure that's the best approach for that. // Perhaps it should be implemented in Transpose<> itself. template class TransposeImpl - : public SparseMatrixBase > + : public internal::SparseTransposeImpl { protected: - typedef SparseMatrixBase > Base; + typedef internal::SparseTransposeImpl Base; public: inline typename MatrixType::Index nonZeros() const { return Base::derived().nestedExpression().nonZeros(); } }; From f3be317614b6b6709737d6ef7bea0ffe96c285d4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:03:25 +0100 Subject: [PATCH 190/214] Add a Map specialization. --- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseMap.h | 211 +++++++++++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseMap.h diff --git a/Eigen/SparseCore b/Eigen/SparseCore index e2071519b..91cdfd3f0 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -33,6 +33,7 @@ #include "src/SparseCore/AmbiVector.h" #include "src/SparseCore/SparseCompressedBase.h" #include "src/SparseCore/SparseMatrix.h" +#include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h new file mode 100644 index 000000000..91e8f7480 --- /dev/null +++ b/Eigen/src/SparseCore/SparseMap.h @@ -0,0 +1,211 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_MAP_H +#define EIGEN_SPARSE_MAP_H + +namespace Eigen { + +template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors +> class SparseMapBase; + +template +class SparseMapBase + : public SparseCompressedBase +{ + public: + typedef SparseCompressedBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + protected: + + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, const Scalar *>::type ScalarPointer; + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Index *, const Index *>::type IndexPointer; + + Index m_outerSize; + Index m_innerSize; + Index m_nnz; + IndexPointer m_outerIndex; + IndexPointer m_innerIndices; + ScalarPointer m_values; + IndexPointer m_innerNonZeros; + + public: + + inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } + inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return m_outerSize; } + + bool isCompressed() const { return m_innerNonZeros==0; } + + //---------------------------------------- + // direct access interface + inline const Scalar* valuePtr() const { return m_values; } + inline const Index* innerIndexPtr() const { return m_innerIndices; } + inline const Index* outerIndexPtr() const { return m_outerIndex; } + inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; } + //---------------------------------------- + + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = m_outerIndex[outer]; + Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer]; + if (start==end) + return Scalar(0); + else if (end>0 && inner==m_innerIndices[end-1]) + return m_values[end-1]; + // ^^ optimization: let's first check if it is the last coefficient + // (very common in high level algorithms) + + const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); + const Index id = r-&m_innerIndices[0]; + return ((*r==inner) && (id +class SparseMapBase + : public SparseMapBase +{ + typedef MapBase ReadOnlyMapBase; + + public: + typedef SparseMapBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::Index Index; + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + //---------------------------------------- + // direct access interface + using Base::valuePtr; + using Base::innerIndexPtr; + using Base::outerIndexPtr; + using Base::innerNonZeroPtr; + inline Scalar* valuePtr() { return Base::m_values; } + inline Index* innerIndexPtr() { return Base::m_innerIndices; } + inline Index* outerIndexPtr() { return Base::m_outerIndex; } + inline Index* innerNonZeroPtr() { return Base::m_innerNonZeros; } + //---------------------------------------- + + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + Index start = Base::m_outerIndex[outer]; + Index end = Base::isCompressed() ? 
Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer]; + eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix"); + eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); + Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); + const Index id = r - &Base::m_innerIndices[0]; + eigen_assert((*r==inner) && (id(Base::m_values)[id]; + } + + inline SparseMapBase(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, + Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~SparseMapBase() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, Index* outerIndexPtr, + Index* innerIndexPtr, Scalar* valuePtr, Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +template +class Map, Options, StrideType> + : public SparseMapBase, Options, StrideType> > +{ + public: + typedef SparseMapBase > Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) + enum { IsRowMajor = Base::IsRowMajor }; + + public: + + inline Map(Index rows, Index cols, Index nnz, const Index* outerIndexPtr, + const Index* innerIndexPtr, const Scalar* valuePtr, const Index* innerNonZerosPtr = 0) + : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) + {} + + /** Empty destructor */ + inline ~Map() {} +}; + +namespace internal { + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +template +struct evaluator, Options, StrideType> > + : evaluator, Options, StrideType> > > +{ + typedef evaluator, Options, StrideType> > > Base; + typedef Map, Options, StrideType> XprType; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MAP_H From f2ff8c091e02b4aab8c7568807c09e43f51d1156 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 7 Feb 2015 22:04:18 +0100 Subject: [PATCH 191/214] Add a Ref specialization. 
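
The point of this specialization is to let a function accept any object whose
storage matches the compressed layout of a SparseMatrix (a plain SparseMatrix,
a Map, a conforming block, ...) without evaluating it into a temporary. As a
minimal usage sketch, assuming only what this patch series provides (the
function name and scalar type below are illustrative, not part of the patch):

    #include <Eigen/Sparse>
    using namespace Eigen;

    // Sums the stored coefficients of any column-major compressed
    // double-precision sparse expression, without copying it.
    double sumStored(Ref<const SparseMatrix<double> > m)
    {
      double s = 0;
      for (int k = 0; k < m.outerSize(); ++k)
        for (Ref<const SparseMatrix<double> >::InnerIterator it(m, k); it; ++it)
          s += it.value();
      return s;
    }

A SparseMatrix<double> or a Map<SparseMatrix<double> > binds directly, while an
expression with a non-matching storage order or layout is first evaluated into
the internal m_object member, as implemented by the construct() overloads below.
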
--- Eigen/SparseCore | 1 + Eigen/src/SparseCore/SparseRef.h | 200 +++++++++++++++++++++++++++++++ test/CMakeLists.txt | 1 + test/sparse_ref.cpp | 96 +++++++++++++++ 4 files changed, 298 insertions(+) create mode 100644 Eigen/src/SparseCore/SparseRef.h create mode 100644 test/sparse_ref.cpp diff --git a/Eigen/SparseCore b/Eigen/SparseCore index 91cdfd3f0..48ed967b8 100644 --- a/Eigen/SparseCore +++ b/Eigen/SparseCore @@ -36,6 +36,7 @@ #include "src/SparseCore/SparseMap.h" #include "src/SparseCore/MappedSparseMatrix.h" #include "src/SparseCore/SparseVector.h" +#include "src/SparseCore/SparseRef.h" #include "src/SparseCore/SparseCwiseUnaryOp.h" #include "src/SparseCore/SparseCwiseBinaryOp.h" #include "src/SparseCore/SparseTranspose.h" diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h new file mode 100644 index 000000000..cfea6ce8a --- /dev/null +++ b/Eigen/src/SparseCore/SparseRef.h @@ -0,0 +1,200 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_REF_H +#define EIGEN_SPARSE_REF_H + +namespace Eigen { + +namespace internal { + +template class SparseRefBase; + +template +struct traits, _Options, _StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + enum { + Options = _Options, + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + }; + + template struct match { + enum { + StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch + }; + typedef typename internal::conditional::type type; + }; + +}; + +template +struct traits, _Options, _StrideType> > + : public traits, _Options, _StrideType> > +{ + enum { + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + }; +}; + +template +struct traits > : public traits {}; + +template class SparseRefBase +// : public MappedSparseMatrix + : public SparseMapBase +{ +// typedef typename internal::traits::PlainObjectType PlainObjectType; + +public: + + typedef SparseMapBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase) + + SparseRefBase() + : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) + {} + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) + +protected: + + + template + void construct(Expression& expr) + { + ::new (static_cast(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr()); + } +}; + +} // namespace internal + +template +class Ref, Options, StrideType > + : public internal::SparseRefBase, Options, StrideType > > +{ + typedef SparseMatrix PlainObjectType; + typedef internal::traits Traits; + template + inline Ref(const SparseMatrix& expr); + template + inline Ref(const MappedSparseMatrix& expr); + public: + + typedef internal::SparseRefBase Base; + _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref) + + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + inline Ref(SparseMatrix& expr) + { + EIGEN_STATIC_ASSERT(bool(Traits::template match >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + 
Base::construct(expr.derived());
+    }
+
+    template<int OtherOptions>
+    inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<MappedSparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
+    }
+
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.const_cast_derived());
+    }
+
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
+
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    _EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::true_type)
+    {
+      Base::construct(expr);
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      m_object = expr;
+      Base::construct(m_object);
+    }
+
+  protected:
+    TPlainObjectType m_object;
+};
+
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_REF_H
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f57d8ce36..168749634 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -228,6 +228,7 @@ ei_add_test(stddeque)
 ei_add_test(sparse_basic)
 ei_add_test(sparse_vector)
 ei_add_test(sparse_product)
+ei_add_test(sparse_ref)
 ei_add_test(sparse_solvers)
 ei_add_test(sparse_permutations)
 ei_add_test(simplicial_cholesky)
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
new file mode 100644
index 000000000..b261cecf3
--- /dev/null
+++ b/test/sparse_ref.cpp
@@ -0,0 +1,96 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This unit test cannot be easily written to work with EIGEN_DEFAULT_TO_ROW_MAJOR
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#undef EIGEN_DEFAULT_TO_ROW_MAJOR
+#endif
+
+static long int nb_temporaries;
+
+inline void on_temporary_creation() {
+  // here's a great place to set a breakpoint when debugging failures in this test!
+ nb_temporaries++; +} + +#define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN { on_temporary_creation(); } + +#include "main.h" +#include + +#define VERIFY_EVALUATION_COUNT(XPR,N) {\ + nb_temporaries = 0; \ + XPR; \ + if(nb_temporaries!=N) std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; \ + VERIFY( (#XPR) && nb_temporaries==N ); \ + } + +template void check_const_correctness(const PlainObjectType&) +{ + // verify that ref-to-const don't have LvalueBit + typedef typename internal::add_const::type ConstPlainObjectType; + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(internal::traits >::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); + VERIFY( !(Ref::Flags & LvalueBit) ); +} + +template +EIGEN_DONT_INLINE void call_ref_1(Ref > a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +template +EIGEN_DONT_INLINE void call_ref_2(const Ref >& a, const B &b) { VERIFY_IS_EQUAL(a.toDense(),b.toDense()); } + +void call_ref() +{ +// SparseVector > ca = VectorXcf::Random(10).sparseView(); +// SparseVector a = VectorXf::Random(10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + const SparseMatrix& Ac(A); + Block > Ab(A,0,1, 3,3); + const Block > Abc(A,0,1,3,3); + + + VERIFY_EVALUATION_COUNT( call_ref_1(A, A), 0); +// VERIFY_EVALUATION_COUNT( call_ref_1(Ac, Ac), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(A, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A.transpose(), A.transpose()), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Ac,Ac), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A+A,2*Ac), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B, B), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(B.transpose(), B.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); + + Ref > Ar(A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + + Ref > Br(B); + VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Br, Br), 1); + VERIFY_EVALUATION_COUNT( call_ref_2(Br.transpose(), Br.transpose()), 0); + + Ref > Arc(A); +// VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose + VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + + VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) +} + +void test_sparse_ref() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_1( check_const_correctness(SparseMatrix()) ); + CALL_SUBTEST_2( call_ref() ); + } +} From 3af29caae87b00ed8f002605573d227ad1b629a4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 10:23:45 +0100 Subject: [PATCH 192/214] Cleaning and add more unit tests for Ref and Map --- Eigen/src/SparseCore/MappedSparseMatrix.h | 172 ++-------------------- Eigen/src/SparseCore/SparseMap.h | 34 ++++- Eigen/src/SparseCore/SparseRef.h | 12 +- test/sparse_basic.cpp | 20 +++ test/sparse_ref.cpp | 5 +- 5 files changed, 72 insertions(+), 171 deletions(-) diff --git a/Eigen/src/SparseCore/MappedSparseMatrix.h b/Eigen/src/SparseCore/MappedSparseMatrix.h index 2852c669a..533479fd0 100644 --- a/Eigen/src/SparseCore/MappedSparseMatrix.h +++ b/Eigen/src/SparseCore/MappedSparseMatrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template 
library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2008-2014 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,9 +10,10 @@ #ifndef EIGEN_MAPPED_SPARSEMATRIX_H #define EIGEN_MAPPED_SPARSEMATRIX_H -namespace Eigen { +namespace Eigen { -/** \class MappedSparseMatrix +/** \deprecated Use Map > + * \class MappedSparseMatrix * * \brief Sparse matrix * @@ -25,179 +26,38 @@ namespace internal { template struct traits > : traits > {}; -} +} // end namespace internal template class MappedSparseMatrix - : public SparseMatrixBase > + : public Map > { - public: - EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix) - enum { IsRowMajor = Base::IsRowMajor }; - - protected: - - Index m_outerSize; - Index m_innerSize; - Index m_nnz; - Index* m_outerIndex; - Index* m_innerIndices; - Scalar* m_values; + typedef Map > Base; public: - - inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; } - inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; } - inline Index innerSize() const { return m_innerSize; } - inline Index outerSize() const { return m_outerSize; } - bool isCompressed() const { return true; } + typedef typename Base::Index Index; + typedef typename Base::Scalar Scalar; - //---------------------------------------- - // direct access interface - inline const Scalar* valuePtr() const { return m_values; } - inline Scalar* valuePtr() { return m_values; } - - inline const Index* innerIndexPtr() const { return m_innerIndices; } - inline Index* innerIndexPtr() { return m_innerIndices; } - - inline const Index* outerIndexPtr() const { return m_outerIndex; } - inline Index* outerIndexPtr() { return m_outerIndex; } - //---------------------------------------- - - inline Scalar coeff(Index row, Index col) const - { - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? col : row; - - Index start = m_outerIndex[outer]; - Index end = m_outerIndex[outer+1]; - if (start==end) - return Scalar(0); - else if (end>0 && inner==m_innerIndices[end-1]) - return m_values[end-1]; - // ^^ optimization: let's first check if it is the last coefficient - // (very common in high level algorithms) - - const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner); - const Index id = r-&m_innerIndices[0]; - return ((*r==inner) && (id=start && "you probably called coeffRef on a non finalized matrix"); - eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient"); - Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner); - const Index id = r-&m_innerIndices[0]; - eigen_assert((*r==inner) && (id -class MappedSparseMatrix::InnerIterator -{ - public: - InnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer]), - m_start(m_id), - m_end(mat.outerIndexPtr()[outer+1]) - {} - - inline InnerIterator& operator++() { m_id++; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? 
index() : m_outer; } - - inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - -template -class MappedSparseMatrix::ReverseInnerIterator -{ - public: - ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer) - : m_matrix(mat), - m_outer(outer), - m_id(mat.outerIndexPtr()[outer+1]), - m_start(mat.outerIndexPtr()[outer]), - m_end(m_id) - {} - - inline ReverseInnerIterator& operator--() { m_id--; return *this; } - - inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; } - inline Scalar& valueRef() { return const_cast(m_matrix.valuePtr()[m_id-1]); } - - inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; } - inline Index row() const { return IsRowMajor ? m_outer : index(); } - inline Index col() const { return IsRowMajor ? index() : m_outer; } - - inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); } - - protected: - const MappedSparseMatrix& m_matrix; - const Index m_outer; - Index m_id; - const Index m_start; - const Index m_end; -}; - namespace internal { template struct evaluator > - : evaluator_base > + : evaluator > > { - typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType; - typedef typename MappedSparseMatrixType::InnerIterator InnerIterator; - typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator; + typedef MappedSparseMatrix<_Scalar,_Options,_Index> XprType; + typedef evaluator > Base; - enum { - CoeffReadCost = NumTraits<_Scalar>::ReadCost, - Flags = MappedSparseMatrixType::Flags - }; - - evaluator() : m_matrix(0) {} - explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {} - - operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); } - operator const MappedSparseMatrixType&() const { return *m_matrix; } - - const MappedSparseMatrixType *m_matrix; + evaluator() : Base() {} + explicit evaluator(const XprType &mat) : Base(mat) {} }; } diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index 91e8f7480..72dedb1ec 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -12,6 +12,32 @@ namespace Eigen { +namespace internal { + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~NestByRefBit) + }; +}; + +template +struct traits, Options, StrideType> > + : public traits > +{ + typedef SparseMatrix PlainObjectType; + typedef traits TraitsBase; + enum { + Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) + }; +}; + +} // end namespace internal + template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors > class SparseMapBase; @@ -25,7 +51,7 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; - + using Base::operator=; protected: typedef typename internal::conditional< @@ -103,6 +129,8 @@ class SparseMapBase typedef typename Base::Scalar Scalar; typedef typename Base::Index Index; enum { IsRowMajor = Base::IsRowMajor }; + + using Base::operator=; public: @@ -147,7 +175,7 @@ class Map, Options, StrideType> : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; @@ -167,7 +195,7 @@ class Map, Options, StrideType : public SparseMapBase, Options, StrideType> > { public: - typedef SparseMapBase > Base; + typedef SparseMapBase Base; _EIGEN_SPARSE_PUBLIC_INTERFACE(Map) enum { IsRowMajor = Base::IsRowMajor }; diff --git a/Eigen/src/SparseCore/SparseRef.h b/Eigen/src/SparseCore/SparseRef.h index cfea6ce8a..2ca039323 100644 --- a/Eigen/src/SparseCore/SparseRef.h +++ b/Eigen/src/SparseCore/SparseRef.h @@ -23,7 +23,7 @@ struct traits, _Options, _Stride typedef SparseMatrix PlainObjectType; enum { Options = _Options, - Flags = traits >::Flags | CompressedAccessBit | NestByRefBit + Flags = traits >::Flags | CompressedAccessBit | NestByRefBit }; template struct match { @@ -41,7 +41,7 @@ struct traits, _Options, _ : public traits, _Options, _StrideType> > { enum { - Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit + Flags = (traits >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit }; }; @@ -49,11 +49,8 @@ template struct traits > : public traits {}; template class SparseRefBase -// : public MappedSparseMatrix : public SparseMapBase { -// typedef typename internal::traits::PlainObjectType PlainObjectType; - public: typedef SparseMapBase Base; @@ -63,8 +60,6 @@ public: : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0) {} - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(SparseRefBase) - protected: @@ -119,9 +114,6 @@ class Ref, Options, StrideType > EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.const_cast_derived()); } - - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) - }; // this is the const ref version diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 097959c84..a19de296a 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -408,6 +408,26 @@ template void sparse_basic(const SparseMatrixType& re m.setFromTriplets(triplets.begin(), triplets.end()); VERIFY_IS_APPROX(m, refMat); } + + // test Map + { + DenseMatrix refMat2(rows, cols), refMat3(rows, cols); + SparseMatrixType m2(rows, cols), m3(rows, cols); + initSparse(density, refMat2, m2); + initSparse(density, refMat3, m3); + { + Map mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + Map mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + { + MappedSparseMatrix mapMat2(m2.rows(), m2.cols(), m2.nonZeros(), m2.outerIndexPtr(), m2.innerIndexPtr(), m2.valuePtr(), m2.innerNonZeroPtr()); + MappedSparseMatrix mapMat3(m3.rows(), m3.cols(), m3.nonZeros(), 
m3.outerIndexPtr(), m3.innerIndexPtr(), m3.valuePtr(), m3.innerNonZeroPtr()); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + VERIFY_IS_APPROX(mapMat2+mapMat3, refMat2+refMat3); + } + } // test triangularView { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index b261cecf3..27700f827 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -69,8 +69,9 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_2(A*A, A*A), 1); Ref > Ar(A); - VERIFY_EVALUATION_COUNT( call_ref_1(Ar, Ar), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(Ar, Ar), 0); + VERIFY_IS_APPROX(Ar+Ar, A+A); + VERIFY_EVALUATION_COUNT( call_ref_1(Ar, A), 0); + VERIFY_EVALUATION_COUNT( call_ref_2(Ar, A), 0); Ref > Br(B); VERIFY_EVALUATION_COUNT( call_ref_1(Br.transpose(), Br.transpose()), 0); From 554aa9b31de06cf1d4464b1fe8e5a956091641ba Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 10:24:07 +0100 Subject: [PATCH 193/214] Add failtests for Ref --- failtest/CMakeLists.txt | 6 ++++++ failtest/sparse_ref_1.cpp | 18 ++++++++++++++++++ failtest/sparse_ref_2.cpp | 15 +++++++++++++++ failtest/sparse_ref_3.cpp | 15 +++++++++++++++ failtest/sparse_ref_4.cpp | 15 +++++++++++++++ failtest/sparse_ref_5.cpp | 16 ++++++++++++++++ 6 files changed, 85 insertions(+) create mode 100644 failtest/sparse_ref_1.cpp create mode 100644 failtest/sparse_ref_2.cpp create mode 100644 failtest/sparse_ref_3.cpp create mode 100644 failtest/sparse_ref_4.cpp create mode 100644 failtest/sparse_ref_5.cpp diff --git a/failtest/CMakeLists.txt b/failtest/CMakeLists.txt index d2fea7bdc..c8795a344 100644 --- a/failtest/CMakeLists.txt +++ b/failtest/CMakeLists.txt @@ -41,6 +41,12 @@ ei_add_failtest("ref_5") ei_add_failtest("swap_1") ei_add_failtest("swap_2") +ei_add_failtest("sparse_ref_1") +ei_add_failtest("sparse_ref_2") +ei_add_failtest("sparse_ref_3") +ei_add_failtest("sparse_ref_4") +ei_add_failtest("sparse_ref_5") + if (EIGEN_FAILTEST_FAILURE_COUNT) message(FATAL_ERROR "${EIGEN_FAILTEST_FAILURE_COUNT} out of ${EIGEN_FAILTEST_COUNT} failtests FAILED. 
" diff --git a/failtest/sparse_ref_1.cpp b/failtest/sparse_ref_1.cpp new file mode 100644 index 000000000..d78d1f9b1 --- /dev/null +++ b/failtest/sparse_ref_1.cpp @@ -0,0 +1,18 @@ +#include "../Eigen/Sparse" + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +#define CV_QUALIFIER const +#else +#define CV_QUALIFIER +#endif + +using namespace Eigen; + +void call_ref(Ref > a) { } + +int main() +{ + SparseMatrix a(10,10); + CV_QUALIFIER SparseMatrix& ac(a); + call_ref(ac); +} diff --git a/failtest/sparse_ref_2.cpp b/failtest/sparse_ref_2.cpp new file mode 100644 index 000000000..46c9440c2 --- /dev/null +++ b/failtest/sparse_ref_2.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref > a) { } + +int main() +{ + SparseMatrix A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.row(3)); +#else + call_ref(A.col(3)); +#endif +} diff --git a/failtest/sparse_ref_3.cpp b/failtest/sparse_ref_3.cpp new file mode 100644 index 000000000..a9949b552 --- /dev/null +++ b/failtest/sparse_ref_3.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD +void call_ref(Ref > a) { } +#else +void call_ref(const Ref > &a) { } +#endif + +int main() +{ + SparseMatrix a(10,10); + call_ref(a+a); +} diff --git a/failtest/sparse_ref_4.cpp b/failtest/sparse_ref_4.cpp new file mode 100644 index 000000000..57bb6a1fc --- /dev/null +++ b/failtest/sparse_ref_4.cpp @@ -0,0 +1,15 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref > a) {} + +int main() +{ + SparseMatrix A(10,10); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(A.transpose()); +#else + call_ref(A); +#endif +} diff --git a/failtest/sparse_ref_5.cpp b/failtest/sparse_ref_5.cpp new file mode 100644 index 000000000..4478f6f2f --- /dev/null +++ b/failtest/sparse_ref_5.cpp @@ -0,0 +1,16 @@ +#include "../Eigen/Sparse" + +using namespace Eigen; + +void call_ref(Ref > a) { } + +int main() +{ + SparseMatrix a(10,10); + SparseMatrixBase > &ac(a); +#ifdef EIGEN_SHOULD_FAIL_TO_BUILD + call_ref(ac); +#else + call_ref(ac.derived()); +#endif +} From d4ec48575e94ec469ef9fbf005a0a6e67571f528 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:14:36 +0100 Subject: [PATCH 194/214] Make Block inherit SparseCompressedBase in the case of an inner-panels and fix valuePtr() innerIndexPtr() --- Eigen/src/Core/Block.h | 2 +- Eigen/src/SparseCore/SparseBlock.h | 17 ++++++++++++----- test/sparse_ref.cpp | 8 +++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 9cf9d5432..5f6307206 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -87,7 +87,7 @@ struct traits > : traits::value ? LvalueBit : 0, FlagsRowMajorBit = IsRowMajor ? 
RowMajorBit : 0, - Flags = (traits::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit + Flags = (traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit // FIXME DirectAccessBit should not be handled by expressions }; }; diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 9e4da2057..6b8ade5aa 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -74,7 +74,7 @@ namespace internal { template class sparse_matrix_block_impl - : public SparseMatrixBase > + : public SparseCompressedBase > { typedef typename internal::remove_all::type _MatrixTypeNested; typedef Block BlockType; @@ -172,19 +172,24 @@ public: } inline const Scalar* valuePtr() const - { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.valuePtr(); } inline Scalar* valuePtr() - { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().valuePtr(); } inline const Index* innerIndexPtr() const - { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.innerIndexPtr(); } inline Index* innerIndexPtr() - { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; } + { return m_matrix.const_cast_derived().innerIndexPtr(); } inline const Index* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; } inline Index* outerIndexPtr() { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; } + + inline const Index* innerNonZeroPtr() const + { return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); } + inline Index* innerNonZeroPtr() + { return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); } Index nonZeros() const { @@ -196,6 +201,8 @@ public: else return Map >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum(); } + + bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } const Scalar& lastCoeff() const { diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index 27700f827..e7380ba21 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -51,8 +51,8 @@ void call_ref() { // SparseVector > ca = VectorXcf::Random(10).sparseView(); // SparseVector a = VectorXf::Random(10).sparseView(); - SparseMatrix A = MatrixXf::Random(10,10).sparseView(); - SparseMatrix B = MatrixXf::Random(10,10).sparseView(); + SparseMatrix A = MatrixXf::Random(10,10).sparseView(0.5,1); + SparseMatrix B = MatrixXf::Random(10,10).sparseView(0.5,1); const SparseMatrix& Ac(A); Block > Ab(A,0,1, 3,3); const Block > Abc(A,0,1,3,3); @@ -82,7 +82,9 @@ void call_ref() // VERIFY_EVALUATION_COUNT( call_ref_1(Arc, Arc), 0); // does not compile on purpose VERIFY_EVALUATION_COUNT( call_ref_2(Arc, Arc), 0); - VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 1); // should be 0 + VERIFY_EVALUATION_COUNT( call_ref_2(A.middleCols(1,3), A.middleCols(1,3)), 0); + + VERIFY_EVALUATION_COUNT( call_ref_2(A.col(2), A.col(2)), 0); VERIFY_EVALUATION_COUNT( call_ref_2(A.block(1,1,3,3), A.block(1,1,3,3)), 1); // should be 0 (allocate starts/nnz only) } From 87629cd6395433966977e868ec091c3fc754956c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 9 Feb 2015 11:41:25 +0100 Subject: [PATCH 195/214] bug #897: makes iterative sparse solvers use a Ref instead of a SparseMatrix pointer. This fixes usage of iterative solvers with a Map. 
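
To illustrate the use case this fixes, here is a rough sketch of running
BiCGSTAB on a matrix owned by external code and viewed through a Map (the
function and parameter names are illustrative only):

    #include <Eigen/Sparse>
    using namespace Eigen;

    // n x n matrix given as raw compressed-column arrays owned elsewhere.
    VectorXd solveMapped(int n, int nnz, int* outer, int* inner,
                         double* values, const VectorXd& b)
    {
      Map<SparseMatrix<double> > A(n, n, nnz, outer, inner, values);
      BiCGSTAB<SparseMatrix<double> > solver(A); // A is grabbed through a Ref, no copy
      return solver.solve(b);
    }

Before this change compute() took a const MatrixType& and stored a pointer to
it, so a mapped matrix could not be passed without first copying it into a
plain SparseMatrix.
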
--- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 2 +- .../ConjugateGradient.h | 2 +- .../IterativeSolverBase.h | 53 +++++++++++++------ test/sparse_solver.h | 5 +- 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 224fe913f..a50680133 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -193,7 +193,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) + if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index b5ef6d60f..3e024bda1 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -206,7 +206,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix->template selfadjointView(), b.col(j), xj, + internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index f33c868bb..6f48075b4 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -37,7 +37,7 @@ public: /** Default constructor. */ IterativeSolverBase() - : mp_matrix(0) + : m_dummy(0,0), mp_matrix(m_dummy) { init(); } @@ -52,10 +52,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - explicit IterativeSolverBase(const MatrixType& A) + template + explicit IterativeSolverBase(const SparseMatrixBase& A) { init(); - compute(A); + compute(A.derived()); } ~IterativeSolverBase() {} @@ -65,9 +66,11 @@ public: * Currently, this function mostly calls analyzePattern on the preconditioner. In the future * we might, for instance, implement column reordering for faster matrix vector products. */ - Derived& analyzePattern(const MatrixType& A) + template + Derived& analyzePattern(const SparseMatrixBase& A) { - m_preconditioner.analyzePattern(A); + grab(A); + m_preconditioner.analyzePattern(mp_matrix); m_isInitialized = true; m_analysisIsOk = true; m_info = Success; @@ -83,11 +86,12 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. */ - Derived& factorize(const MatrixType& A) + template + Derived& factorize(const SparseMatrixBase& A) { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); - mp_matrix = &A; - m_preconditioner.factorize(A); + grab(A); + m_preconditioner.factorize(mp_matrix); m_factorizationIsOk = true; m_info = Success; return derived(); @@ -103,10 +107,11 @@ public: * this class becomes invalid. Call compute() to update it with the new * matrix A, or modify a copy of A. 
 */
-    Derived& compute(const MatrixType& A)
+    template<typename MatrixDerived>
+    Derived& compute(const SparseMatrixBase<MatrixDerived>& A)
     {
-      mp_matrix = &A;
-      m_preconditioner.compute(A);
+      grab(A);
+      m_preconditioner.compute(mp_matrix);
       m_isInitialized = true;
       m_analysisIsOk = true;
       m_factorizationIsOk = true;
@@ -115,9 +120,9 @@
     }
 
     /** \internal */
-    Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
+    Index rows() const { return mp_matrix.rows(); }
     /** \internal */
-    Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+    Index cols() const { return mp_matrix.cols(); }
 
     /** \returns the tolerance threshold used by the stopping criteria */
     RealScalar tolerance() const { return m_tolerance; }
@@ -135,13 +140,18 @@
     /** \returns a read-only reference to the preconditioner. */
     const Preconditioner& preconditioner() const { return m_preconditioner; }
 
-    /** \returns the max number of iterations */
+    /** \returns the max number of iterations.
+      * It is either the value set by setMaxIterations or, by default,
+      * twice the number of columns of the matrix.
+      */
     int maxIterations() const
     {
-      return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
+      return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations;
     }
 
-    /** Sets the max number of iterations */
+    /** Sets the max number of iterations.
+      * Default is twice the number of columns of the matrix.
+      */
     Derived& setMaxIterations(int maxIters)
     {
       m_maxIterations = maxIters;
@@ -210,7 +220,16 @@
       m_maxIterations = -1;
       m_tolerance = NumTraits<Scalar>::epsilon();
     }
-    const MatrixType* mp_matrix;
+
+    template<typename InputType>
+    void grab(const SparseMatrixBase<InputType> &A)
+    {
+      mp_matrix.~Ref<MatrixType>();
+      ::new (&mp_matrix) Ref<MatrixType>(A);
+    }
+
+    MatrixType m_dummy;
+    Ref<MatrixType> mp_matrix;
     Preconditioner m_preconditioner;
 
     int m_maxIterations;
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index ee350d561..887ff02a9 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -32,7 +32,7 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
     x = solver.solve(b);
     if (solver.info() != Success)
     {
-      std::cerr << "sparse solver testing: solving failed\n";
+      std::cerr << "sparse solver testing: solving failed (" << typeid(Solver).name() << ")\n";
       return;
     }
     VERIFY(oldb.isApprox(b) && "sparse solver testing: the rhs should not be modified!");
@@ -75,7 +75,8 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
       xm = solver.solve(bm);
       if (solver.info() != Success)
       {
-        std::cerr << "sparse solver testing: solving failed\n";
+        std::cerr << "sparse solver testing: solving with a Map failed\n";
+        exit(0);
         return;
       }
       VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");

From d10d6a40dda3fb5ac9f401b8e6d9cede3f3ca34a Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Tue, 10 Feb 2015 13:02:59 +0100
Subject: [PATCH 196/214] bug #897: Update unsupported iterative solvers based on IterativeSolverBase.
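
The change is mechanical: mp_matrix is now a Ref<MatrixType> rather than a
MatrixType*, so the dereferences go away. As a consequence the unsupported
solvers accept mapped matrices as well; a hedged sketch of the intended usage
(identifiers below are illustrative, not part of the patch):

    #include <Eigen/Sparse>
    #include <unsupported/Eigen/IterativeSolvers>
    using namespace Eigen;

    void solveWithGmres(const Map<SparseMatrix<double> >& A,
                        const VectorXd& b, VectorXd& x)
    {
      GMRES<SparseMatrix<double> > gmres(A); // no copy of the mapped data
      x = gmres.solve(b);
    }
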
--- unsupported/Eigen/src/IterativeSolvers/DGMRES.h | 2 +- unsupported/Eigen/src/IterativeSolvers/GMRES.h | 2 +- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h index 0e1b7d977..52eb65a2f 100644 --- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h @@ -150,7 +150,7 @@ class DGMRES : public IterativeSolverBase > m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner); + dgmres(mp_matrix, b.col(j), xj, Base::m_preconditioner); } m_info = failed ? NumericalIssue : m_error <= Base::m_tolerance ? Success diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index cd15ce0bf..39610c074 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -327,7 +327,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) + if(!internal::gmres(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error)) failed = true; } m_info = failed ? NumericalIssue diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index aaf42c78a..a34902001 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -259,7 +259,7 @@ namespace Eigen { m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::minres(mp_matrix->template selfadjointView(), b.col(j), xj, + internal::minres(mp_matrix.template selfadjointView(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } From c6e8caf0900ae303e9e7399bed00af705015ff17 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 18:57:41 +0100 Subject: [PATCH 197/214] Allows Lower|Upper as a template argument of CG and MINRES: in this case the full matrix will be considered. --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 11 +++++++---- test/conjugate_gradient.cpp | 6 ++++-- test/sparse_solver.h | 5 ++++- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 7 ++++++- unsupported/test/minres.cpp | 2 ++ 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 3e024bda1..4857dd9e9 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -113,8 +113,8 @@ struct traits > * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. 
Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() @@ -197,6 +197,10 @@ public: template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -206,8 +210,7 @@ public: m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::conjugate_gradient(mp_matrix.template selfadjointView(), b.col(j), xj, - Base::m_preconditioner, m_iterations, m_error); + internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } m_isInitialized = true; diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp index 869051b31..019cc4d64 100644 --- a/test/conjugate_gradient.cpp +++ b/test/conjugate_gradient.cpp @@ -12,13 +12,15 @@ template void test_conjugate_gradient_T() { - ConjugateGradient, Lower> cg_colmajor_lower_diag; - ConjugateGradient, Upper> cg_colmajor_upper_diag; + ConjugateGradient, Lower > cg_colmajor_lower_diag; + ConjugateGradient, Upper > cg_colmajor_upper_diag; + ConjugateGradient, Lower|Upper> cg_colmajor_loup_diag; ConjugateGradient, Lower, IdentityPreconditioner> cg_colmajor_lower_I; ConjugateGradient, Upper, IdentityPreconditioner> cg_colmajor_upper_I; CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_loup_diag) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_lower_I) ); CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) ); } diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 887ff02a9..6cf99e0b0 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -195,7 +195,10 @@ int generate_sparse_spd_problem(Solver& , typename Solver::MatrixType& A, typena dA = dM * dM.adjoint(); halfA.resize(size,size); - halfA.template selfadjointView().rankUpdate(M); + if(Solver::UpLo==(Lower|Upper)) + halfA = A; + else + halfA.template selfadjointView().rankUpdate(M); return size; } diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index a34902001..65cffc255 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -250,6 +250,11 @@ namespace Eigen { template void _solve_with_guess_impl(const Rhs& b, Dest& x) const { + typedef typename internal::conditional&, + SparseSelfAdjointView, UpLo> + >::type MatrixWrapperType; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; @@ -259,7 +264,7 @@ namespace Eigen { m_error = Base::m_tolerance; typename Dest::ColXpr xj(x,j); - internal::minres(mp_matrix.template selfadjointView(), b.col(j), xj, + internal::minres(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error); } diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index 81b762c37..f6e526bbd 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,6 +21,7 @@ template void test_minres_T() // Diagonal preconditioner MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; + MINRES, Upper, DiagonalPreconditioner > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( 
check_sparse_spd_solving(minres_colmajor_lower_I) ); @@ -28,6 +29,7 @@ template void test_minres_T() CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag) ); CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag) ); + CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag) ); // TO DO: symmetric semi-definite matrix // TO DO: symmetric indefinite matrix From deecff97edfb6f75e7613e1db97a1e3e5504e971 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 19:22:05 +0100 Subject: [PATCH 198/214] typo --- blas/common.h | 3 +-- blas/level3_impl.h | 2 +- blas/xerbla.cpp | 4 ++-- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 4 ++-- unsupported/test/minres.cpp | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/blas/common.h b/blas/common.h index c39cc63a8..5ecb153e2 100644 --- a/blas/common.h +++ b/blas/common.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2009-2010 Gael Guennebaud +// Copyright (C) 2009-2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -13,7 +13,6 @@ #include #include -#include #include #ifndef SCALAR diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..32313e814 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -8,7 +8,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "common.h" - +#include int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) { // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index 0422f79b7..8775b88cd 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -1,5 +1,5 @@ -#include +#include #if (defined __GNUC__) && (!defined __MINGW32__) #define EIGEN_WEAK_LINKING __attribute__ ((weak)) @@ -14,7 +14,7 @@ extern "C" EIGEN_WEAK_LINKING int xerbla_(const char * msg, int *info, int) { - std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n"; + printf("Eigen BLAS ERROR #%i: %s\n", *info, msg ); return 0; } diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 65cffc255..93a83e5b7 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -165,8 +165,8 @@ namespace Eigen { * The vectors x and b can be either dense or sparse. * * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. - * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower - * or Upper. Default is Lower. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. * \tparam _Preconditioner the type of the preconditioner. 
Default is DiagonalPreconditioner * * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp index f6e526bbd..8b300b78a 100644 --- a/unsupported/test/minres.cpp +++ b/unsupported/test/minres.cpp @@ -21,7 +21,7 @@ template void test_minres_T() // Diagonal preconditioner MINRES, Lower, DiagonalPreconditioner > minres_colmajor_lower_diag; MINRES, Upper, DiagonalPreconditioner > minres_colmajor_upper_diag; - MINRES, Upper, DiagonalPreconditioner > minres_colmajor_uplo_diag; + MINRES, Lower|Upper, DiagonalPreconditioner > minres_colmajor_uplo_diag; // call tests for SPD matrix CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) ); From c3f3580b8f7e4af89f4c7cdbe036ac1cac750128 Mon Sep 17 00:00:00 2001 From: Jan Blechta Date: Tue, 10 Feb 2015 14:24:39 +0100 Subject: [PATCH 199/214] Fix bug #733: step by step solving is not a good example for solveWithGuess --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 6 +----- .../IterativeLinearSolvers/ConjugateGradient.h | 15 +-------------- unsupported/Eigen/src/IterativeSolvers/GMRES.h | 17 ++--------------- unsupported/Eigen/src/IterativeSolvers/MINRES.h | 15 +-------------- 4 files changed, 5 insertions(+), 48 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index a50680133..31a43cb56 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -139,11 +139,7 @@ struct traits > * \include BiCGSTAB_simple.cpp * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * \include BiCGSTAB_step_by_step.cpp - * Note that such a step by step execution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 4857dd9e9..1e819fc9f 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -137,20 +137,7 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * cg.setMaxIterations(1); - * int i = 0; - * do { - * x = cg.solveWithGuess(b,x); - * std::cout << i << " : " << cg.error() << std::endl; - * ++i; - * } while (cg.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. 
* * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h index 39610c074..30f82b52e 100644 --- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h @@ -250,21 +250,8 @@ struct traits > * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * solver.setMaxIterations(1); - * int i = 0; - * do { - * x = solver.solveWithGuess(b,x); - * std::cout << i << " : " << solver.error() << std::endl; - * ++i; - * } while (solver.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. - * + * One can control the start using the solveWithGuess() method. + * * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ template< typename _MatrixType, typename _Preconditioner> diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h index 93a83e5b7..c4d969a72 100644 --- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h +++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h @@ -189,20 +189,7 @@ namespace Eigen { * \endcode * * By default the iterations start with x=0 as an initial guess of the solution. - * One can control the start using the solveWithGuess() method. Here is a step by - * step execution example starting with a random guess and printing the evolution - * of the estimated error: - * * \code - * x = VectorXd::Random(n); - * mr.setMaxIterations(1); - * int i = 0; - * do { - * x = mr.solveWithGuess(b,x); - * std::cout << i << " : " << mr.error() << std::endl; - * ++i; - * } while (mr.info()!=Success && i<100); - * \endcode - * Note that such a step by step excution is slightly slower. + * One can control the start using the solveWithGuess() method. * * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner */ From 91fe3a30043874e51225c8f25964687320c9b601 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 10:29:28 -0800 Subject: [PATCH 200/214] Removed a debug printf statement. 
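Editor's note: two user-visible changes land just above. PATCH 197 allows Lower|Upper as the _UpLo argument of ConjugateGradient and MINRES; internally an internal::conditional typedef picks, at compile time, between a plain const reference to the matrix (full storage) and a SparseSelfAdjointView of one triangle. PATCH 199 then removes the step-by-step solveWithGuess() snippets that bug #733 flagged as a poor example. The recommended pattern after both patches is a single warm-started call; the following is a minimal sketch, assuming a tree with PATCH 197 applied and a symmetric positive definite A with both triangles stored (the function name is invented for the example):

    #include <Eigen/Sparse>
    #include <Eigen/IterativeLinearSolvers>
    using namespace Eigen;

    VectorXd warm_started_solve(const SparseMatrix<double>& A,
                                const VectorXd& b, const VectorXd& x0) {
      // Lower|Upper (new in PATCH 197): both triangles of A are stored,
      // so the solver uses the matrix as-is instead of wrapping it in a
      // selfadjoint view of a single triangle.
      ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
      cg.compute(A);
      // Warm start in a single call; no per-iteration driver loop.
      return cg.solveWithGuess(b, x0);
    }

The choice of ConjugateGradient over MINRES here is illustrative only; the same _UpLo and solveWithGuess() semantics apply to both solvers.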
--- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index a96d705a4..7e448f7c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -89,9 +89,6 @@ class TensorLayoutSwapOp : public TensorBase, WriteA EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) { - -std::cout << "In assignment operator " << std::endl; - typedef TensorAssignOp Assign; Assign assign(*this, other); internal::TensorExecutor::run(assign, DefaultDevice()); From 4716c2c6666eb7018dac2e2ed050ead45c8933e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:06:19 -0800 Subject: [PATCH 201/214] Fixed compilation error --- unsupported/test/cxx11_tensor_thread_pool.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index f49523683..6fe65c7f9 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -15,7 +15,7 @@ #include using Eigen::Tensor; -using std::isnan; + static void test_multithread_elementwise() { @@ -122,7 +122,7 @@ static void test_contraction_corner_cases() m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -137,7 +137,7 @@ static void test_contraction_corner_cases() new(&m_left) MapXf(t_left.data(), 32, 1); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -155,7 +155,7 @@ static void test_contraction_corner_cases() new(&m_right) MapXf(t_right.data(), 32, 4); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); @@ -173,7 +173,7 @@ static void test_contraction_corner_cases() new(&m_right) MapXf(t_right.data(), 32, 4); m_result = m_left.transpose() * m_right; for (ptrdiff_t i = 0; i < t_result.size(); i++) { - assert(!isnan(t_result.data()[i])); + assert(!std::isnan(t_result.data()[i])); if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) { std::cout << "mismatch detected: " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; assert(false); From 410895a7e4276fa2e1f78dbb953c7045818a86ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:13:19 -0800 Subject: [PATCH 202/214] Silenced several compilation warnings --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 16 ++++++++-------- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 18 
+++++++++--------- .../src/Tensor/TensorContractionThreadPool.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 93938bd1b..a4f73b2a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // The dimensions of the lhs and the rhs tensors should be equal to prevent // overflows and ensure the result is fully initialized. eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_leftImpl.dimensions())); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 503803d23..698bcfe18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -257,13 +257,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -322,8 +322,8 @@ struct TensorEvaluator, Device> static const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || - (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(this->m_stride == 1); EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; @@ -333,8 +333,8 @@ struct TensorEvaluator, Device> this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || - (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index af843654c..e750c21e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -499,9 +499,9 @@ struct TensorContractionEvaluatorBase // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. typedef typename internal::conditional< - Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; @@ -520,14 +520,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), m_result(NULL) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == - TensorEvaluator::Layout), + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); @@ -681,7 +681,7 @@ struct TensorContractionEvaluatorBase } // If the layout is RowMajor, we need to reverse the m_dimensions - if (Layout == RowMajor) { + if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { std::swap(m_dimensions[i], m_dimensions[j]); } @@ -855,9 +855,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index e358e6a3a..8b87f1045 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ 
b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -79,9 +79,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 97f225f0a..5e167d4aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -94,14 +94,14 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; } } - Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: Scalar* m_data; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 7e448f7c0..c00810594 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -112,7 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; @@ -169,7 +169,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false, // to be implemented }; From 114e863f086077fc949baf5dfe1f4102222c938e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:20:24 -0800 Subject: [PATCH 203/214] Silcenced a few compilation warnings --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 ++++---- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 ++++++------ .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index ef134adf2..5790e19d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device> m_dimensions[i] = input_dims[i] * broadcast[i]; } - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -139,7 +139,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { return coeffRowMajor(index); @@ -210,7 +210,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { return packetRowMajor(index); @@ -326,7 +326,7 @@ struct TensorEvaluator, Device> } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e750c21e7..f7254a24d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -536,7 +536,7 @@ struct TensorContractionEvaluatorBase DSizes eval_left_dims; DSizes eval_right_dims; array, ContractDims> eval_op_indices; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { // For ColMajor, we keep using the existing dimensions for (int i = 0; i < LDims; i++) { eval_left_dims[i] = m_leftImpl.dimensions()[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 5e167d4aa..488d32cb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -167,7 +167,7 @@ struct TensorEvaluator #endif } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: const Scalar* m_data; @@ -218,7 +218,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const NullaryOp m_functor; @@ -273,7 +273,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const UnaryOp m_functor; @@ -301,7 +301,7 @@ struct TensorEvaluator::Layout == 
TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } @@ -337,7 +337,7 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -413,7 +413,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 21416afe0..7643d4cdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } // Precompute output strides. - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> // Precompute input strides. array input_strides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { input_strides[0] = 1; for (int i = 1; i < NumInputDims; ++i) { input_strides[i] = input_strides[i-1] * input_dims[i-1]; @@ -309,7 +309,7 @@ struct TensorEvaluator, Device> Op reducer(m_reducer); if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -330,7 +330,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < packetSize; ++i) { Op reducer(m_reducer); @@ -339,7 +339,7 @@ struct TensorEvaluator, Device> } } else if (PreservingInnerMostDims) { const Index firstIndex = firstInput(index); - const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); @@ -371,7 +371,7 @@ struct TensorEvaluator, Device> // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { if (ReducingInnerMostDims) { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return index * m_preservedStrides[0]; } else { return index * m_preservedStrides[NumOutputDims - 1]; @@ -379,7 +379,7 @@ struct TensorEvaluator, Device> } // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumOutputDims - 1; i > 0; --i) { // This is index_i in the output tensor. const Index idx = index / m_outputStrides[i]; From 057cfd2f02f06650db0634aca6abfbd09da36897 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:25:02 -0800 Subject: [PATCH 204/214] Silenced more compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 488d32cb4..d084880de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -85,7 +85,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; @@ -158,7 +158,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { eigen_assert(m_data); - const Index index = (Layout == ColMajor) ? m_dims.IndexOfColMajor(coords) + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); #ifdef __CUDA_ARCH__ return __ldg(m_data+index); @@ -366,8 +366,8 @@ struct TensorEvaluator m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 87a4b0758..1191b2411 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -308,7 +308,7 @@ struct TensorEvaluator, Devi const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; @@ -348,7 +348,7 @@ struct TensorEvaluator, Devi m_impl.evalSubExprsIfNeeded(NULL); if (internal::is_arithmetic::value && data && m_impl.data()) { Index contiguous_values = 1; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { contiguous_values *= dimensions()[i]; if (dimensions()[i] != m_impl.dimensions()[i]) { @@ -394,7 +394,7 @@ struct TensorEvaluator, Devi Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -446,7 +446,7 @@ struct TensorEvaluator, Devi Scalar* result = m_impl.data(); if (result) { Index offset = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (m_dimensions[i] != m_impl.dimensions()[i]) { offset += m_offsets[i] * m_inputStrides[i]; @@ -482,7 +482,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -547,7 +547,7 @@ struct TensorEvaluator, Device> const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 439cf3230..82969b4c0 
100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device { // Compute strides m_dimensions = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; @@ -138,7 +138,7 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { Index idx = index / m_strides[i]; index -= idx * m_strides[i]; From c21e45fbc5b82a2f99113e8b6ab0005ca01a7428 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:36:26 -0800 Subject: [PATCH 205/214] Fixed a few more compilation warnings --- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index fb4e7fb11..57a14a037 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -111,7 +111,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -131,7 +131,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { m_leftStrides[0] = 1; m_rightStrides[0] = 1; m_outputStrides[0] = 1; @@ -176,7 +176,7 @@ struct TensorEvaluator subs; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { subs[i] = index / m_outputStrides[i]; index -= subs[i] * m_outputStrides[i]; @@ -193,7 +193,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; @@ -209,7 +209,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 620a63ae7..1012ecd69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -126,7 +126,7 @@ struct TensorEvaluator, Device> array inputStrides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -180,12 +180,12 @@ struct TensorEvaluator, Device> return rslt; } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == 
static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 5aa2c8d3b..00cb8e373 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -123,7 +123,7 @@ struct TensorEvaluator, Device> } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; m_inputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -172,7 +172,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_outputStrides[i]; const Index idx1 = indices[1] / m_outputStrides[i]; @@ -211,13 +211,13 @@ struct TensorEvaluator, Device> } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; inputIndex += idx * m_inputStrides[i]; @@ -281,7 +281,7 @@ struct TensorEvaluator, Device> Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_outputStrides[i]; const Index idx1 = indices[1] / this->m_outputStrides[i]; From 780b2422e2b3fd2b50121a6e5642c94b030fbf5b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:43:55 -0800 Subject: [PATCH 206/214] Silenced the last batch of compilation warnings triggered by gcc 4.8 --- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 10 +++++----- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 698bcfe18..dc9586cbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -167,7 +167,7 @@ struct TensorEvaluator, Device> m_stride = 1; m_inputStride = 1; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < m_dim.actualDim(); ++i) { m_stride *= input_dims[i]; m_inputStride *= input_dims[i]; @@ -208,8 +208,8 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is 
equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -220,8 +220,8 @@ struct TensorEvaluator, Device> } PacketReturnType rslt = internal::pload(values); return rslt; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims - 1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index aecef3313..591fd2464 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -236,9 +236,9 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); @@ -339,7 +339,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 585ebc778..bf0e7edfb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -121,7 +121,7 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device) { // Only column major tensors are supported for now. 
- EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -295,7 +295,7 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 9b14e01f4..2a7dd45c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -104,7 +104,7 @@ struct TensorEvaluator, Device m_dimensions[i] += m_padding[i].first + m_padding[i].second; } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -141,7 +141,7 @@ struct TensorEvaluator, Device { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { @@ -175,7 +175,7 @@ struct TensorEvaluator, Device template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } return packetRowMajor(index); @@ -184,7 +184,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { Index inputIndex; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { const Index idx = coords[0]; if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { return Scalar(0); @@ -214,7 +214,7 @@ struct TensorEvaluator, Device return m_impl.coeff(inputIndex); } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 1c03d202f..8a42ab6b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -100,7 +100,7 @@ struct TensorEvaluator, Device> : m_impl(op.expression(), device) { // Only column major tensors are supported for now. - EIGEN_STATIC_ASSERT((Layout == ColMajor), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); Index num_patches = 1; const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -232,7 +232,7 @@ struct TensorEvaluator, Device> } } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; From fefec723aa44703c1b7884b2ccfa73877a58f500 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 13:16:22 -0800 Subject: [PATCH 207/214] Fixed compilation error triggered when trying to vectorize a non vectorizable cuda kernel. 
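Editor's note: PATCHes 202 through 206 apply one mechanical fix in many places. Comparing enumerators of two distinct enumeration types (an evaluator's anonymous Layout enum against Eigen's ColMajor/RowMajor) triggers gcc 4.8's -Wenum-compare, and casting both sides to int makes the comparison unambiguous; PATCH 201's switch from a using-declaration to the qualified std::isnan resolves a similar ambiguity between the C macro and the C++11 overload. A reduced, self-contained illustration of the enum fix, with invented type names:

    enum StorageOrder { ColMajor = 0, RowMajor = 1 };

    struct SomeEvaluator {
      enum { Layout = RowMajor };  // anonymous enum: a distinct type
    };

    template <typename Evaluator>
    bool is_col_major() {
      // return Evaluator::Layout == ColMajor;  // gcc 4.8: -Wenum-compare
      return static_cast<int>(Evaluator::Layout) ==
             static_cast<int>(ColMajor);        // warning-free
    }

    int main() { return is_col_major<SomeEvaluator>() ? 0 : 1; }

PATCH 207, whose diff follows, tackles a different nvcc problem: the single CUDA meta-kernel is split into vectorizable and non-vectorizable variants, dispatched through a new IsVectorizable trait, so that a non-vectorizable expression never instantiates packet code at all.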
--- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93fdd907..05ac9bd2f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -22,8 +22,13 @@ namespace Eigen { */ namespace internal { +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + // Default strategy: the expression is evaluated with a single cpu thread. -template::PacketAccess> +template::value> class TensorExecutor { public: @@ -153,34 +158,45 @@ class TensorExecutor template __global__ void __launch_bounds__(1024) - EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } - } - else { - // Use the vector path - const Index PacketSize = unpacket_traits::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); } } -template -class TensorExecutor +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_Vectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the vector path + const Index PacketSize = unpacket_traits::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +template +class TensorExecutor { public: typedef typename Expression::Index Index; @@ -192,13 +208,33 @@ class TensorExecutor { const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); const int block_size = maxCudaThreadsPerBlock(); - const Index size = array_prod(evaluator.dimensions()); - LAUNCH_CUDA_KERNEL((EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } }; + +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / 
maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + #endif } // end namespace internal From d771295554b4d0238f136588ae526a4c23be9c6c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 22:59:27 +0100 Subject: [PATCH 208/214] remove useless include --- blas/level3_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 32313e814..a05872666 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -8,7 +8,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "common.h" -#include + int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) { // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; From cc5d7ff5238da45ef7416ec94f18227486ed9643 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:02:38 -0800 Subject: [PATCH 209/214] Added vectorized implementation of the exponential function for ARM/NEON --- Eigen/Core | 1 + Eigen/src/Core/arch/NEON/MathFunctions.h | 91 ++++++++++++++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 Eigen/src/Core/arch/NEON/MathFunctions.h diff --git a/Eigen/Core b/Eigen/Core index b5af63623..416e901ae 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -300,6 +300,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #endif diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 000000000..6bb05bb92 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9afd86bec..559682cf7 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -88,7 +88,7 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; From fe25f3b8e3ee86d808b5f8f75f789cee8dffd581 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 10 Feb 2015 23:11:35 +0100 Subject: [PATCH 210/214] FMA has been wrongly disabled --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 485bac10b..fc2e78b2d 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -133,7 +133,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const 
Packet8i& /*a*/, co return pset1(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 3f6fb0254..63a859699 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -227,7 +227,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif From f669f5656ab550010c5dd92ce2da7d3fab07babd Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 14:29:47 -0800 Subject: [PATCH 211/214] Marked a few functions as EIGEN_DEVICE_FUNC to enable the use of tensors in cuda kernels. --- Eigen/src/Core/util/Memory.h | 48 +++++++++---------- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 4 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index bacf236fb..16f8cc1b0 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = *** Implementation of generic aligned realloc (when no realloc can be used)*** *****************************************************************************/ -void* aligned_malloc(std::size_t size); -void aligned_free(void *ptr); +EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size); +EIGEN_DEVICE_FUNC void aligned_free(void *ptr); /** \internal * \brief Reallocates aligned memory. 
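Editor's note: the NEON pexp() added in PATCH 209 above is the classic Cephes range reduction. It rewrites exp(x) as 2^n * exp(g) with n = round(x * log2(e)) and |g| <= ln(2)/2, evaluates exp(g) with a degree-6 polynomial, and rebuilds 2^n by placing the biased exponent (n + 127) into an IEEE-754 float. A scalar model of the same algorithm, using the constants from the patch; this is an illustration, not the SIMD kernel itself:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    float cephes_expf(float x) {
      x = std::min(x, 88.3762626647950f);    // exp_hi clamp
      x = std::max(x, -88.3762626647949f);   // exp_lo clamp
      // n = round(x * log2(e)), computed as floor(x * log2(e) + 0.5)
      float n = std::floor(x * 1.44269504088896341f + 0.5f);
      // g = x - n*ln(2); ln(2) is split into C1 + C2 for extra precision
      float g = x - n * 0.693359375f - n * (-2.12194440e-4f);
      // degree-6 polynomial approximation of exp(g) around zero
      float y = 1.9875691500E-4f;
      y = y * g + 1.3981999507E-3f;
      y = y * g + 8.3334519073E-3f;
      y = y * g + 4.1665795894E-2f;
      y = y * g + 1.6666665459E-1f;
      y = y * g + 5.0000001201E-1f;
      y = y * g * g + g + 1.0f;
      // build 2^n by shifting the biased exponent into a float's bits
      int32_t bits = (static_cast<int32_t>(n) + 0x7f) << 23;
      float pow2n;
      std::memcpy(&pow2n, &bits, sizeof pow2n);
      return y * pow2n;
    }

The mask arithmetic around vcvtq_s32_f32 in the vector version exists only because NEON's float-to-int conversion truncates, so a floor has to be emulated. PATCH 210 just above is related plumbing: the fused multiply-add specializations were guarded by EIGEN_VECTORIZE_FMA, which apparently was not being defined at this point in the tree, so testing the compiler's own __FMA__ macro re-enables them.

PATCH 211, whose diff begins below, relies on EIGEN_DEVICE_FUNC expanding to CUDA's __host__ __device__ qualifiers under nvcc and to nothing in a plain host build, so annotating the allocation helpers makes them callable from device code without affecting host-only compilation. A reduced model of the mechanism; offset_ptr is a hypothetical helper, not Eigen code:

    #ifdef __CUDACC__
    #define EIGEN_DEVICE_FUNC __host__ __device__
    #else
    #define EIGEN_DEVICE_FUNC
    #endif

    // Callable from both a CUDA kernel and ordinary host code.
    template <typename T>
    EIGEN_DEVICE_FUNC inline T* offset_ptr(T* p, int n) { return p + n; }

Without such an annotation, nvcc rejects any device-side call into the helper as a call from device code into a host-only function, which is exactly the class of error the patch removes.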
From f669f5656ab550010c5dd92ce2da7d3fab07babd Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 Feb 2015 14:29:47 -0800
Subject: [PATCH 211/214] Marked a few functions as EIGEN_DEVICE_FUNC to enable
 the use of tensors in cuda kernels.

---
 Eigen/src/Core/util/Memory.h                  | 48 +++++++++----------
 unsupported/Eigen/CXX11/src/Tensor/Tensor.h   |  4 +-
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorStorage.h    |  4 +-
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index bacf236fb..16f8cc1b0 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t 
 *** Implementation of generic aligned realloc (when no realloc can be used)***
 *****************************************************************************/
 
-void* aligned_malloc(std::size_t size);
-void aligned_free(void *ptr);
+EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
+EIGEN_DEVICE_FUNC void aligned_free(void *ptr);
 
 /** \internal
   * \brief Reallocates aligned memory.
@@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
 *****************************************************************************/
 
 #ifdef EIGEN_NO_MALLOC
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
 }
 #elif defined EIGEN_RUNTIME_NO_MALLOC
-inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
 {
   static bool value = true;
   if (update == 1)
     value = new_value;
   return value;
 }
-inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
 #else
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
 
 /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
-inline void* aligned_malloc(size_t size)
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
 {
   check_that_malloc_is_allowed();
 
@@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size)
 }
 
 /** \internal Frees memory allocated with aligned_malloc. */
-inline void aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
   #if !EIGEN_ALIGN
     std::free(ptr);
@@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
   * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
   */
-template<bool Align> inline void* conditional_aligned_malloc(size_t size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
 {
   return aligned_malloc(size);
 }
 
-template<> inline void* conditional_aligned_malloc<false>(size_t size)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
 {
   check_that_malloc_is_allowed();
 
@@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
 }
 
 /** \internal Frees memory allocated with conditional_aligned_malloc */
-template<bool Align> inline void conditional_aligned_free(void *ptr)
+template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
 {
   aligned_free(ptr);
 }
 
-template<> inline void conditional_aligned_free<false>(void *ptr)
+template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
   std::free(ptr);
 }
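Note: EIGEN_DEVICE_FUNC is what makes these inline definitions visible to both sides of a CUDA compilation. A simplified sketch of its definition — the real macro lives in Eigen/src/Core/util/Macros.h; this is an approximation, not the verbatim text:

  #if defined(__CUDACC__)
    #define EIGEN_DEVICE_FUNC __host__ __device__
  #else
    #define EIGEN_DEVICE_FUNC
  #endif

Under nvcc, every annotated function is compiled for host and device; under a plain host compiler the macro expands to nothing, so these declarations are unchanged.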
@@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
 
 /** \internal Destructs the elements of an array.
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
 {
   // always destruct an array starting from the end.
   if(ptr)
@@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
 /** \internal Constructs the elements of an array.
   * The \a size parameter tells on how many objects to call the constructor of T.
   */
-template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
 {
   size_t i;
   EIGEN_TRY
@@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
 *****************************************************************************/
 
 template<typename T>
-EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
 {
   if(size > size_t(-1) / sizeof(T))
     throw_std_bad_alloc();
@@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
   * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
   * The default constructor of T is called.
   */
-template<typename T> inline T* aligned_new(size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size)
   }
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
 /** \internal Deletes objects constructed with aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void aligned_delete(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
 {
   destruct_elements_of_array(ptr, size);
   aligned_free(ptr);
@@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
 /** \internal Deletes objects constructed with conditional_aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
 {
   destruct_elements_of_array(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
 {
   if(size==0)
     return 0; // short-cut. Also fixes Bug 884
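Note: the typed helpers pair allocation with construction and deallocation with destruction, which is why both halves of each pair need the annotation. A host-side usage sketch, illustrative only, with error handling elided:

  #include <Eigen/Core>

  void demo() {
    // Allocate aligned storage for 64 floats and default-construct them.
    float* buf = Eigen::internal::aligned_new<float>(64);
    buf[0] = 1.0f;  // use the buffer
    // Destruct all 64 elements, then free; the element counts must match.
    Eigen::internal::aligned_delete(buf, 64);
  }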
@@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
   return result;
 }
 
-template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
 {
   if(NumTraits<T>::RequireInitialization)
     destruct_elements_of_array(ptr, size);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 0e8a4b8d6..037219f23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -350,7 +350,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
     }
 #endif
 
-    void resize(const array<Index, NumIndices>& dimensions)
+    EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
     {
       std::size_t i;
       Index size = Index(1);
@@ -367,7 +367,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
 #endif
     }
 
-    void resize(const DSizes<Index, NumIndices>& dimensions) {
+    EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
       array<Index, NumIndices> dims;
       for (std::size_t i = 0; i < NumIndices; ++i) {
         dims[i] = dimensions[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index d81197e6d..2ad52b2f9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -275,7 +275,7 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
 #endif
 
-  DSizes& operator = (const array<DenseIndex, NumDims>& other) {
+  EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
     *static_cast<Base*>(this) = other;
     return *this;
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index dfe85602a..1b227e8c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -112,9 +112,9 @@ class TensorStorage<T, DSizes<DenseIndex, NumIndices_>, Options_>
-    EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const {return m_dimensions;}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const {return m_dimensions;}
 
-    void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions)
+    EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions)
     {
       const DenseIndex currentSz = internal::array_prod(m_dimensions);
       if(size != currentSz)
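Note: with resize() and the DSizes assignment annotated, a Tensor owned by device code can be reshaped there as well. Host-side shape manipulation exercising the same entry points, as a minimal sketch assuming the C++11 variadic constructors are available:

  #include <unsupported/Eigen/CXX11/Tensor>

  void reshape_demo() {
    Eigen::Tensor<float, 3> t(2, 3, 4);
    Eigen::DSizes<Eigen::DenseIndex, 3> dims(4, 3, 2);
    // Goes through TensorStorage::resize; it only reallocates when the
    // total element count changes (here 24 == 24, so the buffer is kept).
    t.resize(dims);
  }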
From 6620aaa4b3ad3ae9f38b7b6213e874021579bcd7 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 Feb 2015 14:34:42 -0800
Subject: [PATCH 212/214] Silenced a few compilation warnings generated by nvcc

---
 unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h        | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h    | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h    | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h      | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h     | 2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h       | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 57a14a037..a1dec76d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -240,7 +240,7 @@ struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
-  CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
 
 private:
   TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index a9501336e..41a36cb75 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -136,7 +136,7 @@ struct TensorEvaluator<TensorForcedEvalOp<ArgType>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
-  Scalar* data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
 
 private:
   TensorEvaluator<ArgType, Device> m_impl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index c00810594..c119b30e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -148,7 +148,7 @@ struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
     return m_impl.template packet<LoadMode>(index);
   }
 
-  CoeffReturnType* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 1191b2411..a93f48ccb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -145,7 +145,7 @@ struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.template packet<LoadMode>(index);
   }
 
-  CoeffReturnType* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 7643d4cdc..de5747905 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -360,7 +360,7 @@ struct TensorEvaluator<TensorReductionOp<Op, Dims, ArgType>, Device>
     return rslt;
   }
 
-  Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 private:
   template <int, typename, typename> friend struct internal::GenericDimReducer;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 82969b4c0..ad21e966b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -190,7 +190,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
-  Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
 protected:
   Dimensions m_dimensions;
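Note: nvcc warns about "calling a __host__ function from a __host__ __device__ function" when an unannotated accessor such as data() is reachable from device-compiled evaluator code, even on paths that never execute on the GPU. The pattern in miniature — a hypothetical reduction in CUDA source, not Eigen code:

  struct Evaluator {
    // Host-only: nvcc warns when this is reached from device code.
    float* data() const { return 0; }
    // Annotated: compiled for both sides, no warning.
    __host__ __device__ float* data_annotated() const { return 0; }
  };

  __host__ __device__ float* use(const Evaluator& e) {
    return e.data_annotated();  // calling e.data() here would draw the warning
  }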
From 4470c9997559522e9b81810948d9783b58444ae4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 Feb 2015 14:40:18 -0800
Subject: [PATCH 213/214] Added a test to validate tensor casting on cuda
 devices

---
 unsupported/test/cxx11_tensor_cuda.cpp | 40 ++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/unsupported/test/cxx11_tensor_cuda.cpp b/unsupported/test/cxx11_tensor_cuda.cpp
index 059d23de1..8c1ca1bf8 100644
--- a/unsupported/test/cxx11_tensor_cuda.cpp
+++ b/unsupported/test/cxx11_tensor_cuda.cpp
@@ -460,6 +460,45 @@ static void test_cuda_constant_broadcast()
   }
 }
 
+
+void test_cuda_cast()
+{
+  Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  in.setRandom();
+
+  std::size_t in_bytes = in.size() * sizeof(double);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  double* d_in;
+  float* d_out;
+  cudaMalloc((void**)(&d_in), in_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+
+  cudaStream_t stream;
+  assert(cudaStreamCreate(&stream) == cudaSuccess);
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in.template cast<float>();
+
+  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k))));
+      }
+    }
+  }
+}
+
+
 void test_cxx11_tensor_cuda()
 {
   CALL_SUBTEST(test_cuda_elementwise_small());
@@ -471,4 +510,5 @@ void test_cxx11_tensor_cuda()
   CALL_SUBTEST(test_cuda_convolution_2d());
   CALL_SUBTEST(test_cuda_convolution_3d());
   CALL_SUBTEST(test_cuda_constant_broadcast());
+  CALL_SUBTEST(test_cuda_cast());
 }
From 409547a0c83604b6dea70b8523674ac19e2af958 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 12 Feb 2015 21:04:31 +0100
Subject: [PATCH 214/214] update EIGEN_FAST_MATH documentation

---
 Eigen/src/Core/util/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 13f8fdd4e..27167d3dc 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -385,7 +385,7 @@
 /** Allows to disable some optimizations which might affect the accuracy of the result.
   * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
   * They currently include:
-  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled.
   */
 #ifndef EIGEN_FAST_MATH
 #define EIGEN_FAST_MATH 1
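Note: the EIGEN_FAST_MATH block above only supplies the default; thanks to the #ifndef guard, a client can opt out before including any Eigen header. Illustrative client code, not part of the patch:

  // Trade the fast, slightly less accurate SSE sin/cos for the precise versions.
  #define EIGEN_FAST_MATH 0
  #include <Eigen/Core>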