mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-11 03:09:01 +08:00
Merged in deven-amd/eigen/HIP_fixes (pull request PR-518)
PR with HIP specific fixes (for the eigen nightly regression failures in HIP mode)
This commit is contained in:
commit
564ca71e39
@ -379,10 +379,12 @@
|
|||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
#if defined(EIGEN_HIPCC)
|
||||||
|
|
||||||
#define EIGEN_VECTORIZE_GPU
|
#define EIGEN_VECTORIZE_GPU
|
||||||
#include <hip/hip_vector_types.h>
|
#include <hip/hip_vector_types.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
|
||||||
#define EIGEN_HAS_HIP_FP16
|
#define EIGEN_HAS_HIP_FP16
|
||||||
#include <hip/hip_fp16.h>
|
#include <hip/hip_fp16.h>
|
||||||
|
@ -96,10 +96,16 @@ inline void throw_std_bad_alloc()
|
|||||||
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
|
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
|
||||||
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
|
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
|
||||||
*/
|
*/
|
||||||
inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
|
EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
|
||||||
{
|
{
|
||||||
eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
|
eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
|
||||||
|
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
void *original = ::malloc(size+alignment);
|
||||||
|
#else
|
||||||
void *original = std::malloc(size+alignment);
|
void *original = std::malloc(size+alignment);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (original == 0) return 0;
|
if (original == 0) return 0;
|
||||||
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
|
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
|
||||||
*(reinterpret_cast<void**>(aligned) - 1) = original;
|
*(reinterpret_cast<void**>(aligned) - 1) = original;
|
||||||
@ -107,9 +113,15 @@ inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = E
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** \internal Frees memory allocated with handmade_aligned_malloc */
|
/** \internal Frees memory allocated with handmade_aligned_malloc */
|
||||||
inline void handmade_aligned_free(void *ptr)
|
EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
|
||||||
{
|
{
|
||||||
if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
|
if (ptr) {
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
::free(*(reinterpret_cast<void**>(ptr) - 1));
|
||||||
|
#else
|
||||||
|
std::free(*(reinterpret_cast<void**>(ptr) - 1));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** \internal
|
/** \internal
|
||||||
|
@ -186,21 +186,21 @@ struct TensorContractionKernel {
|
|||||||
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
|
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
|
||||||
GebpKernel;
|
GebpKernel;
|
||||||
|
|
||||||
EIGEN_DONT_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
|
||||||
static void packLhs(LhsScalar* lhsBlock,
|
static void packLhs(LhsScalar* lhsBlock,
|
||||||
const typename LhsMapper::SubMapper& data_mapper,
|
const typename LhsMapper::SubMapper& data_mapper,
|
||||||
const StorageIndex depth, const StorageIndex rows) {
|
const StorageIndex depth, const StorageIndex rows) {
|
||||||
LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
|
LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DONT_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
|
||||||
static void packRhs(RhsScalar* rhsBlock,
|
static void packRhs(RhsScalar* rhsBlock,
|
||||||
const typename RhsMapper::SubMapper& data_mapper,
|
const typename RhsMapper::SubMapper& data_mapper,
|
||||||
const StorageIndex depth, const StorageIndex cols) {
|
const StorageIndex depth, const StorageIndex cols) {
|
||||||
RhsPacker()(rhsBlock, data_mapper, depth, cols);
|
RhsPacker()(rhsBlock, data_mapper, depth, cols);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DONT_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
|
||||||
static void invoke(const OutputMapper& output_mapper,
|
static void invoke(const OutputMapper& output_mapper,
|
||||||
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
|
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
|
||||||
const StorageIndex rows, const StorageIndex depth,
|
const StorageIndex rows, const StorageIndex depth,
|
||||||
|
@ -218,6 +218,7 @@ struct InnerMostDimReducer<Self, Op, false, true> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if !defined(EIGEN_HIPCC)
|
||||||
template <typename Self, typename Op>
|
template <typename Self, typename Op>
|
||||||
struct InnerMostDimReducer<Self, Op, true, true> {
|
struct InnerMostDimReducer<Self, Op, true, true> {
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
|
||||||
@ -257,7 +258,8 @@ struct InnerMostDimReducer<Self, Op, true, true> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
|
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
|
||||||
struct InnerMostDimPreserver {
|
struct InnerMostDimPreserver {
|
||||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
|
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
|
||||||
|
@ -292,7 +292,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename Op>
|
template <typename Op>
|
||||||
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
|
__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
|
||||||
eigen_assert(threadIdx.x == 1);
|
eigen_assert(threadIdx.x == 1);
|
||||||
half tmp = __low2half(*scratch);
|
half tmp = __low2half(*scratch);
|
||||||
reducer.reduce(__high2half(*scratch), &tmp);
|
reducer.reduce(__high2half(*scratch), &tmp);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user