mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-04-29 23:34:12 +08:00
removing the *Hip files from the unsupported/Eigen/CXX11/src/Tensor and unsupported/test directories
This commit is contained in:
parent
7e41c8f1a9
commit
cfdabbcc8f
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,352 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
|
||||
|
||||
#if defined(EIGEN_HIPCC)
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
#endif
|
||||
#include <unistd.h> //for sleep function
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
static const int kHipScratchSize = 1024;
|
||||
|
||||
// This defines an interface that GPUDevice can take to use
|
||||
// HIP streams underneath.
|
||||
class StreamInterface {
|
||||
public:
|
||||
virtual ~StreamInterface() {}
|
||||
|
||||
virtual const hipStream_t& stream() const = 0;
|
||||
virtual const hipDeviceProp_t& deviceProperties() const = 0;
|
||||
|
||||
// Allocate memory on the actual device where the computation will run
|
||||
virtual void* allocate(size_t num_bytes) const = 0;
|
||||
virtual void deallocate(void* buffer) const = 0;
|
||||
|
||||
// Return a scratchpad buffer of size 1k
|
||||
virtual void* scratchpad() const = 0;
|
||||
|
||||
// Return a semaphore. The semaphore is initially initialized to 0, and
|
||||
// each kernel using it is responsible for resetting to 0 upon completion
|
||||
// to maintain the invariant that the semaphore is always equal to 0 upon
|
||||
// each kernel start.
|
||||
virtual unsigned int* semaphore() const = 0;
|
||||
};
|
||||
|
||||
static hipDeviceProp_t* m_deviceProperties;
|
||||
static bool m_devicePropInitialized = false;
|
||||
|
||||
static void initializeDeviceProp() {
|
||||
if (!m_devicePropInitialized) {
|
||||
// Attempts to ensure proper behavior in the case of multiple threads
|
||||
// calling this function simultaneously. This would be trivial to
|
||||
// implement if we could use std::mutex, but unfortunately mutex don't
|
||||
// compile with nvcc, so we resort to atomics and thread fences instead.
|
||||
// Note that if the caller uses a compiler that doesn't support c++11 we
|
||||
// can't ensure that the initialization is thread safe.
|
||||
#if 0 && __cplusplus >= 201103L
|
||||
static std::atomic<bool> first(true);
|
||||
if (first.exchange(false)) {
|
||||
#else
|
||||
static bool first = true;
|
||||
if (first) {
|
||||
first = false;
|
||||
#endif
|
||||
// We're the first thread to reach this point.
|
||||
int num_devices;
|
||||
hipError_t status = hipGetDeviceCount(&num_devices);
|
||||
if (status != hipSuccess) {
|
||||
std::cerr << "Failed to get the number of HIP devices: "
|
||||
<< hipGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == hipSuccess);
|
||||
}
|
||||
m_deviceProperties = new hipDeviceProp_t[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = hipGetDeviceProperties(&m_deviceProperties[i], i);
|
||||
if (status != hipSuccess) {
|
||||
std::cerr << "Failed to initialize HIP device #"
|
||||
<< i
|
||||
<< ": "
|
||||
<< hipGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == hipSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0 && __cplusplus >= 201103L
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
#endif
|
||||
m_devicePropInitialized = true;
|
||||
} else {
|
||||
// Wait for the other thread to inititialize the properties.
|
||||
while (!m_devicePropInitialized) {
|
||||
#if 0 && __cplusplus >= 201103L
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
#endif
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00;
|
||||
|
||||
class HipStreamDevice : public StreamInterface {
|
||||
public:
|
||||
// Use the default stream on the current device
|
||||
HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
|
||||
hipGetDevice(&device_);
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the default stream on the specified device
|
||||
HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
initializeDeviceProp();
|
||||
}
|
||||
// Use the specified stream. Note that it's the
|
||||
// caller responsibility to ensure that the stream can run on
|
||||
// the specified device. If no device is specified the code
|
||||
// assumes that the stream is associated to the current gpu device.
|
||||
HipStreamDevice(const hipStream_t* stream, int device = -1)
|
||||
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
|
||||
if (device < 0) {
|
||||
hipGetDevice(&device_);
|
||||
} else {
|
||||
int num_devices;
|
||||
hipError_t err = hipGetDeviceCount(&num_devices);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
assert(device < num_devices);
|
||||
device_ = device;
|
||||
}
|
||||
initializeDeviceProp();
|
||||
}
|
||||
|
||||
virtual ~HipStreamDevice() {
|
||||
if (scratch_) {
|
||||
deallocate(scratch_);
|
||||
}
|
||||
}
|
||||
|
||||
const hipStream_t& stream() const { return *stream_; }
|
||||
const hipDeviceProp_t& deviceProperties() const {
|
||||
return m_deviceProperties[device_];
|
||||
}
|
||||
virtual void* allocate(size_t num_bytes) const {
|
||||
hipError_t err = hipSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
void* result;
|
||||
err = hipMalloc(&result, num_bytes);
|
||||
assert(err == hipSuccess);
|
||||
assert(result != NULL);
|
||||
return result;
|
||||
}
|
||||
virtual void deallocate(void* buffer) const {
|
||||
hipError_t err = hipSetDevice(device_);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
assert(buffer != NULL);
|
||||
err = hipFree(buffer);
|
||||
assert(err == hipSuccess);
|
||||
}
|
||||
|
||||
virtual void* scratchpad() const {
|
||||
if (scratch_ == NULL) {
|
||||
scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
|
||||
}
|
||||
return scratch_;
|
||||
}
|
||||
|
||||
virtual unsigned int* semaphore() const {
|
||||
if (semaphore_ == NULL) {
|
||||
char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
|
||||
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
|
||||
//hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
|
||||
hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
}
|
||||
return semaphore_;
|
||||
}
|
||||
|
||||
private:
|
||||
const hipStream_t* stream_;
|
||||
int device_;
|
||||
mutable void* scratch_;
|
||||
mutable unsigned int* semaphore_;
|
||||
};
|
||||
|
||||
struct GpuDevice {
|
||||
// The StreamInterface is not owned: the caller is
|
||||
// responsible for its initialization and eventual destruction.
|
||||
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
|
||||
eigen_assert(stream);
|
||||
}
|
||||
// TODO(bsteiner): This is an internal API, we should not expose it.
|
||||
EIGEN_STRONG_INLINE const hipStream_t& stream() const {
|
||||
return stream_->stream();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
return stream_->allocate(num_bytes);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
|
||||
stream_->deallocate(buffer);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void* scratchpad() const {
|
||||
return stream_->scratchpad();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
|
||||
return stream_->semaphore();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
|
||||
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice,
|
||||
stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
|
||||
hipError_t err =
|
||||
hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
|
||||
hipError_t err =
|
||||
hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream());
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
|
||||
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
//TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream());
|
||||
hipError_t err = hipMemset(buffer, c, n);
|
||||
EIGEN_UNUSED_VARIABLE(err)
|
||||
assert(err == hipSuccess);
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t numThreads() const {
|
||||
// FIXME
|
||||
return 32;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
|
||||
// FIXME
|
||||
return 48*1024;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
|
||||
// We won't try to take advantage of the l2 cache for the time being, and
|
||||
// there is no l3 cache on hip devices.
|
||||
return firstLevelCacheSize();
|
||||
}
|
||||
|
||||
// FIXME - this will move into HIP
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#undef assert
|
||||
#define assert(COND)
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
|
||||
#if defined(EIGEN_HIPCC) && \
|
||||
!defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
hipError_t err = hipStreamSynchronize(stream_->stream());
|
||||
if (err != hipSuccess) {
|
||||
std::cerr << "Error detected in HIP stream: "
|
||||
<< hipGetErrorString(err)
|
||||
<< std::endl;
|
||||
assert(err == hipSuccess);
|
||||
}
|
||||
#else
|
||||
assert(false && "The default device should be used instead to generate kernel code");
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
|
||||
return stream_->deviceProperties().multiProcessorCount;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
|
||||
return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
|
||||
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
return stream_->deviceProperties().sharedMemPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
|
||||
return stream_->deviceProperties().major;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
return stream_->deviceProperties().minor;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
}
|
||||
|
||||
// This function checks if the HIP runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
inline bool ok() const {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
hipError_t error = hipStreamQuery(stream_->stream());
|
||||
return (error == hipSuccess) || (error == hipErrorNotReady);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
const StreamInterface* stream_;
|
||||
int max_blocks_;
|
||||
};
|
||||
|
||||
#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
|
||||
assert(hipGetLastError() == hipSuccess);
|
||||
|
||||
|
||||
// FIXME: Should be device and kernel specific.
|
||||
#if defined(EIGEN_HIPCC)
|
||||
static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
|
||||
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
hipError_t status = hipDeviceSetSharedMemConfig(config);
|
||||
EIGEN_UNUSED_VARIABLE(status)
|
||||
assert(status == hipSuccess);
|
||||
#else
|
||||
EIGEN_UNUSED_VARIABLE(config)
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
|
@ -1,815 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIPCC)
|
||||
#define HIP_WARP_SIZE 64
|
||||
#endif
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
|
||||
// Full reducers for GPU, don't vectorize for now
|
||||
|
||||
// Reducer function that enables multiple hip thread to safely accumulate at the same
|
||||
// output address. It basically reads the current value of the output variable, and
|
||||
// attempts to update it with the new value. If in the meantime another hip thread
|
||||
// updated the content of the output address it will try again.
|
||||
template <typename T, typename R>
|
||||
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
|
||||
if (sizeof(T) == 4)
|
||||
{
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
unsigned int newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned int readback;
|
||||
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (sizeof(T) == 8) {
|
||||
unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
|
||||
unsigned long long newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned long long readback;
|
||||
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
assert(0 && "Wordsize not supported");
|
||||
}
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
// We extend atomicExch to support extra data types
|
||||
template <typename Type>
|
||||
__device__ inline Type atomicExchCustom(Type* address, Type val) {
|
||||
return atomicExch(address, val);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline double atomicExchCustom(double* address, double val) {
|
||||
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
|
||||
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
|
||||
}
|
||||
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
template <template <typename T> class R>
|
||||
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
|
||||
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
|
||||
unsigned int newval = oldval;
|
||||
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
unsigned int readback;
|
||||
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
|
||||
oldval = readback;
|
||||
newval = oldval;
|
||||
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
|
||||
if (newval == oldval) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
|
||||
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
|
||||
atomicAdd(output, accum);
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename CoeffType, typename Index>
|
||||
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
|
||||
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
const Index num_threads = hipBlockDim_x * hipGridDim_x;
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernel(const Self input, Index num_coeffs,
|
||||
typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
|
||||
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
|
||||
// Initialize the output value
|
||||
const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
|
||||
if (hipGridDim_x == 1) {
|
||||
if (first_index == 0) {
|
||||
*output = reducer.initialize();
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (hipThreadIdx_x == 0) {
|
||||
unsigned int block = atomicCAS(semaphore, 0u, 1u);
|
||||
if (block == 0) {
|
||||
// We're the first block to run, initialize the output value
|
||||
atomicExchCustom(output, reducer.initialize());
|
||||
__threadfence();
|
||||
atomicExch(semaphore, 2u);
|
||||
}
|
||||
else {
|
||||
// Wait for the first block to initialize the output value.
|
||||
// Use atomicCAS here to ensure that the reads aren't cached
|
||||
unsigned int val;
|
||||
do {
|
||||
val = atomicCAS(semaphore, 2u, 2u);
|
||||
}
|
||||
while (val < 2u);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
|
||||
|
||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
|
||||
for (Index i = 0; i < max_iter; i+=BlockSize) {
|
||||
const Index index = first_index + i;
|
||||
eigen_assert(index < num_coeffs);
|
||||
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
|
||||
reducer.reduce(val, &accum);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
|
||||
// XXX use std::is_floating_point to determine the type of accum
|
||||
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
|
||||
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
|
||||
}
|
||||
}
|
||||
|
||||
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
|
||||
atomicReduce(output, accum, reducer);
|
||||
}
|
||||
|
||||
if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
|
||||
// Let the last block reset the semaphore
|
||||
atomicInc(semaphore, hipGridDim_x + 1);
|
||||
__threadfence_system();
|
||||
}
|
||||
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
|
||||
eigen_assert(hipBlockDim_x == 1);
|
||||
eigen_assert(hipGridDim_x == 1);
|
||||
if (num_coeffs % 2 != 0) {
|
||||
half last = input.m_impl.coeff(num_coeffs-1);
|
||||
*scratch = __halves2half2(last, reducer.initialize());
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<half2>();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
|
||||
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
const Index num_threads = hipBlockDim_x * hipGridDim_x;
|
||||
const Index num_packets = num_coeffs / 2;
|
||||
for (Index i = thread_id; i < num_packets; i += num_threads) {
|
||||
((half2*)output)[i] = reducer.template initializePacket<half2>();
|
||||
}
|
||||
|
||||
if (thread_id == 0 && num_coeffs % 2 != 0) {
|
||||
output[num_coeffs-1] = reducer.initialize();
|
||||
}
|
||||
}
|
||||
|
||||
template <int BlockSize, int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
|
||||
half* output, half2* scratch) {
|
||||
eigen_assert(NumPerThread % 2 == 0);
|
||||
|
||||
const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
|
||||
|
||||
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
|
||||
if (hipGridDim_x == 1 && first_index == 0) {
|
||||
if (num_coeffs % 2 != 0) {
|
||||
half last = input.m_impl.coeff(num_coeffs-1);
|
||||
*scratch = __halves2half2(last, reducer.initialize());
|
||||
} else {
|
||||
*scratch = reducer.template initializePacket<half2>();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
half2 accum = reducer.template initializePacket<half2>();
|
||||
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
|
||||
for (Index i = 0; i < max_iter; i += BlockSize) {
|
||||
const Index index = first_index + 2*i;
|
||||
eigen_assert(index + 1 < num_coeffs);
|
||||
half2 val = input.m_impl.template packet<Unaligned>(index);
|
||||
reducer.reducePacket(val, &accum);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
wka_in.h = accum;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
|
||||
reducer.reducePacket(wka_out.h, &accum);
|
||||
}
|
||||
|
||||
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
|
||||
atomicReduce(scratch, accum, reducer);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (hipGridDim_x == 1 && first_index == 0) {
|
||||
half tmp = __low2half(*scratch);
|
||||
reducer.reduce(__high2half(*scratch), &tmp);
|
||||
*output = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
|
||||
eigen_assert(hipThreadIdx_x == 1);
|
||||
half tmp = __low2half(*scratch);
|
||||
reducer.reduce(__high2half(*scratch), &tmp);
|
||||
*output = tmp;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct FullReductionLauncher {
|
||||
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
|
||||
assert(false && "Should only be called on doubles, floats and half floats");
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
std::mutex __eigen_reduction_hip_mutex;
|
||||
}
|
||||
|
||||
// Specialization for float and double
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
|
||||
struct FullReductionLauncher<
|
||||
Self, Op, OutputType, PacketAccess,
|
||||
typename internal::enable_if<
|
||||
internal::is_same<float, OutputType>::value ||
|
||||
internal::is_same<double, OutputType>::value,
|
||||
void>::type> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
|
||||
// guard FullReductionLauncher with a mutex so only 1 FullReductionKernel
|
||||
// is dispatched at a time
|
||||
std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex);
|
||||
|
||||
typedef typename Self::Index Index;
|
||||
typedef typename Self::CoeffReturnType Scalar;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
|
||||
unsigned int* semaphore = NULL;
|
||||
if (num_blocks > 1) {
|
||||
semaphore = device.semaphore();
|
||||
|
||||
unsigned int semaphore_host = 0xFF;
|
||||
hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
|
||||
if (semaphore_host != 0) {
|
||||
std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
|
||||
<< semaphore_host << "\n";
|
||||
// wait for all commands on the device to complete so semaphore value
|
||||
// is reset to 0
|
||||
hipDeviceSynchronize();
|
||||
|
||||
// read again
|
||||
hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
|
||||
if (semaphore_host != 0) {
|
||||
std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
|
||||
<< semaphore_host << ", apply manual override to 0\n";
|
||||
|
||||
// force set semaphore value to be 0
|
||||
semaphore_host = 0;
|
||||
hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
|
||||
dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
|
||||
assert(false && "Should not be called since there is no packet accessor");
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
half2* scratch = static_cast<half2*>(device.scratchpad());
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race conditions between multiple thread blocks.
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
|
||||
dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
|
||||
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
|
||||
dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Self, typename Op, bool Vectorizable>
|
||||
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
|
||||
// Unfortunately nvidia doesn't support well exotic types such as complex,
|
||||
// so reduce the scope of the optimized version of the code to the simple cases
|
||||
// of doubles, floats and half floats
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif
|
||||
|
||||
template <typename OutputType>
|
||||
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
|
||||
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
|
||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
// Don't crash when we're called with an input tensor of size 0.
|
||||
if (num_coeffs == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
|
||||
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
|
||||
typedef typename Self::CoeffReturnType Type;
|
||||
eigen_assert(hipBlockDim_y == 1);
|
||||
eigen_assert(hipBlockDim_z == 1);
|
||||
eigen_assert(hipGridDim_y == 1);
|
||||
eigen_assert(hipGridDim_z == 1);
|
||||
|
||||
const int unroll_times = 16;
|
||||
eigen_assert(NumPerThread % unroll_times == 0);
|
||||
|
||||
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
|
||||
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
|
||||
|
||||
const Index num_threads = hipBlockDim_x * hipGridDim_x;
|
||||
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (hipGridDim_x == 1) {
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
|
||||
const Index row = i / input_col_blocks;
|
||||
|
||||
if (row < num_preserved_coeffs) {
|
||||
const Index col_block = i % input_col_blocks;
|
||||
const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
|
||||
|
||||
Type reduced_val = reducer.initialize();
|
||||
|
||||
for (Index j = 0; j < NumPerThread; j += unroll_times) {
|
||||
const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
|
||||
if (last_col >= num_coeffs_to_reduce) {
|
||||
for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
|
||||
const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
|
||||
reducer.reduce(val, &reduced_val);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
// Faster version of the loop with no branches after unrolling.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < unroll_times; ++k) {
|
||||
const Index col = col_begin + hipBlockDim_x * (j + k);
|
||||
reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
|
||||
// XXX use std::is_floating_point to determine the type of reduced_val
|
||||
if (std::is_floating_point<Type>::value) {
|
||||
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
|
||||
}
|
||||
}
|
||||
|
||||
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
|
||||
atomicReduce(&(output[row]), reduced_val, reducer);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0 && "Shouldn't be called on unsupported device");
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
half* output) {
|
||||
eigen_assert(hipBlockDim_y == 1);
|
||||
eigen_assert(hipBlockDim_z == 1);
|
||||
eigen_assert(hipGridDim_y == 1);
|
||||
eigen_assert(hipGridDim_z == 1);
|
||||
|
||||
const int unroll_times = 16;
|
||||
eigen_assert(NumPerThread % unroll_times == 0);
|
||||
eigen_assert(unroll_times % 2 == 0);
|
||||
|
||||
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
|
||||
const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
|
||||
|
||||
const Index num_threads = hipBlockDim_x * hipGridDim_x;
|
||||
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (hipGridDim_x == 1) {
|
||||
Index i = 2*thread_id;
|
||||
for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
|
||||
half* loc = output + i;
|
||||
*((half2*)loc) = reducer.template initializePacket<half2>();
|
||||
}
|
||||
if (i < num_preserved_coeffs) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
|
||||
const Index row = 2 * (i / input_col_blocks);
|
||||
|
||||
if (row + 1 < num_preserved_coeffs) {
|
||||
const Index col_block = i % input_col_blocks;
|
||||
const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
|
||||
|
||||
half2 reduced_val1 = reducer.template initializePacket<half2>();
|
||||
half2 reduced_val2 = reducer.template initializePacket<half2>();
|
||||
|
||||
for (Index j = 0; j < NumPerThread; j += unroll_times) {
|
||||
const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
|
||||
if (last_col >= num_coeffs_to_reduce) {
|
||||
Index col = col_begin + hipBlockDim_x * j;
|
||||
for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
|
||||
const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
|
||||
reducer.reducePacket(val1, &reduced_val1);
|
||||
const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
|
||||
reducer.reducePacket(val2, &reduced_val2);
|
||||
}
|
||||
if (col < num_coeffs_to_reduce) {
|
||||
// Peel;
|
||||
const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
|
||||
const half2 val1 = __halves2half2(last1, reducer.initialize());
|
||||
reducer.reducePacket(val1, &reduced_val1);
|
||||
const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
|
||||
const half2 val2 = __halves2half2(last2, reducer.initialize());
|
||||
reducer.reducePacket(val2, &reduced_val2);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
// Faster version of the loop with no branches after unrolling.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < unroll_times; ++k) {
|
||||
const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
|
||||
reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
|
||||
reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
|
||||
wka_in.h = reduced_val1;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val1);
|
||||
|
||||
wka_in.h = reduced_val2;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val2);
|
||||
}
|
||||
|
||||
half val1 = __low2half(reduced_val1);
|
||||
reducer.reduce(__high2half(reduced_val1), &val1);
|
||||
half val2 = __low2half(reduced_val2);
|
||||
reducer.reduce(__high2half(reduced_val2), &val2);
|
||||
half2 val = __halves2half2(val1, val2);
|
||||
|
||||
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
|
||||
half* loc = output + row;
|
||||
atomicReduce((half2*)loc, val, reducer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
|
||||
struct InnerReductionLauncher {
|
||||
static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for float and double
|
||||
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
|
||||
struct InnerReductionLauncher<
|
||||
Self, Op, OutputType, PacketAccess,
|
||||
typename internal::enable_if<
|
||||
internal::is_same<float, OutputType>::value ||
|
||||
internal::is_same<double, OutputType>::value,
|
||||
void>::type> {
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 128;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
|
||||
dim3(num_blocks), dim3(1024), 0, device.stream(),
|
||||
reducer.initialize(), num_preserved_vals, output);
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
|
||||
num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
|
||||
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should not be called since there is no packet accessor");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
if (num_preserved_vals % 2 != 0) {
|
||||
// Not supported yet, revert to the slower code path
|
||||
return true;
|
||||
}
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = /*256*/128;
|
||||
const int num_per_thread = /*128*/64;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs outside the reduction kernel when we can't be sure that there
|
||||
// won't be a race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
|
||||
dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
|
||||
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct InnerReducer<Self, Op, GpuDevice> {
|
||||
// Unfortunately nvidia doesn't support well exotic types such as complex,
|
||||
// so reduce the scope of the optimized version of the code to the simple case
|
||||
// of floats and half floats.
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
|
||||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
|
||||
#else
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
#endif
|
||||
|
||||
template <typename OutputType>
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
|
||||
const Index num_coeffs = array_prod(self.m_impl.dimensions());
|
||||
// Don't crash when we're called with an input tensor of size 0.
|
||||
if (num_coeffs == 0) {
|
||||
return true;
|
||||
}
|
||||
// It's faster to use the usual code.
|
||||
if (num_coeffs_to_reduce <= 128) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
|
||||
}
|
||||
};
|
||||
|
||||
template <int NumPerThread, typename Self,
|
||||
typename Reducer, typename Index>
|
||||
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
|
||||
typename Self::CoeffReturnType* output) {
|
||||
const Index num_threads = hipBlockDim_x * hipGridDim_x;
|
||||
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
// Initialize the output values if they weren't initialized by the ReductionInitKernel
|
||||
if (hipGridDim_x == 1) {
|
||||
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
|
||||
output[i] = reducer.initialize();
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Do the reduction.
|
||||
const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
|
||||
for (Index i = thread_id; i < max_iter; i += num_threads) {
|
||||
const Index input_col = i % num_preserved_coeffs;
|
||||
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
|
||||
typename Self::CoeffReturnType reduced_val = reducer.initialize();
|
||||
const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
|
||||
for (Index j = input_row; j < max_row; j++) {
|
||||
typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
|
||||
reducer.reduce(val, &reduced_val);
|
||||
}
|
||||
atomicReduce(&(output[input_col]), reduced_val, reducer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename Self, typename Op>
|
||||
struct OuterReducer<Self, Op, GpuDevice> {
|
||||
// Unfortunately nvidia doesn't support well exotic types such as complex,
|
||||
// so reduce the scope of the optimized version of the code to the simple case
|
||||
// of floats.
|
||||
static const bool HasOptimizedImplementation = !Op::IsStateful &&
|
||||
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
|
||||
internal::is_same<typename Self::CoeffReturnType, double>::value);
|
||||
template <typename Device, typename OutputType>
|
||||
static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
|
||||
assert(false && "Should only be called to reduce doubles or floats on a gpu device");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
|
||||
typedef typename Self::Index Index;
|
||||
|
||||
// It's faster to use the usual code.
|
||||
if (num_coeffs_to_reduce <= 32) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
|
||||
const int block_size = 256;
|
||||
const int num_per_thread = 16;
|
||||
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / block_size;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
|
||||
if (num_blocks > 1) {
|
||||
// We initialize the outputs in the reduction kernel itself when we don't have to worry
|
||||
// about race conditions between multiple thread blocks.
|
||||
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
|
||||
const int max_blocks = device.getNumHipMultiProcessors() *
|
||||
device.maxHipThreadsPerMultiProcessor() / 1024;
|
||||
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
|
||||
dim3(num_blocks), dim3(1024), 0, device.stream(),
|
||||
reducer.initialize(), num_preserved_vals, output);
|
||||
}
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
|
||||
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
|
@ -1,251 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_hip
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template <int Layout>
|
||||
void test_hip_simple_argmax()
|
||||
{
|
||||
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
|
||||
Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
|
||||
in.setRandom();
|
||||
in *= in.constant(100.0);
|
||||
in(0, 0, 0) = -1000.0;
|
||||
in(71, 52, 96) = 1000.0;
|
||||
|
||||
std::size_t in_bytes = in.size() * sizeof(double);
|
||||
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
|
||||
|
||||
double* d_in;
|
||||
DenseIndex* d_out_max;
|
||||
DenseIndex* d_out_min;
|
||||
hipMalloc((void**)(&d_in), in_bytes);
|
||||
hipMalloc((void**)(&d_out_max), out_bytes);
|
||||
hipMalloc((void**)(&d_out_min), out_bytes);
|
||||
|
||||
hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
|
||||
|
||||
gpu_out_max.device(gpu_device) = gpu_in.argmax();
|
||||
gpu_out_min.device(gpu_device) = gpu_in.argmin();
|
||||
|
||||
assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
|
||||
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
|
||||
|
||||
hipFree(d_in);
|
||||
hipFree(d_out_max);
|
||||
hipFree(d_out_min);
|
||||
}
|
||||
|
||||
template <int DataLayout>
|
||||
void test_hip_argmax_dim()
|
||||
{
|
||||
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
|
||||
std::vector<int> dims;
|
||||
dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
|
||||
|
||||
for (int dim = 0; dim < 4; ++dim) {
|
||||
tensor.setRandom();
|
||||
tensor = (tensor + tensor.constant(0.5)).log();
|
||||
|
||||
array<DenseIndex, 3> out_shape;
|
||||
for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
|
||||
|
||||
Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
|
||||
|
||||
array<DenseIndex, 4> ix;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != 0) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
|
||||
tensor(ix) = 10.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t in_bytes = tensor.size() * sizeof(float);
|
||||
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
|
||||
|
||||
float* d_in;
|
||||
DenseIndex* d_out;
|
||||
hipMalloc((void**)(&d_in), in_bytes);
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
|
||||
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
VERIFY_IS_EQUAL(tensor_arg.size(),
|
||||
size_t(2*3*5*7 / tensor.dimension(dim)));
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the first index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
|
||||
if (ix[dim] != tensor.dimension(dim) - 1) continue;
|
||||
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
|
||||
tensor(ix) = 20.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
|
||||
|
||||
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
|
||||
|
||||
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
|
||||
// Expect max to be in the last index of the reduced dimension
|
||||
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
|
||||
}
|
||||
|
||||
hipFree(d_in);
|
||||
hipFree(d_out);
|
||||
}
|
||||
}

template <int DataLayout>
void test_hip_argmin_dim()
{
  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
  std::vector<int> dims;
  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);

  for (int dim = 0; dim < 4; ++dim) {
    tensor.setRandom();
    tensor = (tensor + tensor.constant(0.5)).log();

    array<DenseIndex, 3> out_shape;
    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];

    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);

    array<DenseIndex, 4> ix;
    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != 0) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
            tensor(ix) = -10.0;
          }
        }
      }
    }

    std::size_t in_bytes = tensor.size() * sizeof(float);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);

    float* d_in;
    DenseIndex* d_out;
    hipMalloc((void**)(&d_in), in_bytes);
    hipMalloc((void**)(&d_out), out_bytes);

    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);

    Eigen::HipStreamDevice stream;
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);

    VERIFY_IS_EQUAL(tensor_arg.size(),
                    2*3*5*7 / tensor.dimension(dim));

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }

    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 5; ++k) {
          for (int l = 0; l < 7; ++l) {
            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
            if (ix[dim] != tensor.dimension(dim) - 1) continue;
            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
            tensor(ix) = -20.0;
          }
        }
      }
    }

    hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
    assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
    }

    hipFree(d_in);
    hipFree(d_out);
  }
}

void test_cxx11_tensor_hip()
{
  CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
  CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
  CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
  CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
  CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
  CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
}
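
// Illustrative sketch, not part of the original test file: the argmax-along-a-
// dimension semantics exercised above can be checked on the CPU alone. The
// reduced dimension is dropped from the output shape, and each output entry
// holds the index of the maximum along that dimension. The helper name below
// is hypothetical and is not wired into the test runner.
static void example_cpu_argmax_dim()
{
  Tensor<float, 3> t(2, 3, 4);
  t.setRandom();       // setRandom() produces values with magnitude at most 1
  t(0, 2, 0) = 1e3f;   // force a known maximum along dimension 1

  Tensor<DenseIndex, 2> arg = t.argmax(1);  // output shape is (2, 4)
  VERIFY_IS_EQUAL(arg(0, 0), 2);
}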
@ -1,79 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;

void test_hip_conversion() {
  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  int num_elem = 101;

  Tensor<float, 1> floats(num_elem);
  floats.setRandom();

  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
      d_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
      d_conv, num_elem);

  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));

  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
  gpu_conv.device(gpu_device) = gpu_half.cast<float>();

  Tensor<float, 1> initial(num_elem);
  Tensor<float, 1> final(num_elem);
  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
  gpu_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    VERIFY_IS_APPROX(initial(i), final(i));
  }

  gpu_device.deallocate(d_float);
  gpu_device.deallocate(d_half);
  gpu_device.deallocate(d_conv);
}


void test_fallback_conversion() {
  int num_elem = 101;
  Tensor<float, 1> floats(num_elem);
  floats.setRandom();

  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
  Eigen::Tensor<float, 1> conv = halfs.cast<float>();

  for (int i = 0; i < num_elem; ++i) {
    VERIFY_IS_APPROX(floats(i), conv(i));
  }
}


void test_cxx11_tensor_cast_float16_hip()
{
  CALL_SUBTEST(test_hip_conversion());
  CALL_SUBTEST(test_fallback_conversion());
}
@ -1,215 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
using Eigen::Tensor;
|
||||
typedef Tensor<float, 1>::DimensionPair DimPair;
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// The output has m_size * n_size elements, which for the sizes used in these
// tests is far more than the number of threads in a single GPU block, so the
// contraction is evaluated across many blocks.
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
|
||||
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = t_result.size() * sizeof(float);
|
||||
|
||||
float* d_t_left;
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
hipMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
hipMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
hipMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
|
||||
|
||||
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
|
||||
for (DenseIndex i = 0; i < t_result.size(); i++) {
|
||||
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
|
||||
<< " vs " << t_result_gpu(i) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
hipFree((void*)d_t_left);
|
||||
hipFree((void*)d_t_right);
|
||||
hipFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_scalar(int m_size, int k_size, int n_size)
|
||||
{
|
||||
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
|
||||
// This contracts over both dimensions of the inputs, so the output is a
// single scalar regardless of m_size, k_size and n_size.
|
||||
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
|
||||
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
|
||||
Tensor<float, 0, DataLayout> t_result;
|
||||
Tensor<float, 0, DataLayout> t_result_gpu;
|
||||
Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
|
||||
|
||||
t_left.setRandom();
|
||||
t_right.setRandom();
|
||||
|
||||
std::size_t t_left_bytes = t_left.size() * sizeof(float);
|
||||
std::size_t t_right_bytes = t_right.size() * sizeof(float);
|
||||
std::size_t t_result_bytes = sizeof(float);
|
||||
|
||||
float* d_t_left;
|
||||
float* d_t_right;
|
||||
float* d_t_result;
|
||||
|
||||
hipMalloc((void**)(&d_t_left), t_left_bytes);
|
||||
hipMalloc((void**)(&d_t_right), t_right_bytes);
|
||||
hipMalloc((void**)(&d_t_result), t_result_bytes);
|
||||
|
||||
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_left(d_t_left, m_size, k_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
|
||||
gpu_t_right(d_t_right, k_size, n_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
|
||||
gpu_t_result(d_t_result);
|
||||
|
||||
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
|
||||
t_result = t_left.contract(t_right, dims);
|
||||
|
||||
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
|
||||
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
|
||||
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
|
||||
std::cout << "mismatch detected: " << t_result()
|
||||
<< " vs " << t_result_gpu() << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
|
||||
hipFree((void*)d_t_left);
|
||||
hipFree((void*)d_t_right);
|
||||
hipFree((void*)d_t_result);
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_m() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(k, 128, 128);
|
||||
test_hip_contraction<RowMajor>(k, 128, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_k() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(128, k, 128);
|
||||
test_hip_contraction<RowMajor>(128, k, 128);
|
||||
}
|
||||
}
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_n() {
|
||||
for (int k = 32; k < 256; k++) {
|
||||
test_hip_contraction<ColMajor>(128, 128, k);
|
||||
test_hip_contraction<RowMajor>(128, 128, k);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<int DataLayout>
|
||||
void test_hip_contraction_sizes() {
|
||||
int m_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257 , 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int n_sizes[] = { 31, 39, 63, 64, 65,
|
||||
127, 129, 255, 257, 511,
|
||||
512, 513, 1023, 1024, 1025};
|
||||
|
||||
int k_sizes[] = { 31, 39, 63, 64, 65,
|
||||
95, 96, 127, 129, 255,
|
||||
257, 511, 512, 513, 1023,
|
||||
1024, 1025};
|
||||
|
||||
for (int i = 0; i < 15; i++) {
|
||||
for (int j = 0; j < 15; j++) {
|
||||
for (int k = 0; k < 17; k++) {
|
||||
test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_cxx11_tensor_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
|
||||
CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
|
||||
|
||||
CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
|
||||
CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
|
||||
|
||||
// Commenting out these tests due to long runtimes
|
||||
// CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
|
||||
// CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
|
||||
}
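
// Illustrative sketch, not part of the original test file: a rank-1
// contraction over DimPair(1, 0) pairs the column index of the left operand
// with the row index of the right operand, i.e. it is an ordinary matrix
// product. This can be checked on the CPU without any HIP device; the helper
// name below is hypothetical and is not wired into the test runner.
static void example_cpu_contraction_is_matmul()
{
  Tensor<float, 2> a(2, 3);
  Tensor<float, 2> b(3, 4);
  a.setRandom();
  b.setRandom();

  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
  Tensor<float, 2> c = a.contract(b, dims);   // result has shape (2, 4)

  // Spot-check one entry against the explicit dot product it should equal.
  float expected = 0.0f;
  for (int k = 0; k < 3; ++k) expected += a(0, k) * b(k, 0);
  VERIFY_IS_APPROX(c(0, 0), expected);
}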
|
@ -1,389 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_device
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::RowMajor;
|
||||
|
||||
// Context for evaluation on cpu
|
||||
struct CPUContext {
|
||||
CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
|
||||
kernel_1d_(0) = 3.14f;
|
||||
kernel_1d_(1) = 2.7f;
|
||||
|
||||
kernel_2d_(0,0) = 3.14f;
|
||||
kernel_2d_(1,0) = 2.7f;
|
||||
kernel_2d_(0,1) = 0.2f;
|
||||
kernel_2d_(1,1) = 7.0f;
|
||||
|
||||
kernel_3d_(0,0,0) = 3.14f;
|
||||
kernel_3d_(0,1,0) = 2.7f;
|
||||
kernel_3d_(0,0,1) = 0.2f;
|
||||
kernel_3d_(0,1,1) = 7.0f;
|
||||
kernel_3d_(1,0,0) = -1.0f;
|
||||
kernel_3d_(1,1,0) = -0.3f;
|
||||
kernel_3d_(1,0,1) = -0.7f;
|
||||
kernel_3d_(1,1,1) = -0.5f;
|
||||
}
|
||||
|
||||
const Eigen::DefaultDevice& device() const { return cpu_device_; }
|
||||
|
||||
const Eigen::Tensor<float, 3>& in1() const { return in1_; }
|
||||
const Eigen::Tensor<float, 3>& in2() const { return in2_; }
|
||||
Eigen::Tensor<float, 3>& out() { return out_; }
|
||||
const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
|
||||
const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
|
||||
const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
|
||||
|
||||
private:
|
||||
const Eigen::Tensor<float, 3>& in1_;
|
||||
const Eigen::Tensor<float, 3>& in2_;
|
||||
Eigen::Tensor<float, 3>& out_;
|
||||
|
||||
Eigen::Tensor<float, 1> kernel_1d_;
|
||||
Eigen::Tensor<float, 2> kernel_2d_;
|
||||
Eigen::Tensor<float, 3> kernel_3d_;
|
||||
|
||||
Eigen::DefaultDevice cpu_device_;
|
||||
};
|
||||
|
||||
|
||||
// Context for evaluation on GPU
|
||||
struct GPUContext {
|
||||
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
|
||||
assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
|
||||
float kernel_1d_val[] = {3.14f, 2.7f};
|
||||
assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
|
||||
assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
|
||||
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
|
||||
assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
|
||||
assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
|
||||
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
|
||||
assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
|
||||
}
|
||||
~GPUContext() {
|
||||
assert(hipFree(kernel_1d_) == hipSuccess);
|
||||
assert(hipFree(kernel_2d_) == hipSuccess);
|
||||
assert(hipFree(kernel_3d_) == hipSuccess);
|
||||
}
|
||||
|
||||
const Eigen::GpuDevice& device() const { return gpu_device_; }
|
||||
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
|
||||
|
||||
private:
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
|
||||
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
|
||||
|
||||
float* kernel_1d_;
|
||||
float* kernel_2d_;
|
||||
float* kernel_3d_;
|
||||
|
||||
Eigen::HipStreamDevice stream_;
|
||||
Eigen::GpuDevice gpu_device_;
|
||||
};
|
||||
|
||||
|
||||
// The actual expression to evaluate
|
||||
template <typename Context>
|
||||
void test_contextual_eval(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_forced_contextual_eval(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_compound_assignment(Context* context)
|
||||
{
|
||||
context->out().device(context->device()) = context->in1().constant(2.718f);
|
||||
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
|
||||
}
|
||||
|
||||
|
||||
template <typename Context>
|
||||
void test_contraction(Context* context)
|
||||
{
|
||||
Eigen::array<std::pair<int, int>, 2> dims;
|
||||
dims[0] = std::make_pair(1, 1);
|
||||
dims[1] = std::make_pair(2, 2);
|
||||
|
||||
Eigen::array<int, 2> shape(40, 50*70);
|
||||
|
||||
Eigen::DSizes<int, 2> indices(0,0);
|
||||
Eigen::DSizes<int, 2> sizes(40,40);
|
||||
|
||||
context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
|
||||
}
|
||||
|
||||
|
||||
template <typename Context>
|
||||
void test_1d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,70);
|
||||
|
||||
Eigen::array<int, 1> dims(1);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_2d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(40,49,69);
|
||||
|
||||
Eigen::array<int, 2> dims(1,2);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
|
||||
}
|
||||
|
||||
template <typename Context>
|
||||
void test_3d_convolution(Context* context)
|
||||
{
|
||||
Eigen::DSizes<int, 3> indices(0,0,0);
|
||||
Eigen::DSizes<int, 3> sizes(39,49,69);
|
||||
|
||||
Eigen::array<int, 3> dims(0,1,2);
|
||||
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
|
||||
}
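
// Note, not part of the original test file: convolve() performs a "valid"
// convolution, so each convolved dimension shrinks by kernel_extent - 1.
// With the 40x50x70 inputs used in this file and kernels of extent 2, the
// 1d/2d/3d results have shapes 40x49x70, 40x49x69 and 39x49x69, which is
// exactly why the slices above use those sizes.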
|
||||
|
||||
|
||||
void test_cpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
|
||||
in1 = in1.random() + in1.constant(10.0f);
|
||||
in2 = in2.random() + in2.constant(10.0f);
|
||||
|
||||
CPUContext context(in1, in2, out);
|
||||
test_contextual_eval(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_forced_contextual_eval(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_compound_assignment(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_contraction(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 40; ++j) {
|
||||
const float result = out(i,j,0);
|
||||
float expected = 0;
|
||||
for (int k = 0; k < 50; ++k) {
|
||||
for (int l = 0; l < 70; ++l) {
|
||||
expected += in1(i, k, l) * in2(j, k, l);
|
||||
}
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
|
||||
test_1d_convolution(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_2d_convolution(&context);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
|
||||
(in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
|
||||
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_3d_convolution(&context);
|
||||
for (int i = 0; i < 39; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
|
||||
(in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
|
||||
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
|
||||
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
|
||||
continue;
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test_gpu() {
|
||||
Eigen::Tensor<float, 3> in1(40,50,70);
|
||||
Eigen::Tensor<float, 3> in2(40,50,70);
|
||||
Eigen::Tensor<float, 3> out(40,50,70);
|
||||
in1 = in1.random() + in1.constant(10.0f);
|
||||
in2 = in2.random() + in2.constant(10.0f);
|
||||
|
||||
std::size_t in1_bytes = in1.size() * sizeof(float);
|
||||
std::size_t in2_bytes = in2.size() * sizeof(float);
|
||||
std::size_t out_bytes = out.size() * sizeof(float);
|
||||
|
||||
float* d_in1;
|
||||
float* d_in2;
|
||||
float* d_out;
|
||||
hipMalloc((void**)(&d_in1), in1_bytes);
|
||||
hipMalloc((void**)(&d_in2), in2_bytes);
|
||||
hipMalloc((void**)(&d_out), out_bytes);
|
||||
|
||||
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
|
||||
|
||||
GPUContext context(gpu_in1, gpu_in2, gpu_out);
|
||||
test_contextual_eval(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_forced_contextual_eval(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_compound_assignment(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 50; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_contraction(&context);
|
||||
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 40; ++j) {
|
||||
const float result = out(i,j,0);
|
||||
float expected = 0;
|
||||
for (int k = 0; k < 50; ++k) {
|
||||
for (int l = 0; l < 70; ++l) {
|
||||
expected += in1(i, k, l) * in2(j, k, l);
|
||||
}
|
||||
}
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
|
||||
test_1d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 70; ++k) {
|
||||
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test_2d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 40; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
test_3d_convolution(&context);
|
||||
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
|
||||
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
|
||||
for (int i = 0; i < 39; ++i) {
|
||||
for (int j = 0; j < 49; ++j) {
|
||||
for (int k = 0; k < 69; ++k) {
|
||||
const float result = out(i,j,k);
|
||||
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
|
||||
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
|
||||
in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
|
||||
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
|
||||
VERIFY_IS_APPROX(expected, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
void test_cxx11_tensor_device()
|
||||
{
|
||||
CALL_SUBTEST(test_cpu());
|
||||
CALL_SUBTEST(test_gpu());
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1,498 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
template<typename>
|
||||
void test_hip_numext() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
|
||||
bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
|
||||
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
|
||||
|
||||
Tensor<bool, 1> half_prec(num_elem);
|
||||
Tensor<bool, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking numext " << i << std::endl;
|
||||
VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
|
||||
#ifdef EIGEN_HAS_HIP_FP16
|
||||
|
||||
template<typename>
|
||||
void test_hip_conversion() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
|
||||
d_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||
d_conv, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random();
|
||||
gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
|
||||
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
|
||||
|
||||
Tensor<float, 1> initial(num_elem);
|
||||
Tensor<float, 1> final(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(initial(i), final(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_half);
|
||||
gpu_device.deallocate(d_conv);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_unary() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.abs();
|
||||
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking unary " << i << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_elementwise() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
|
||||
gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
|
||||
VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_trancendental() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
|
||||
gpu_float3.device(gpu_device) = gpu_float3.random();
|
||||
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
|
||||
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
|
||||
gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
|
||||
gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
|
||||
|
||||
gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
|
||||
gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
|
||||
|
||||
gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
|
||||
gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
|
||||
|
||||
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
|
||||
gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
|
||||
|
||||
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
|
||||
gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
|
||||
|
||||
Tensor<float, 1> input1(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec1(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec1(num_elem);
|
||||
Tensor<float, 1> input2(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec2(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec2(num_elem);
|
||||
Tensor<float, 1> input3(num_elem);
|
||||
Tensor<Eigen::half, 1> half_prec3(num_elem);
|
||||
Tensor<Eigen::half, 1> full_prec3(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
|
||||
}
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
|
||||
if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1
|
||||
VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
|
||||
else
|
||||
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
|
||||
}
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
|
||||
}
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_float3);
|
||||
gpu_device.deallocate(d_res1_half);
|
||||
gpu_device.deallocate(d_res1_float);
|
||||
gpu_device.deallocate(d_res2_half);
|
||||
gpu_device.deallocate(d_res2_float);
|
||||
gpu_device.deallocate(d_res3_float);
|
||||
gpu_device.deallocate(d_res3_half);
|
||||
}
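
// Note, not part of the original test file: gpu_res4_half and gpu_res4_float
// above are mapped onto the d_res3_half and d_res3_float buffers, so the
// expm1 results overwrite the log1p results before they are copied back; the
// final "log1p" check therefore effectively compares the expm1 outputs.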
|
||||
|
||||
template<typename>
|
||||
void test_hip_contractions() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int rows = 23;
|
||||
int cols = 23;
|
||||
int num_elem = rows*cols;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, rows, cols);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
|
||||
|
||||
typedef Tensor<float, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
|
||||
|
||||
Tensor<Eigen::half, 2> half_prec(rows, cols);
|
||||
Tensor<Eigen::half, 2> full_prec(rows, cols);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
for (int j = 0; j < cols; ++j) {
|
||||
std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
|
||||
if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
|
||||
VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_reductions(int size1, int size2, int redux) {
|
||||
|
||||
std::cout << "Reducing " << size1 << " by " << size2
|
||||
<< " tensor along dim " << redux << std::endl;
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = size1*size2;
|
||||
int result_size = (redux == 1 ? size1 : size2);
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, size1, size2);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, size1, size2);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, result_size);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, result_size);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
|
||||
|
||||
Eigen::array<int, 1> redux_dim(redux);
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
|
||||
|
||||
Tensor<Eigen::half, 1> half_prec(result_size);
|
||||
Tensor<Eigen::half, 1> full_prec(result_size);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < result_size; ++i) {
|
||||
std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_reductions() {
|
||||
test_hip_reductions<void>(13, 13, 0);
|
||||
test_hip_reductions<void>(13, 13, 1);
|
||||
|
||||
test_hip_reductions<void>(35, 36, 0);
|
||||
test_hip_reductions<void>(35, 36, 1);
|
||||
|
||||
test_hip_reductions<void>(36, 35, 0);
|
||||
test_hip_reductions<void>(36, 35, 1);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_full_reductions() {
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int size = 13;
|
||||
int num_elem = size*size;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
|
||||
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half);
|
||||
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
|
||||
|
||||
Tensor<Eigen::half, 0> half_prec;
|
||||
Tensor<Eigen::half, 0> full_prec;
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
VERIFY_IS_APPROX(full_prec(), half_prec());
|
||||
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
|
||||
gpu_device.synchronize();
|
||||
|
||||
VERIFY_IS_APPROX(full_prec(), half_prec());
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
template<typename>
|
||||
void test_hip_forced_evals() {
|
||||
|
||||
Eigen::HipStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
|
||||
d_res_half1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
|
||||
d_res_half2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
Eigen::array<int, 1> no_bcast;
|
||||
no_bcast[0] = 1;
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||
gpu_res_float.device(gpu_device) = gpu_float.abs();
|
||||
gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
|
||||
gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec1(num_elem);
|
||||
Tensor<float, 1> half_prec2(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
gpu_device.synchronize();
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_res_half1);
|
||||
gpu_device.deallocate(d_res_half2);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void test_cxx11_tensor_of_float16_hip()
|
||||
{
|
||||
CALL_SUBTEST(test_hip_numext<void>());
|
||||
|
||||
#ifdef EIGEN_HAS_HIP_FP16
|
||||
CALL_SUBTEST(test_hip_conversion<void>());
|
||||
CALL_SUBTEST(test_hip_unary<void>());
|
||||
CALL_SUBTEST(test_hip_elementwise<void>());
|
||||
CALL_SUBTEST(test_hip_trancendental<void>());
|
||||
CALL_SUBTEST(test_hip_contractions<void>());
|
||||
CALL_SUBTEST(test_hip_reductions<void>());
|
||||
CALL_SUBTEST(test_hip_full_reductions<void>());
|
||||
CALL_SUBTEST(test_hip_forced_evals<void>());
|
||||
#else
|
||||
std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
|
||||
#endif
|
||||
}
|
@ -1,85 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#include "main.h"
#include <Eigen/CXX11/Tensor>


void test_hip_random_uniform()
{
  Tensor<float, 2> out(72,97);
  out.setZero();

  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_out;
  hipMalloc((void**)(&d_out), out_bytes);

  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  gpu_out.device(gpu_device) = gpu_out.random();

  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);

  // For now we just check this code doesn't crash.
  // TODO: come up with a valid test of randomness
}


void test_hip_random_normal()
{
  Tensor<float, 2> out(72,97);
  out.setZero();

  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_out;
  hipMalloc((void**)(&d_out), out_bytes);

  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  Eigen::internal::NormalRandomGenerator<float> gen(true);
  gpu_out.device(gpu_device) = gpu_out.random(gen);

  assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
  assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
}

static void test_complex()
{
  Tensor<std::complex<float>, 1> vec(6);
  vec.setRandom();

  // Fixme: we should check that the generated numbers follow a uniform
  // distribution instead.
  for (int i = 1; i < 6; ++i) {
    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
  }
}


void test_cxx11_tensor_random_hip()
{
  CALL_SUBTEST(test_hip_random_uniform());
  CALL_SUBTEST(test_hip_random_normal());
  CALL_SUBTEST(test_complex());
}
@ -1,154 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>


template<typename Type, int DataLayout>
static void test_full_reductions() {

  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
  in.setRandom();

  Tensor<Type, 0, DataLayout> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(Type);
  std::size_t out_bytes = full_redux.size() * sizeof(Type);
  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);

  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<Type, 0, DataLayout> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

template<typename Type, int DataLayout>
static void test_first_dim_reductions() {
  int dim_x = 33;
  int dim_y = 1;
  int dim_z = 128;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  Eigen::array<int, 1> red_axis;
  red_axis[0] = 0;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data(T)
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}

template<typename Type, int DataLayout>
static void test_last_dim_reductions() {
  int dim_x = 128;
  int dim_y = 1;
  int dim_z = 33;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  Eigen::array<int, 1> red_axis;
  red_axis[0] = 2;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}


void test_cxx11_tensor_reduction_hip() {
  CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
  CALL_SUBTEST((test_full_reductions<double, RowMajor>()));

  CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
  CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))

  CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
  // Outer reductions of doubles aren't supported just yet.
  // CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
  CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
}
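
For reference, the axis bookkeeping these reduction tests rely on can be checked on the CPU alone: summing the rank-3 input over axis 0 yields a (dim_y, dim_z) result, while summing over axis 2 yields (dim_x, dim_y). The sketch below is not part of the deleted test; the helper name reduction_shape_demo is illustrative only, though it uses the same Eigen::array reduction-axis API as the tests above.

// CPU-only sketch (illustrative, not part of the deleted file) showing the
// output shapes produced by the two reduction axes used above.
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

static void reduction_shape_demo() {
  const int dim_x = 33, dim_y = 1, dim_z = 128;  // same sizes as test_first_dim_reductions
  Eigen::Tensor<float, 3> in(dim_x, dim_y, dim_z);
  in.setRandom();

  Eigen::array<int, 1> first_axis;
  first_axis[0] = 0;  // reduce over the first dimension
  Eigen::Tensor<float, 2> over_first = in.sum(first_axis);
  assert(over_first.dimension(0) == dim_y && over_first.dimension(1) == dim_z);

  Eigen::array<int, 1> last_axis;
  last_axis[0] = 2;   // reduce over the last dimension
  Eigen::Tensor<float, 2> over_last = in.sum(last_axis);
  assert(over_last.dimension(0) == dim_x && over_last.dimension(1) == dim_y);
}
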
@ -1,76 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;

template<int DataLayout>
void test_hip_cumsum(int m_size, int k_size, int n_size)
{
  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);

  t_input.setRandom();

  std::size_t t_input_bytes = t_input.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_input;
  float* d_t_result;

  hipMalloc((void**)(&d_t_input), t_input_bytes);
  hipMalloc((void**)(&d_t_result), t_result_bytes);

  hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);

  Eigen::HipStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));

  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
  t_result = t_input.cumsum(1);

  hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
              << " vs " << t_result_gpu(i) << std::endl;
    assert(false);
  }

  hipFree((void*)d_t_input);
  hipFree((void*)d_t_result);
}


void test_cxx11_tensor_scan_hip()
{
  CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
  CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
}
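
For reference, the cumsum(1) used above is an inclusive running sum along dimension 1, which the following CPU-only sketch demonstrates on a tensor of ones. It is not part of the deleted test; the helper name cumsum_semantics_demo is illustrative only.

// CPU-only sketch (illustrative, not part of the deleted file) of the
// inclusive-scan semantics of Tensor::cumsum along dimension 1.
#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>
#include <cmath>

static void cumsum_semantics_demo() {
  Eigen::Tensor<float, 3> t(2, 3, 2);
  t.setConstant(1.0f);

  // Inclusive running sum along dimension 1.
  Eigen::Tensor<float, 3> scanned = t.cumsum(1);

  // With every input equal to 1, the running sum along axis 1 is 1, 2, 3.
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int k = 0; k < 2; ++k) {
        assert(std::fabs(scanned(i, j, k) - float(j + 1)) < 1e-6f);
      }
    }
  }
}
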