Mirror of https://gitlab.com/libeigen/eigen.git, synced 2025-07-30 16:52:01 +08:00
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on device, requested by the TensorFlow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add the `--relaxed-constexpr` (or equivalent) flag for NVCC, to allow calling constexpr functions from device functions, and added support for specifying the compute architecture for NVCC (this was already available for clang).
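For reference, here is a minimal usage sketch (not part of the commit) of what the overload enables: calling `Eigen::numext::sqrt` on a `std::complex<T>` value from a device kernel. The kernel name and launch configuration are illustrative only; the file is assumed to be compiled as CUDA with NVCC's `--expt-relaxed-constexpr` (or with clang-cuda) against an Eigen checkout that contains the `sqrt_impl<std::complex<T>>` specialization added below.

```cpp
// Illustrative only: device-side complex sqrt via Eigen's math-function dispatch.
#include <complex>
#include <Eigen/Core>

__global__ void complex_sqrt_kernel(const std::complex<float>* in,
                                    std::complex<float>* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // std::sqrt(std::complex<T>) is host-only under NVCC; numext::sqrt routes
    // through internal::sqrt_impl, which this commit specializes for
    // std::complex<T> in device code.
    out[i] = Eigen::numext::sqrt(in[i]);
  }
}
```

The CMake changes further down wire the corresponding `--expt-relaxed-constexpr` and `-gencode` flags into the GPU unit tests, driven by the `EIGEN_CUDA_COMPUTE_ARCH` list.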
This commit is contained in:
parent
fdf2ee62c5
commit
070d303d56
@@ -323,6 +323,27 @@ struct abs2_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
+/****************************************************************************
+* Implementation of sqrt                                                *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+  {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+  typedef Scalar type;
+};
+
 /****************************************************************************
 * Implementation of norm1                                                *
 ****************************************************************************/
@@ -1368,12 +1389,11 @@ inline int log2(int x)
   *
   * It's usage is justified in performance critical functions, like norm/normalize.
   */
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
 {
-  EIGEN_USING_STD(sqrt);
-  return sqrt(x);
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
 }
 
 // Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
@@ -12,12 +12,12 @@
 
 // clang-format off
 
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+
 namespace Eigen {
 
 namespace internal {
 
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
-
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
 // functions and thus Eigen functors making use of these operators fail to
@@ -94,10 +94,53 @@ template<typename T> struct scalar_quotient_op<const std::complex<T>, const std:
 
 template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
 
+template<typename T>
+struct sqrt_impl<std::complex<T>> {
+  static EIGEN_DEVICE_FUNC std::complex<T> run(const std::complex<T>& z) {
+    // Computes the principal sqrt of the input.
+    //
+    // For the complex square root of the number x + i*y, we want to find real
+    // numbers u and v such that
+    //    (u + i*v)^2 = x + i*y  <=>
+    //    u^2 - v^2 + i*2*u*v = x + i*y.
+    // By equating the real and imaginary parts we get:
+    //    u^2 - v^2 = x
+    //    2*u*v = y.
+    //
+    // For x >= 0, this has the numerically stable solution
+    //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+    //    v = y / (2 * u)
+    // and for x < 0,
+    //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+    //    u = y / (2 * v)
+    //
+    // Letting w = sqrt(0.5 * (|x| + |z|)),
+    //   if x == 0: u = w, v = sign(y) * w
+    //   if x > 0:  u = w, v = y / (2 * w)
+    //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+
+    const T x = numext::real(z);
+    const T y = numext::imag(z);
+    const T zero = T(0);
+    const T cst_half = T(0.5);
+
+    // Special case of isinf(y)
+    if ((numext::isinf)(y)) {
+      const T inf = std::numeric_limits<T>::infinity();
+      return std::complex<T>(inf, y);
+    }
+
+    T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z)));
+    return
+      x == zero ? std::complex<T>(w, y < zero ? -w : w)
+          : x > zero ? std::complex<T>(w, y / (2 * w))
+              : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w );
+  }
+};
+
+} // namespace internal
+} // namespace Eigen
+
 #endif
 
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_CUDA_H
+#endif // EIGEN_COMPLEX_CUDA_H
@@ -703,8 +703,8 @@ Packet psqrt_complex(const Packet& a) {
   //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
   //    v = 0.5 * (y / u)
   // and for x < 0,
-  //    v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
-  //    u = |0.5 * (y / v)|
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
   //
   // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
   //    l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
@@ -395,6 +395,12 @@ find_package(CUDA 5.0)
 if(CUDA_FOUND)
 
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
@@ -404,7 +410,12 @@ if(CUDA_FOUND)
     foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
       string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
     endforeach()
+  else()
+    foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+    endforeach()
   endif()
+  string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
   ei_add_test(gpu_basic)
@@ -14,7 +14,6 @@
 #endif
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
 #include "main.h"
@@ -54,6 +53,59 @@ struct coeff_wise {
   }
 };
 
+template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      special_in[idx++] = ComplexType(0, 0);
+      special_in[idx++] = ComplexType(-0, 0);
+      special_in[idx++] = ComplexType(0, -0);
+      special_in[idx++] = ComplexType(-0, -0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
 template<typename T>
 struct replicate {
   EIGEN_DEVICE_FUNC
@@ -161,17 +213,58 @@ struct matrix_inverse {
   }
 };
 
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+#else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+#endif
+  run_on_cpu (ker, n, in, out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  verifyIsApproxWithInfsNans(out_ref, out_gpu);
+#endif
+}
+
 EIGEN_DECLARE_TEST(gpu_basic)
 {
   ei_test_init_gpu();
 
   int nthreads = 100;
   Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
 
-#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
   int data_size = nthreads * 512;
   in.setRandom(data_size);
-  out.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
 #endif
 
   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
 
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
 #if defined(__NVCC__)
   // FIXME
   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
 #else
   run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
 #endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
 
-  gpuDeviceSynchronize();
 
   // check inputs have not been modified
   gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
 {
   Input in_ref, in_gpu;
   Output out_ref, out_gpu;
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   in_ref = in_gpu = in;
   out_ref = out_gpu = out;
   #else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
   #endif
   run_on_cpu (ker, n, in_ref, out_ref);
   run_on_gpu(ker, n, in_gpu, out_gpu);
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   VERIFY_IS_APPROX(in_ref, in_gpu);
   VERIFY_IS_APPROX(out_ref, out_gpu);
   #endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
 
 struct compile_time_device_info {
   EIGEN_DEVICE_FUNC
-  void operator()(int /*i*/, const int* /*in*/, int* info) const
+  void operator()(int i, const int* /*in*/, int* info) const
   {
-    #if defined(__CUDA_ARCH__)
-    info[0] = int(__CUDA_ARCH__ +0);
-    #endif
-    #if defined(EIGEN_HIP_DEVICE_COMPILE)
-    info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
-    #endif
+    if (i == 0) {
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
   }
 };
 
@@ -16,7 +16,7 @@
 // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
 // When compiling such files, gcc will end up trying to pick up the CUDA headers by
 // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
 // To work around this issue for HIP systems (and leave the default behaviour intact), the
 // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
 // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
@@ -30,6 +30,9 @@
 #define gpuSuccess hipSuccess
 #define gpuErrorNotReady hipErrorNotReady
 #define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
 #define gpuGetErrorString hipGetErrorString
 #define gpuGetDeviceProperties hipGetDeviceProperties
 #define gpuStreamDefault hipStreamDefault
@@ -57,6 +60,9 @@
 #define gpuSuccess cudaSuccess
 #define gpuErrorNotReady cudaErrorNotReady
 #define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
 #define gpuGetErrorString cudaGetErrorString
 #define gpuGetDeviceProperties cudaGetDeviceProperties
 #define gpuStreamDefault cudaStreamDefault