Improved support for vectorization of 16-bit floats

This commit is contained in:
Benoit Steiner 2016-06-09 08:22:27 -07:00
parent 15890c304e
commit aa33446dac
3 changed files with 95 additions and 4 deletions

View File

@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
};
// Primary reducer_traits template, parameterized on a reduction functor and
// the device it is evaluated on. Any (Reducer, Device) pair that has no
// dedicated specialization gets a cost of 1 and no packet access.
template <typename Reducer, typename Device>
struct reducer_traits {
  enum {
    PacketAccess = false,
    Cost = 1
  };
};
// Standard reduction functors
template <typename T> struct SumReducer
{
@ -119,6 +127,15 @@ template <typename T> struct SumReducer
}
};
// reducer_traits specialization for SumReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - the HasAdd flag read from the packet type
//                  PacketType<T, Device>::type.
// NOTE(review): HasAdd is looked up as a member of the packet type itself;
// confirm that every packet type reachable here exposes it.
template <typename T, typename Device>
struct reducer_traits<SumReducer<T>, Device> {
  enum {
    PacketAccess = PacketType<T, Device>::type::HasAdd,
    Cost = NumTraits<T>::AddCost
  };
};
template <typename T> struct MeanReducer
{
static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
DenseIndex packetCount_;
};
// reducer_traits specialization for MeanReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - the HasAdd flag read from the packet type
//                  PacketType<T, Device>::type, and only for non-integer
//                  scalars.
// MeanReducer<T>::PacketAccess itself is defined as
// `packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger`, so the integer
// exclusion is repeated here to keep the device-aware trait from enabling
// packet access where the reducer's own flag disables it.
// NOTE(review): HasAdd is looked up as a member of the packet type itself;
// confirm that every packet type reachable here exposes it.
template <typename T, typename Device>
struct reducer_traits<MeanReducer<T>, Device> {
  enum {
    Cost = NumTraits<T>::AddCost,
    PacketAccess = PacketType<T, Device>::type::HasAdd && !NumTraits<T>::IsInteger
  };
};
template <typename T> struct MaxReducer
{
static const bool PacketAccess = packet_traits<T>::HasMax;
@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
}
};
// reducer_traits specialization for MaxReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - the HasMax flag read from the packet type
//                  PacketType<T, Device>::type.
// NOTE(review): HasMax is looked up as a member of the packet type itself;
// confirm that every packet type reachable here exposes it.
template <typename T, typename Device>
struct reducer_traits<MaxReducer<T>, Device> {
  enum {
    PacketAccess = PacketType<T, Device>::type::HasMax,
    Cost = NumTraits<T>::AddCost
  };
};
template <typename T> struct MinReducer
{
static const bool PacketAccess = packet_traits<T>::HasMin;
@ -228,6 +263,14 @@ template <typename T> struct MinReducer
}
};
// reducer_traits specialization for MinReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - the HasMin flag read from the packet type
//                  PacketType<T, Device>::type.
// NOTE(review): HasMin is looked up as a member of the packet type itself;
// confirm that every packet type reachable here exposes it.
template <typename T, typename Device>
struct reducer_traits<MinReducer<T>, Device> {
  enum {
    PacketAccess = PacketType<T, Device>::type::HasMin,
    Cost = NumTraits<T>::AddCost
  };
};
template <typename T> struct ProdReducer
{
@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
}
};
// reducer_traits specialization for ProdReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::MulCost.
//   PacketAccess - the HasMul flag read from the packet type
//                  PacketType<T, Device>::type.
// NOTE(review): HasMul is looked up as a member of the packet type itself;
// confirm that every packet type reachable here exposes it.
template <typename T, typename Device>
struct reducer_traits<ProdReducer<T>, Device> {
  enum {
    PacketAccess = PacketType<T, Device>::type::HasMul,
    Cost = NumTraits<T>::MulCost
  };
};
struct AndReducer
{
@ -280,6 +331,15 @@ struct AndReducer
}
};
// reducer_traits specialization for AndReducer on any Device:
// a cost of 1 and no packet access.
template <typename Device>
struct reducer_traits<AndReducer, Device> {
  enum {
    PacketAccess = false,
    Cost = 1
  };
};
struct OrReducer {
static const bool PacketAccess = false;
static const bool IsStateful = false;
@ -295,6 +355,15 @@ struct OrReducer {
}
};
// reducer_traits specialization for OrReducer on any Device:
// a cost of 1 and no packet access (OrReducer::PacketAccess is also false).
template <typename Device>
struct reducer_traits<OrReducer, Device> {
  enum {
    PacketAccess = false,
    Cost = 1
  };
};
// Argmin/Argmax reducers
template <typename T> struct ArgMaxTupleReducer
{
@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
}
};
// reducer_traits specialization for ArgMaxTupleReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - always false.
template <typename T, typename Device>
struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
  enum {
    PacketAccess = false,
    Cost = NumTraits<T>::AddCost
  };
};
template <typename T> struct ArgMinTupleReducer
{
static const bool PacketAccess = false;
@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
}
};
// reducer_traits specialization for ArgMinTupleReducer<T> on a given Device.
//   Cost         - taken from NumTraits<T>::AddCost.
//   PacketAccess - always false (ArgMinTupleReducer::PacketAccess is also false).
template <typename T, typename Device>
struct reducer_traits<ArgMinTupleReducer<T>, Device> {
  enum {
    PacketAccess = false,
    Cost = NumTraits<T>::AddCost
  };
};
// Random number generation
namespace {

View File

@ -54,6 +54,11 @@ struct PacketType {
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// PacketType specialization for `half` scalars on a GpuDevice:
// the packet type is half2 and the packet size is 2.
template <>
struct PacketType<half, GpuDevice> {
  static const int size = 2;
  typedef half2 type;
};
template <>
struct PacketType<float, GpuDevice> {
typedef float4 type;

View File

@ -331,7 +331,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@ -346,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
return;
}
FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
@ -608,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
#ifdef EIGEN_HAS_CUDA_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
internal::is_same<typename Self::CoeffReturnType, float>::value;
@ -627,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
return true;
}
return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};