diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 06b555428..bb3b6692f 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -688,7 +688,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(half* to, cons to[stride*3].x = static_cast(a >> 48); } -template<> EIGEN_DEVICE_FUNC inline void +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);