Merged in rmlarsen/eigen2 (pull request PR-292)

Adds a fast memcpy function to Eigen.
Benoit Steiner 2017-01-25 00:14:04 +00:00
commit e96c77668d
5 changed files with 54 additions and 19 deletions

View File

@@ -74,6 +74,41 @@ inline void throw_std_bad_alloc()
 #endif
 }
+EIGEN_DEVICE_FUNC
+inline void fast_memcpy(void* dst, const void* src, size_t size) {
+#if defined(__CUDA__) || defined(__ANDROID__)
+  ::memcpy(dst, src, size);
+#else
+  switch(size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#ifdef EIGEN_OS_LINUX
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+    default: memmove(dst, src, size); break;
+#else
+    default: memcpy(dst, src, size); break;
+#endif
+  }
+#endif
+}
 /*****************************************************************************
 *** Implementation of handmade aligned functions                          ***
 *****************************************************************************/
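
The comments inside the new function carry the key observation: a memcpy whose length is a compile-time constant is typically lowered to a few inline loads and stores, while a variable-length call goes through the library routine (with the variable-size fallback on Linux using memmove, per the comment above). A minimal standalone sketch of that dispatch idea, not part of this commit, where copy_small is a hypothetical name:

#include <cstdio>
#include <cstring>

// Hypothetical reduced version of the same technique: dispatch on the
// runtime size to calls whose length is a literal, which compilers can
// lower to plain loads and stores instead of a library call.
inline void copy_small(void* dst, const void* src, std::size_t size) {
  switch (size) {
    case 4:  std::memcpy(dst, src, 4); break;   // usually one 32-bit move
    case 8:  std::memcpy(dst, src, 8); break;   // usually one 64-bit move
    default: std::memcpy(dst, src, size); break; // library-call fallback
  }
}

int main() {
  double src = 3.14, dst = 0.0;
  copy_small(&dst, &src, sizeof(double));  // hits the inlined case 8
  std::printf("%f\n", dst);                // prints 3.140000
  return 0;
}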
@@ -493,7 +528,7 @@ template<typename T> struct smart_copy_helper<T,true> {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    memcpy(target, start, size);
+    fast_memcpy(target, start, size);
   }
 };

View File

@@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index
   } else {
     // Naive memcpy calls
     for (Index col = 0; col < cols; ++col) {
-      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+      internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
     }
   }
 }
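
For context, the loop being changed packs a cols-by-rows panel whose source and destination leading dimensions differ, so only individual columns are contiguous and each column gets its own copy; that makes the per-call overhead fast_memcpy trims matter. A self-contained sketch of the access pattern, with pack_panel as a hypothetical stand-in for the routine above:

#include <cstring>

// Hypothetical illustration of column-wise panel packing: with leading
// dimensions lddst != ldsrc, columns are contiguous but not adjacent,
// so the panel is moved one column-sized copy at a time.
template <typename Scalar>
void pack_panel(Scalar* dst, const Scalar* src,
                long cols, long rows, long lddst, long ldsrc) {
  for (long col = 0; col < cols; ++col)
    std::memcpy(dst + col * lddst, src + col * ldsrc,
                rows * sizeof(Scalar));
}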

View File

@@ -22,7 +22,7 @@ struct DefaultDevice {
     internal::aligned_free(buffer);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
+    internal::fast_memcpy(dst, src, n);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);

View File

@@ -106,7 +106,7 @@ struct ThreadPoolDevice {
   }
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-    ::memcpy(dst, src, n);
+    internal::fast_memcpy(dst, src, n);
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
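
DefaultDevice and ThreadPoolDevice both expose memcpy as a member so tensor kernels can be written once against a device parameter; the commit only swaps the bodies of those members to internal::fast_memcpy. A minimal sketch of that device-abstraction pattern, where HostDevice and copy_block are hypothetical names:

#include <cstddef>
#include <cstring>

// Hypothetical host-only device with the same memcpy member shape as
// DefaultDevice/ThreadPoolDevice above.
struct HostDevice {
  void memcpy(void* dst, const void* src, std::size_t n) const {
    std::memcpy(dst, src, n);  // a specialized device could route elsewhere
  }
};

// A kernel written once against the device parameter; any device type
// providing a compatible memcpy member works here.
template <typename Device>
void copy_block(const Device& d, float* dst, const float* src, std::size_t n) {
  d.memcpy(dst, src, n * sizeof(float));
}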

View File

@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
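
Both hunks in this file follow the same contiguous-fast-path pattern: a line with stride 1 is moved with one bulk copy, now routed through m_device.memcpy so it picks up fast_memcpy on the CPU devices, while strided lines fall back to an element-by-element loop. A reduced sketch of the gather step, with gather_line as a hypothetical name:

#include <complex>
#include <cstring>

// Hypothetical reduced form of the "get data into line_buf" step above:
// one bulk copy when the line is contiguous, a strided gather otherwise.
void gather_line(std::complex<float>* line_buf, const std::complex<float>* buf,
                 long base_offset, long line_len, long stride) {
  if (stride == 1) {
    std::memcpy(line_buf, buf + base_offset,
                line_len * sizeof(std::complex<float>));
  } else {
    long offset = base_offset;
    for (long j = 0; j < line_len; ++j, offset += stride)
      line_buf[j] = buf[offset];
  }
}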