mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-20 11:54:27 +08:00
Abandon blocking size lookup table approach. Not performing as well in real world as in microbenchmark.
This commit is contained in:
parent
ebea530782
commit
051d5325cc
@ -310,7 +310,6 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/NEON/PacketMath.h"
|
||||
#include "src/Core/arch/NEON/MathFunctions.h"
|
||||
#include "src/Core/arch/NEON/Complex.h"
|
||||
#include "src/Core/arch/NEON/BlockingSizesLookupTables.h"
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
@ -384,7 +383,6 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/Inverse.h"
|
||||
#include "src/Core/TriangularMatrix.h"
|
||||
#include "src/Core/SelfAdjointView.h"
|
||||
#include "src/Core/products/LookupBlockingSizesTable.h"
|
||||
#include "src/Core/products/GeneralBlockPanelKernel.h"
|
||||
#include "src/Core/products/Parallelizer.h"
|
||||
#include "src/Core/ProductEvaluators.h"
|
||||
|
@ -1,110 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
|
||||
#define EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
/* The following lookup table was generated from measurements on a Nexus 5,
|
||||
* which has a Qualcomm Krait 400 CPU. This is very representative of current
|
||||
* 32bit (ARMv7) Android devices. On the other hand, I don't know how
|
||||
* representative that is outside of these conditions. Accordingly,
|
||||
* let's only use this lookup table on ARM 32bit on Android for now.
|
||||
*
|
||||
* Measurements were single-threaded, with Scalar=float, compiled with
|
||||
* -mfpu=neon-vfpv4, so the pmadd instruction used was VFMA.F32.
|
||||
*
|
||||
* The device was cooled, allowing it to run at the max clock speed throughout.
|
||||
* This may not be representative of real-world thermal conditions.
|
||||
*
|
||||
* The benchmark attempted to flush caches to test cold-cache performance.
|
||||
*/
|
||||
#if EIGEN_ARCH_ARM && EIGEN_OS_ANDROID
|
||||
// Lookup table of blocking sizes for float x float products.
// The lookup code elsewhere in this file clamps each of k, m, n to
// [BaseSize, BaseSize << (NumSizes - 1)] and turns it into an axis index in
// [0, NumSizes), so Data() holds NumSizes^3 = 512 entries.
template<>
|
||||
struct BlockingSizesLookupTable<float, float> {
|
||||
// Smallest problem size covered along each axis.
static const size_t BaseSize = 16;
|
||||
// Number of sampled sizes along each axis.
static const size_t NumSizes = 8;
|
||||
// Returns a pointer to the first of the 512 packed table entries.
static const unsigned short* Data() {
|
||||
// Entries are read at index n_index + NumSizes * (m_index + NumSizes * k_index):
// each row of 8 values below varies n, each run of 8 rows varies m, and the
// 8 runs vary k.
// Each entry packs three hex digits 0xKMN; the lookup caps k, m and n at
// 1 << K, 1 << M and 1 << N respectively.
static const unsigned short data[512] = {
|
||||
0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x447, 0x447,
|
||||
0x454, 0x455, 0x456, 0x457, 0x458, 0x459, 0x45a, 0x456,
|
||||
0x464, 0x465, 0x466, 0x467, 0x468, 0x469, 0x46a, 0x467,
|
||||
0x474, 0x475, 0x476, 0x467, 0x478, 0x479, 0x476, 0x478,
|
||||
0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x476, 0x476,
|
||||
0x474, 0x475, 0x476, 0x477, 0x478, 0x479, 0x496, 0x488,
|
||||
0x474, 0x475, 0x476, 0x4a6, 0x496, 0x496, 0x495, 0x4a6,
|
||||
0x474, 0x475, 0x466, 0x4a6, 0x497, 0x4a5, 0x496, 0x4a5,
|
||||
0x544, 0x545, 0x546, 0x547, 0x548, 0x549, 0x54a, 0x54b,
|
||||
0x554, 0x555, 0x556, 0x557, 0x558, 0x559, 0x55a, 0x55b,
|
||||
0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x56b,
|
||||
0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x576,
|
||||
0x564, 0x565, 0x566, 0x567, 0x568, 0x569, 0x56a, 0x587,
|
||||
0x564, 0x565, 0x566, 0x567, 0x596, 0x596, 0x596, 0x597,
|
||||
0x574, 0x565, 0x566, 0x596, 0x596, 0x5a6, 0x5a6, 0x5a6,
|
||||
0x564, 0x565, 0x5a6, 0x596, 0x5a6, 0x5a6, 0x5a6, 0x5a6,
|
||||
0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0x64a, 0x64b,
|
||||
0x644, 0x655, 0x656, 0x657, 0x658, 0x659, 0x65a, 0x65b,
|
||||
0x664, 0x665, 0x666, 0x667, 0x668, 0x669, 0x65a, 0x667,
|
||||
0x654, 0x665, 0x676, 0x677, 0x678, 0x679, 0x67a, 0x675,
|
||||
0x684, 0x675, 0x686, 0x687, 0x688, 0x688, 0x687, 0x686,
|
||||
0x664, 0x685, 0x666, 0x677, 0x697, 0x696, 0x697, 0x697,
|
||||
0x664, 0x665, 0x696, 0x696, 0x685, 0x6a6, 0x696, 0x696,
|
||||
0x664, 0x675, 0x686, 0x696, 0x6a6, 0x696, 0x696, 0x696,
|
||||
0x744, 0x745, 0x746, 0x747, 0x748, 0x749, 0x74a, 0x747,
|
||||
0x754, 0x755, 0x756, 0x757, 0x758, 0x759, 0x75a, 0x757,
|
||||
0x764, 0x765, 0x756, 0x767, 0x768, 0x759, 0x75a, 0x766,
|
||||
0x744, 0x755, 0x766, 0x777, 0x768, 0x759, 0x778, 0x777,
|
||||
0x744, 0x745, 0x766, 0x777, 0x788, 0x786, 0x786, 0x788,
|
||||
0x754, 0x755, 0x766, 0x787, 0x796, 0x796, 0x787, 0x796,
|
||||
0x684, 0x695, 0x696, 0x6a6, 0x795, 0x786, 0x795, 0x796,
|
||||
0x684, 0x695, 0x696, 0x795, 0x786, 0x796, 0x795, 0x796,
|
||||
0x844, 0x845, 0x846, 0x847, 0x848, 0x849, 0x848, 0x848,
|
||||
0x844, 0x855, 0x846, 0x847, 0x848, 0x849, 0x855, 0x857,
|
||||
0x844, 0x845, 0x846, 0x857, 0x848, 0x859, 0x866, 0x865,
|
||||
0x844, 0x855, 0x846, 0x847, 0x878, 0x859, 0x877, 0x877,
|
||||
0x844, 0x855, 0x846, 0x867, 0x886, 0x887, 0x885, 0x886,
|
||||
0x784, 0x785, 0x786, 0x877, 0x897, 0x885, 0x896, 0x896,
|
||||
0x684, 0x695, 0x686, 0x886, 0x885, 0x885, 0x886, 0x896,
|
||||
0x694, 0x6a5, 0x6a6, 0x885, 0x885, 0x886, 0x896, 0x896,
|
||||
0x944, 0x945, 0x946, 0x947, 0x948, 0x847, 0x847, 0x848,
|
||||
0x954, 0x855, 0x856, 0x947, 0x858, 0x857, 0x858, 0x858,
|
||||
0x944, 0x945, 0x946, 0x867, 0x948, 0x866, 0x867, 0x867,
|
||||
0x944, 0x975, 0x976, 0x877, 0x877, 0x877, 0x877, 0x877,
|
||||
0x784, 0x785, 0x886, 0x887, 0x886, 0x887, 0x887, 0x887,
|
||||
0x784, 0x785, 0x786, 0x796, 0x887, 0x897, 0x896, 0x896,
|
||||
0x684, 0x695, 0x6a6, 0x886, 0x886, 0x896, 0x896, 0x896,
|
||||
0x6a4, 0x6a5, 0x696, 0x896, 0x886, 0x896, 0x896, 0x896,
|
||||
0xa44, 0xa45, 0xa46, 0xa47, 0x847, 0x848, 0x847, 0x848,
|
||||
0xa44, 0xa45, 0x856, 0x857, 0x857, 0x857, 0x857, 0x857,
|
||||
0xa44, 0xa65, 0x866, 0x867, 0x867, 0x867, 0x867, 0x867,
|
||||
0x774, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
|
||||
0x784, 0x785, 0x886, 0x887, 0x887, 0x887, 0x887, 0x887,
|
||||
0x784, 0x785, 0x786, 0x787, 0x887, 0x896, 0x897, 0x897,
|
||||
0x684, 0x6a5, 0x696, 0x886, 0x886, 0x896, 0x896, 0x896,
|
||||
0x684, 0x6a5, 0x6a5, 0x886, 0x886, 0x896, 0x896, 0x896,
|
||||
0xb44, 0x845, 0x846, 0x847, 0x847, 0x945, 0x846, 0x946,
|
||||
0xb54, 0x855, 0x856, 0x857, 0x857, 0x856, 0x857, 0x856,
|
||||
0x864, 0x865, 0x866, 0x867, 0x867, 0x866, 0x866, 0x867,
|
||||
0x864, 0x875, 0x876, 0x877, 0x877, 0x877, 0x877, 0x877,
|
||||
0x784, 0x885, 0x886, 0x787, 0x887, 0x887, 0x887, 0x887,
|
||||
0x784, 0x785, 0x786, 0x796, 0x886, 0x897, 0x897, 0x897,
|
||||
0x684, 0x695, 0x696, 0x886, 0x896, 0x896, 0x896, 0x896,
|
||||
0x684, 0x685, 0x696, 0xb57, 0x896, 0x896, 0x896, 0x896
|
||||
};
|
||||
return data;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // EIGEN_NEON_BLOCKING_SIZES_LOOKUP_TABLES_H
|
@ -291,7 +291,6 @@ inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
|
||||
*
|
||||
* The blocking size parameters may be evaluated:
|
||||
* - either by a heuristic based on cache sizes;
|
||||
* - or using a precomputed lookup table;
|
||||
* - or using fixed prescribed values (for testing purposes).
|
||||
*
|
||||
* \sa setCpuCacheSizes */
|
||||
@ -300,10 +299,8 @@ template<typename LhsScalar, typename RhsScalar, int KcFactor>
|
||||
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
|
||||
{
|
||||
if (!useSpecificBlockingSizes(k, m, n)) {
|
||||
if (!lookupBlockingSizesFromTable<LhsScalar, RhsScalar>(k, m, n, num_threads)) {
|
||||
evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
|
||||
}
|
||||
}
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
enum {
|
||||
|
@ -1,97 +0,0 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
|
||||
#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <typename LhsScalar,
|
||||
typename RhsScalar,
|
||||
bool HasLookupTable = BlockingSizesLookupTable<LhsScalar, RhsScalar>::NumSizes != 0 >
|
||||
struct LookupBlockingSizesFromTableImpl
|
||||
{
|
||||
static bool run(Index&, Index&, Index&, Index)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
/** Single step of the binary search performed by floor_log2().
  *
  * If \a x has any bit set at position \a offset or above, \a x is shifted
  * right by \a offset in place and \a offset is returned. Otherwise \a x is
  * left unchanged and 0 is returned.
  */
inline size_t floor_log2_helper(unsigned short& x, size_t offset)
{
  const unsigned short shifted = x >> offset;
  if (!shifted) {
    return 0;
  }
  x = shifted;
  return offset;
}

/** \returns floor(log2(x)) for x >= 1, and 0 for x == 0.
  *
  * Every helper call both reads and modifies \a x, so each step must observe
  * the result of the previous one. The calls are therefore issued as separate
  * statements: the operands of a single '+' expression may be evaluated in
  * any order, which would allow e.g. the 1-bit step to run before the 8-bit
  * step and produce a wrong result.
  */
inline size_t floor_log2(unsigned short x)
{
  size_t result = floor_log2_helper(x, 8);
  result += floor_log2_helper(x, 4);
  result += floor_log2_helper(x, 2);
  result += floor_log2_helper(x, 1);
  return result;
}
|
||||
|
||||
inline size_t ceil_log2(unsigned short x)
|
||||
{
|
||||
return x > 1 ? floor_log2(x - 1) + 1 : 0;
|
||||
}
|
||||
|
||||
template <typename LhsScalar,
|
||||
typename RhsScalar>
|
||||
struct LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar, true>
|
||||
{
|
||||
static bool run(Index& k, Index& m, Index& n, Index)
|
||||
{
|
||||
using std::min;
|
||||
using std::max;
|
||||
typedef BlockingSizesLookupTable<LhsScalar, RhsScalar> Table;
|
||||
const unsigned short minsize = Table::BaseSize;
|
||||
const unsigned short maxsize = minsize << (Table::NumSizes - 1);
|
||||
const unsigned short k_clamped = max<unsigned short>(minsize, min<Index>(k, maxsize));
|
||||
const unsigned short m_clamped = max<unsigned short>(minsize, min<Index>(m, maxsize));
|
||||
const unsigned short n_clamped = max<unsigned short>(minsize, min<Index>(n, maxsize));
|
||||
const size_t k_index = ceil_log2(k_clamped / minsize);
|
||||
const size_t m_index = ceil_log2(m_clamped / minsize);
|
||||
const size_t n_index = ceil_log2(n_clamped / minsize);
|
||||
const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index);
|
||||
const unsigned short table_entry = Table::Data()[index];
|
||||
k = min<Index>(k, 1 << ((table_entry & 0xf00) >> 8));
|
||||
m = min<Index>(m, 1 << ((table_entry & 0x0f0) >> 4));
|
||||
n = min<Index>(n, 1 << ((table_entry & 0x00f) >> 0));
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename LhsScalar,
|
||||
typename RhsScalar>
|
||||
bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
|
||||
{
|
||||
if (num_threads > 1) {
|
||||
// We don't currently have lookup tables recorded for multithread performance,
|
||||
// and we have confirmed experimentally that our single-thread-recorded LUTs are
|
||||
// poor for multithread performance, and our LUTs don't currently contain
|
||||
// any annotation about multithread status (FIXME - we need that).
|
||||
// So for now, we just early-return here.
|
||||
return false;
|
||||
}
|
||||
return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
|
@ -288,14 +288,6 @@ struct stem_function
|
||||
typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
|
||||
typedef ComplexScalar type(ComplexScalar, int);
|
||||
};
|
||||
|
||||
/** Primary template for blocking-size lookup tables.
  *
  * It declares NumSizes as 0; the lookup code tests NumSizes != 0 to decide
  * whether a table is available for a given scalar pair.
  */
template <typename LhsScalar, typename RhsScalar>
struct BlockingSizesLookupTable
{
  static const size_t NumSizes = 0;
};
|
||||
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
Loading…
x
Reference in New Issue
Block a user