mirror of
https://github.com/sockspls/badfish
synced 2025-04-29 16:23:09 +00:00
Add large page support for NNUE weights and simplify TT mem management
Use TT memory functions to allocate memory for the NNUE weights. This should provide a small speed-up on systems where large pages are not automatically used, including Windows and some Linux distributions.

Further, since we now have a wrapper for std::aligned_alloc(), we can simplify the TT memory management a bit:

- We no longer need to store separate pointers to the hash table and its underlying memory allocation.
- We also get to merge the Linux-specific and default implementations of aligned_ttmem_alloc().

Finally, we'll enable the VirtualAlloc code path with large page support also for Win32.

STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 14896 W: 1854 L: 1686 D: 11356
Ptnml(0-2): 65, 1224, 4742, 1312, 105

closes https://github.com/official-stockfish/Stockfish/pull/3081

No functional change.
This commit is contained in:
parent
16b4578cc1
commit
485d517c68
7 changed files with 57 additions and 45 deletions
|
@ -152,7 +152,7 @@ to find the best move. The classical evaluation computes this value as a functio
|
|||
of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
|
||||
The NNUE evaluation computes this value with a neural network based on basic
|
||||
inputs (e.g. piece positions only). The network is optimized and trained
|
||||
on the evalutions of millions of positions at moderate search depth.
|
||||
on the evaluations of millions of positions at moderate search depth.
|
||||
|
||||
The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
|
||||
It can be evaluated efficiently on CPUs, and exploits the fact that only parts
|
||||
|
|
57
src/misc.cpp
57
src/misc.cpp
|
@ -357,27 +357,11 @@ void std_aligned_free(void* ptr) {
|
|||
#endif
|
||||
}
|
||||
|
||||
/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
|
||||
/// The returned pointer is the aligned one, while the mem argument is the one that needs
|
||||
/// to be passed to free. With c++17 some of this functionality could be simplified.
|
||||
/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
|
||||
|
||||
#if defined(__linux__) && !defined(__ANDROID__)
|
||||
#if defined(_WIN32)
|
||||
|
||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
||||
|
||||
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
|
||||
size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
|
||||
if (posix_memalign(&mem, alignment, size))
|
||||
mem = nullptr;
|
||||
#if defined(MADV_HUGEPAGE)
|
||||
madvise(mem, allocSize, MADV_HUGEPAGE);
|
||||
#endif
|
||||
return mem;
|
||||
}
|
||||
|
||||
#elif defined(_WIN64)
|
||||
|
||||
static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
|
||||
static void* aligned_large_pages_alloc_win(size_t allocSize) {
|
||||
|
||||
HANDLE hProcessToken { };
|
||||
LUID luid { };
|
||||
|
@ -422,12 +406,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
|
|||
return mem;
|
||||
}
|
||||
|
||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
||||
void* aligned_large_pages_alloc(size_t allocSize) {
|
||||
|
||||
static bool firstCall = true;
|
||||
void* mem;
|
||||
|
||||
// Try to allocate large pages
|
||||
mem = aligned_ttmem_alloc_large_pages(allocSize);
|
||||
mem = aligned_large_pages_alloc_win(allocSize);
|
||||
|
||||
// Suppress info strings on the first call. The first call occurs before 'uci'
|
||||
// is received and in that case this output confuses some GUIs.
|
||||
|
@ -449,23 +434,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
|||
|
||||
#else
|
||||
|
||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
||||
void* aligned_large_pages_alloc(size_t allocSize) {
|
||||
|
||||
constexpr size_t alignment = 64; // assumed cache line size
|
||||
size_t size = allocSize + alignment - 1; // allocate some extra space
|
||||
mem = malloc(size);
|
||||
void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
|
||||
return ret;
|
||||
#if defined(__linux__)
|
||||
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
|
||||
#else
|
||||
constexpr size_t alignment = 4096; // assumed small page size
|
||||
#endif
|
||||
|
||||
// round up to multiples of alignment
|
||||
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
|
||||
void *mem = std_aligned_alloc(alignment, size);
|
||||
#if defined(MADV_HUGEPAGE)
|
||||
madvise(mem, size, MADV_HUGEPAGE);
|
||||
#endif
|
||||
return mem;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/// aligned_ttmem_free() will free the previously allocated ttmem
|
||||
/// aligned_large_pages_free() will free the memory previously allocated by aligned_large_pages_alloc()
|
||||
|
||||
#if defined(_WIN64)
|
||||
#if defined(_WIN32)
|
||||
|
||||
void aligned_ttmem_free(void* mem) {
|
||||
void aligned_large_pages_free(void* mem) {
|
||||
|
||||
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
|
||||
{
|
||||
|
@ -478,8 +471,8 @@ void aligned_ttmem_free(void* mem) {
|
|||
|
||||
#else
|
||||
|
||||
void aligned_ttmem_free(void *mem) {
|
||||
free(mem);
|
||||
void aligned_large_pages_free(void *mem) {
|
||||
std_aligned_free(mem);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -33,8 +33,8 @@ void prefetch(void* addr);
|
|||
void start_logger(const std::string& fname);
|
||||
void* std_aligned_alloc(size_t alignment, size_t size);
|
||||
void std_aligned_free(void* ptr);
|
||||
void* aligned_ttmem_alloc(size_t size, void*& mem);
|
||||
void aligned_ttmem_free(void* mem); // nop if mem == nullptr
|
||||
void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
|
||||
void aligned_large_pages_free(void* mem); // nop if mem == nullptr
|
||||
|
||||
void dbg_hit_on(bool b);
|
||||
void dbg_hit_on(bool c, bool b);
|
||||
|
|
|
@ -52,7 +52,7 @@ namespace Eval::NNUE {
|
|||
};
|
||||
|
||||
// Input feature converter
|
||||
AlignedPtr<FeatureTransformer> feature_transformer;
|
||||
LargePagePtr<FeatureTransformer> feature_transformer;
|
||||
|
||||
// Evaluation function
|
||||
AlignedPtr<Network> network;
|
||||
|
@ -70,14 +70,22 @@ namespace Eval::NNUE {
|
|||
std::memset(pointer.get(), 0, sizeof(T));
|
||||
}
|
||||
|
||||
// Overload of Initialize() for objects held by a LargePagePtr: obtains
// sizeof(T) bytes from aligned_large_pages_alloc(), hands ownership of that
// storage to `pointer`, and zero-fills it. No constructor of T is invoked
// here; the object is only byte-initialized via memset.
template <typename T>
|
||||
void Initialize(LargePagePtr<T>& pointer) {
|
||||
|
||||
// aligned_large_pages_alloc() is declared as guaranteeing a minimum alignment
// of 4096 bytes, so any T requiring more than that cannot be served safely.
static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
|
||||
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
|
||||
// NOTE(review): the allocation result is not checked before the memset;
// confirm whether aligned_large_pages_alloc() can return nullptr on this path.
std::memset(pointer.get(), 0, sizeof(T));
|
||||
}
|
||||
|
||||
// Read evaluation function parameters
|
||||
template <typename T>
|
||||
bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
|
||||
bool ReadParameters(std::istream& stream, T& reference) {
|
||||
|
||||
std::uint32_t header;
|
||||
header = read_little_endian<std::uint32_t>(stream);
|
||||
if (!stream || header != T::GetHashValue()) return false;
|
||||
return pointer->ReadParameters(stream);
|
||||
return reference.ReadParameters(stream);
|
||||
}
|
||||
|
||||
} // namespace Detail
|
||||
|
@ -110,8 +118,8 @@ namespace Eval::NNUE {
|
|||
std::string architecture;
|
||||
if (!ReadHeader(stream, &hash_value, &architecture)) return false;
|
||||
if (hash_value != kHashValue) return false;
|
||||
if (!Detail::ReadParameters(stream, feature_transformer)) return false;
|
||||
if (!Detail::ReadParameters(stream, network)) return false;
|
||||
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
|
||||
if (!Detail::ReadParameters(stream, *network)) return false;
|
||||
return stream && stream.peek() == std::ios::traits_type::eof();
|
||||
}
|
||||
|
||||
|
|
|
@ -40,9 +40,20 @@ namespace Eval::NNUE {
|
|||
}
|
||||
};
|
||||
|
||||
// Deleter used by LargePagePtr: destroys the object in place and then
// releases its storage with aligned_large_pages_free().
template <typename T>
|
||||
struct TtmemDeleter {
|
||||
void operator()(T* ptr) const {
|
||||
// Run the destructor explicitly; the storage itself is released separately below.
ptr->~T();
|
||||
aligned_large_pages_free(ptr);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
|
||||
|
||||
template <typename T>
|
||||
using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;
|
||||
|
||||
} // namespace Eval::NNUE
|
||||
|
||||
#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
|
||||
|
|
|
@ -62,11 +62,12 @@ void TranspositionTable::resize(size_t mbSize) {
|
|||
|
||||
Threads.main()->wait_for_search_finished();
|
||||
|
||||
aligned_ttmem_free(mem);
|
||||
aligned_large_pages_free(table);
|
||||
|
||||
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
|
||||
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
|
||||
if (!mem)
|
||||
|
||||
table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
|
||||
if (!table)
|
||||
{
|
||||
std::cerr << "Failed to allocate " << mbSize
|
||||
<< "MB for transposition table." << std::endl;
|
||||
|
|
3
src/tt.h
3
src/tt.h
|
@ -73,7 +73,7 @@ class TranspositionTable {
|
|||
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
|
||||
|
||||
public:
|
||||
~TranspositionTable() { aligned_ttmem_free(mem); }
|
||||
~TranspositionTable() { aligned_large_pages_free(table); }
|
||||
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
|
||||
TTEntry* probe(const Key key, bool& found) const;
|
||||
int hashfull() const;
|
||||
|
@ -89,7 +89,6 @@ private:
|
|||
|
||||
size_t clusterCount;
|
||||
Cluster* table;
|
||||
void* mem;
|
||||
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
|
||||
};
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue