mirror of
https://github.com/sockspls/badfish
synced 2025-05-01 01:03:09 +00:00
Add large page support for NNUE weights and simplify TT mem management
Use TT memory functions to allocate memory for the NNUE weights. This should provide a small speed-up on systems where large pages are not automatically used, including Windows and some Linux distributions. Further, since we now have a wrapper for std::aligned_alloc(), we can simplify the TT memory management a bit: - We no longer need to store separate pointers to the hash table and its underlying memory allocation. - We also get to merge the Linux-specific and default implementations of aligned_ttmem_alloc(). Finally, we'll enable the VirtualAlloc code path with large page support also for Win32. STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba LLR: 2.94 (-2.94,2.94) {-0.25,1.25} Total: 14896 W: 1854 L: 1686 D: 11356 Ptnml(0-2): 65, 1224, 4742, 1312, 105 closes https://github.com/official-stockfish/Stockfish/pull/3081 No functional change.
This commit is contained in:
parent
16b4578cc1
commit
485d517c68
7 changed files with 57 additions and 45 deletions
|
@@ -152,7 +152,7 @@ to find the best move. The classical evaluation computes this value as a functio
|
||||||
of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
|
of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
|
||||||
The NNUE evaluation computes this value with a neural network based on basic
|
The NNUE evaluation computes this value with a neural network based on basic
|
||||||
inputs (e.g. piece positions only). The network is optimized and trained
|
inputs (e.g. piece positions only). The network is optimized and trained
|
||||||
on the evalutions of millions of positions at moderate search depth.
|
on the evaluations of millions of positions at moderate search depth.
|
||||||
|
|
||||||
The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
|
The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
|
||||||
It can be evaluated efficiently on CPUs, and exploits the fact that only parts
|
It can be evaluated efficiently on CPUs, and exploits the fact that only parts
|
||||||
|
|
57
src/misc.cpp
57
src/misc.cpp
|
@@ -357,27 +357,11 @@ void std_aligned_free(void* ptr) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
|
/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
|
||||||
/// The returned pointer is the aligned one, while the mem argument is the one that needs
|
|
||||||
/// to be passed to free. With c++17 some of this functionality could be simplified.
|
|
||||||
|
|
||||||
#if defined(__linux__) && !defined(__ANDROID__)
|
#if defined(_WIN32)
|
||||||
|
|
||||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
static void* aligned_large_pages_alloc_win(size_t allocSize) {
|
||||||
|
|
||||||
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
|
|
||||||
size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
|
|
||||||
if (posix_memalign(&mem, alignment, size))
|
|
||||||
mem = nullptr;
|
|
||||||
#if defined(MADV_HUGEPAGE)
|
|
||||||
madvise(mem, allocSize, MADV_HUGEPAGE);
|
|
||||||
#endif
|
|
||||||
return mem;
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(_WIN64)
|
|
||||||
|
|
||||||
static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
|
|
||||||
|
|
||||||
HANDLE hProcessToken { };
|
HANDLE hProcessToken { };
|
||||||
LUID luid { };
|
LUID luid { };
|
||||||
|
@@ -422,12 +406,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
|
||||||
return mem;
|
return mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
void* aligned_large_pages_alloc(size_t allocSize) {
|
||||||
|
|
||||||
static bool firstCall = true;
|
static bool firstCall = true;
|
||||||
|
void* mem;
|
||||||
|
|
||||||
// Try to allocate large pages
|
// Try to allocate large pages
|
||||||
mem = aligned_ttmem_alloc_large_pages(allocSize);
|
mem = aligned_large_pages_alloc_win(allocSize);
|
||||||
|
|
||||||
// Suppress info strings on the first call. The first call occurs before 'uci'
|
// Suppress info strings on the first call. The first call occurs before 'uci'
|
||||||
// is received and in that case this output confuses some GUIs.
|
// is received and in that case this output confuses some GUIs.
|
||||||
|
@@ -449,23 +434,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
|
void* aligned_large_pages_alloc(size_t allocSize) {
|
||||||
|
|
||||||
constexpr size_t alignment = 64; // assumed cache line size
|
#if defined(__linux__)
|
||||||
size_t size = allocSize + alignment - 1; // allocate some extra space
|
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
|
||||||
mem = malloc(size);
|
#else
|
||||||
void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
|
constexpr size_t alignment = 4096; // assumed small page size
|
||||||
return ret;
|
#endif
|
||||||
|
|
||||||
|
// round up to multiples of alignment
|
||||||
|
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
|
||||||
|
void *mem = std_aligned_alloc(alignment, size);
|
||||||
|
#if defined(MADV_HUGEPAGE)
|
||||||
|
madvise(mem, size, MADV_HUGEPAGE);
|
||||||
|
#endif
|
||||||
|
return mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/// aligned_ttmem_free() will free the previously allocated ttmem
|
/// aligned_large_pages_free() will free the previously allocated ttmem
|
||||||
|
|
||||||
#if defined(_WIN64)
|
#if defined(_WIN32)
|
||||||
|
|
||||||
void aligned_ttmem_free(void* mem) {
|
void aligned_large_pages_free(void* mem) {
|
||||||
|
|
||||||
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
|
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
|
||||||
{
|
{
|
||||||
|
@@ -478,8 +471,8 @@ void aligned_ttmem_free(void* mem) {
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void aligned_ttmem_free(void *mem) {
|
void aligned_large_pages_free(void *mem) {
|
||||||
free(mem);
|
std_aligned_free(mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@@ -33,8 +33,8 @@ void prefetch(void* addr);
|
||||||
void start_logger(const std::string& fname);
|
void start_logger(const std::string& fname);
|
||||||
void* std_aligned_alloc(size_t alignment, size_t size);
|
void* std_aligned_alloc(size_t alignment, size_t size);
|
||||||
void std_aligned_free(void* ptr);
|
void std_aligned_free(void* ptr);
|
||||||
void* aligned_ttmem_alloc(size_t size, void*& mem);
|
void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
|
||||||
void aligned_ttmem_free(void* mem); // nop if mem == nullptr
|
void aligned_large_pages_free(void* mem); // nop if mem == nullptr
|
||||||
|
|
||||||
void dbg_hit_on(bool b);
|
void dbg_hit_on(bool b);
|
||||||
void dbg_hit_on(bool c, bool b);
|
void dbg_hit_on(bool c, bool b);
|
||||||
|
|
|
@@ -52,7 +52,7 @@ namespace Eval::NNUE {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Input feature converter
|
// Input feature converter
|
||||||
AlignedPtr<FeatureTransformer> feature_transformer;
|
LargePagePtr<FeatureTransformer> feature_transformer;
|
||||||
|
|
||||||
// Evaluation function
|
// Evaluation function
|
||||||
AlignedPtr<Network> network;
|
AlignedPtr<Network> network;
|
||||||
|
@@ -70,14 +70,22 @@ namespace Eval::NNUE {
|
||||||
std::memset(pointer.get(), 0, sizeof(T));
|
std::memset(pointer.get(), 0, sizeof(T));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void Initialize(LargePagePtr<T>& pointer) {
|
||||||
|
|
||||||
|
static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
|
||||||
|
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
|
||||||
|
std::memset(pointer.get(), 0, sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
// Read evaluation function parameters
|
// Read evaluation function parameters
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
|
bool ReadParameters(std::istream& stream, T& reference) {
|
||||||
|
|
||||||
std::uint32_t header;
|
std::uint32_t header;
|
||||||
header = read_little_endian<std::uint32_t>(stream);
|
header = read_little_endian<std::uint32_t>(stream);
|
||||||
if (!stream || header != T::GetHashValue()) return false;
|
if (!stream || header != T::GetHashValue()) return false;
|
||||||
return pointer->ReadParameters(stream);
|
return reference.ReadParameters(stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace Detail
|
} // namespace Detail
|
||||||
|
@@ -110,8 +118,8 @@ namespace Eval::NNUE {
|
||||||
std::string architecture;
|
std::string architecture;
|
||||||
if (!ReadHeader(stream, &hash_value, &architecture)) return false;
|
if (!ReadHeader(stream, &hash_value, &architecture)) return false;
|
||||||
if (hash_value != kHashValue) return false;
|
if (hash_value != kHashValue) return false;
|
||||||
if (!Detail::ReadParameters(stream, feature_transformer)) return false;
|
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
|
||||||
if (!Detail::ReadParameters(stream, network)) return false;
|
if (!Detail::ReadParameters(stream, *network)) return false;
|
||||||
return stream && stream.peek() == std::ios::traits_type::eof();
|
return stream && stream.peek() == std::ios::traits_type::eof();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -40,9 +40,20 @@ namespace Eval::NNUE {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
struct TtmemDeleter {
|
||||||
|
void operator()(T* ptr) const {
|
||||||
|
ptr->~T();
|
||||||
|
aligned_large_pages_free(ptr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
|
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;
|
||||||
|
|
||||||
} // namespace Eval::NNUE
|
} // namespace Eval::NNUE
|
||||||
|
|
||||||
#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
|
#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
|
||||||
|
|
|
@@ -62,11 +62,12 @@ void TranspositionTable::resize(size_t mbSize) {
|
||||||
|
|
||||||
Threads.main()->wait_for_search_finished();
|
Threads.main()->wait_for_search_finished();
|
||||||
|
|
||||||
aligned_ttmem_free(mem);
|
aligned_large_pages_free(table);
|
||||||
|
|
||||||
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
|
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
|
||||||
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
|
|
||||||
if (!mem)
|
table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
|
||||||
|
if (!table)
|
||||||
{
|
{
|
||||||
std::cerr << "Failed to allocate " << mbSize
|
std::cerr << "Failed to allocate " << mbSize
|
||||||
<< "MB for transposition table." << std::endl;
|
<< "MB for transposition table." << std::endl;
|
||||||
|
|
3
src/tt.h
3
src/tt.h
|
@@ -73,7 +73,7 @@ class TranspositionTable {
|
||||||
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
|
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
|
||||||
|
|
||||||
public:
|
public:
|
||||||
~TranspositionTable() { aligned_ttmem_free(mem); }
|
~TranspositionTable() { aligned_large_pages_free(table); }
|
||||||
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
|
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
|
||||||
TTEntry* probe(const Key key, bool& found) const;
|
TTEntry* probe(const Key key, bool& found) const;
|
||||||
int hashfull() const;
|
int hashfull() const;
|
||||||
|
@@ -89,7 +89,6 @@ private:
|
||||||
|
|
||||||
size_t clusterCount;
|
size_t clusterCount;
|
||||||
Cluster* table;
|
Cluster* table;
|
||||||
void* mem;
|
|
||||||
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
|
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue