From 00a28ae325688346e63a452b2050bd1491085359 Mon Sep 17 00:00:00 2001 From: Disservin Date: Fri, 31 May 2024 10:53:10 +0200 Subject: [PATCH] Add helpers for managing aligned memory Previously, we had two type aliases, LargePagePtr and AlignedPtr, which required manually initializing the aligned memory for the pointer. The new helpers: - make_unique_aligned - make_unique_large_page are now available for allocating aligned memory (with large pages). They behave similarly to std::make_unique, ensuring objects allocated with these functions follow RAII. The old approach had issues with initializing non-trivial types or arrays of objects. The evaluation function of the network is now a unique pointer to an array instead of an array of unique pointers. Memory-related functions have been moved into memory.h. Passed High Hash Pressure Test Non-Regression STC: https://tests.stockfishchess.org/tests/view/665b2b36586058766677cfd2 LLR: 2.93 (-2.94,2.94) <-1.75,0.25> Total: 476992 W: 122426 L: 122677 D: 231889 Ptnml(0-2): 1145, 51027, 134419, 50744, 1161 Failed Normal Non-Regression STC: https://tests.stockfishchess.org/tests/view/665b2997586058766677cfc8 LLR: -2.94 (-2.94,2.94) <-1.75,0.25> Total: 877312 W: 225233 L: 226395 D: 425684 Ptnml(0-2): 2110, 94642, 246239, 93630, 2035 Probably a fluke since there shouldn't be a real slowdown and it has also passed the high hash pressure test. closes https://github.com/official-stockfish/Stockfish/pull/5332 No functional change --- src/Makefile | 8 +- src/memory.cpp | 229 +++++++++++++++++++++++++++++++++++++++++++ src/memory.h | 215 ++++++++++++++++++++++++++++++++++++++++ src/misc.cpp | 199 +------------------------------------ src/misc.h | 48 +-------- src/nnue/network.cpp | 77 +++++---------- src/nnue/network.h | 6 +- src/numa.h | 1 + src/thread.h | 4 +- src/tt.cpp | 7 +- src/tt.h | 10 +- 11 files changed, 492 insertions(+), 312 deletions(-) create mode 100644 src/memory.cpp create mode 100644 src/memory.h diff --git a/src/Makefile b/src/Makefile index 5119b615..29c4f879 100644 --- a/src/Makefile +++ b/src/Makefile @@ -55,7 +55,7 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \ misc.cpp movegen.cpp movepick.cpp position.cpp \ search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ - nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp + nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \ @@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h + tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h OBJS = $(notdir $(SRCS:.cpp=.o)) @@ -489,8 +489,8 @@ ifeq ($(COMP),clang) endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -mmacosx-version-min=10.14 - LDFLAGS += -mmacosx-version-min=10.14 + CXXFLAGS += -mmacosx-version-min=10.15 + LDFLAGS += -mmacosx-version-min=10.15 ifneq ($(arch),any) CXXFLAGS += -arch $(arch) LDFLAGS += -arch $(arch) diff --git a/src/memory.cpp
b/src/memory.cpp new file mode 100644 index 00000000..565b39b2 --- /dev/null +++ b/src/memory.cpp @@ -0,0 +1,229 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "memory.h" + +#include + +#if __has_include("features.h") + #include +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + #include +#endif + +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \ + || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \ + || defined(__e2k__) + #define POSIXALIGNEDALLOC + #include +#endif + +#ifdef _WIN32 + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + + #ifndef NOMINMAX + #define NOMINMAX + #endif + + #include // std::hex, std::dec + #include // std::cerr + #include // std::endl + #include +// The needed Windows API for processor groups could be missed from old Windows +// versions, so instead of calling them directly (forcing the linker to resolve +// the calls at compile time), try to load them at runtime. To do this we need +// first to define the corresponding function pointers. +extern "C" { +using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE); +using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID); +using AdjustTokenPrivileges_t = + bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD); +} +#endif + + +namespace Stockfish { + +// Wrapper for systems where the c++17 implementation +// does not guarantee the availability of aligned_alloc(). Memory allocated with +// std_aligned_alloc() must be freed with std_aligned_free(). +void* std_aligned_alloc(size_t alignment, size_t size) { + // Apple requires 10.15, which is enforced in the makefile +#if defined(_ISOC11_SOURCE) || defined(__APPLE__) + return aligned_alloc(alignment, size); +#elif defined(POSIXALIGNEDALLOC) + void* mem; + return posix_memalign(&mem, alignment, size) ? nullptr : mem; +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + return _mm_malloc(size, alignment); +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + return std::aligned_alloc(alignment, size); +#endif +} + +void std_aligned_free(void* ptr) { + +#if defined(POSIXALIGNEDALLOC) + free(ptr); +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + _mm_free(ptr); +#elif defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages. 
+ +#if defined(_WIN32) + +static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) { + + #if !defined(_WIN64) + return nullptr; + #else + + HANDLE hProcessToken{}; + LUID luid{}; + void* mem = nullptr; + + const size_t largePageSize = GetLargePageMinimum(); + if (!largePageSize) + return nullptr; + + // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges + + HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll")); + + if (!hAdvapi32) + hAdvapi32 = LoadLibrary(TEXT("advapi32.dll")); + + auto OpenProcessToken_f = + OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken")); + if (!OpenProcessToken_f) + return nullptr; + auto LookupPrivilegeValueA_f = + LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA")); + if (!LookupPrivilegeValueA_f) + return nullptr; + auto AdjustTokenPrivileges_f = + AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges")); + if (!AdjustTokenPrivileges_f) + return nullptr; + + // We need SeLockMemoryPrivilege, so try to enable it for the process + if (!OpenProcessToken_f( // OpenProcessToken() + GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) + return nullptr; + + if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid)) + { + TOKEN_PRIVILEGES tp{}; + TOKEN_PRIVILEGES prevTp{}; + DWORD prevTpLen = 0; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Luid = luid; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds, + // we still need to query GetLastError() to ensure that the privileges were actually obtained. + if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, + &prevTpLen) + && GetLastError() == ERROR_SUCCESS) + { + // Round up size to full pages and allocate + allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); + mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, + PAGE_READWRITE); + + // Privilege no longer needed, restore previous state + AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr); + } + } + + CloseHandle(hProcessToken); + + return mem; + + #endif +} + +void* aligned_large_pages_alloc(size_t allocSize) { + + // Try to allocate large pages + void* mem = aligned_large_pages_alloc_windows(allocSize); + + // Fall back to regular, page-aligned, allocation if necessary + if (!mem) + mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + + return mem; +} + +#else + +void* aligned_large_pages_alloc(size_t allocSize) { + + #if defined(__linux__) + constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size + #else + constexpr size_t alignment = 4096; // assumed small page size + #endif + + // Round up to multiples of alignment + size_t size = ((allocSize + alignment - 1) / alignment) * alignment; + void* mem = std_aligned_alloc(alignment, size); + #if defined(MADV_HUGEPAGE) + madvise(mem, size, MADV_HUGEPAGE); + #endif + return mem; +} + +#endif + + +// aligned_large_pages_free() will free the previously allocated ttmem + +#if defined(_WIN32) + +void aligned_large_pages_free(void* mem) { + + if (mem && !VirtualFree(mem, 0, MEM_RELEASE)) + { + DWORD err = GetLastError(); + std::cerr << "Failed to free large page memory. 
Error code: 0x" << std::hex << err + << std::dec << std::endl; + exit(EXIT_FAILURE); + } +} + +#else + +void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } + +#endif +} // namespace Stockfish diff --git a/src/memory.h b/src/memory.h new file mode 100644 index 00000000..ad7ca602 --- /dev/null +++ b/src/memory.h @@ -0,0 +1,215 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef MEMORY_H_INCLUDED +#define MEMORY_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +namespace Stockfish { + +void* std_aligned_alloc(size_t alignment, size_t size); +void std_aligned_free(void* ptr); +// memory aligned by page size, min alignment: 4096 bytes +void* aligned_large_pages_alloc(size_t size); +// nop if mem == nullptr +void aligned_large_pages_free(void* mem); + +// frees memory which was placed there with placement new. +// works for both single objects and arrays of unknown bound +template +void memory_deleter(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + // Explicitly needed to call the destructor + if constexpr (!std::is_trivially_destructible_v) + ptr->~T(); + + free_func(ptr); + return; +} + +// frees memory which was placed there with placement new. +// works for both single objects and arrays of unknown bound +template +void memory_deleter_array(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + + // Move back on the pointer to where the size is allocated. + const size_t array_offset = std::max(sizeof(size_t), alignof(T)); + char* raw_memory = reinterpret_cast(ptr) - array_offset; + + if constexpr (!std::is_trivially_destructible_v) + { + const size_t size = *reinterpret_cast(raw_memory); + + // Explicitly call the destructor for each element in reverse order + for (size_t i = size; i-- > 0;) + ptr[i].~T(); + } + + free_func(raw_memory); +} + +// Allocates memory for a single object and places it there with placement new. +template +inline std::enable_if_t, T*> memory_allocator(ALLOC_FUNC alloc_func, + Args&&... args) { + void* raw_memory = alloc_func(sizeof(T)); + ASSERT_ALIGNED(raw_memory, alignof(T)); + return new (raw_memory) T(std::forward(args)...); +} + +// Allocates memory for an array of unknown bound and places it there with placement new. 
+template +inline std::enable_if_t, std::remove_extent_t*> +memory_allocator(ALLOC_FUNC alloc_func, size_t num) { + using ElementType = std::remove_extent_t; + + const size_t array_offset = std::max(sizeof(size_t), alignof(ElementType)); + + // save the array size in the memory location + char* raw_memory = + reinterpret_cast(alloc_func(array_offset + num * sizeof(ElementType))); + ASSERT_ALIGNED(raw_memory, alignof(T)); + + new (raw_memory) size_t(num); + + for (size_t i = 0; i < num; ++i) + new (raw_memory + array_offset + i * sizeof(ElementType)) ElementType(); + + // Need to return the pointer at the start of the array so that the indexing in unique_ptr works + return reinterpret_cast(raw_memory + array_offset); +} + +// +// +// aligned large page unique ptr +// +// + +template +struct LargePageDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, aligned_large_pages_free); } +}; + +template +struct LargePageArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, aligned_large_pages_free); } +}; + +template +using LargePagePtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_large_page for single objects +template +std::enable_if_t, LargePagePtr> make_unique_large_page(Args&&... args) { + static_assert(alignof(T) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + T* obj = memory_allocator(aligned_large_pages_alloc, std::forward(args)...); + + return LargePagePtr(obj); +} + +// make_unique_large_page for arrays of unknown bound +template +std::enable_if_t, LargePagePtr> make_unique_large_page(size_t num) { + using ElementType = std::remove_extent_t; + + static_assert(alignof(ElementType) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + ElementType* memory = memory_allocator(aligned_large_pages_alloc, num); + + return LargePagePtr(memory); +} + +// +// +// aligned unique ptr +// +// + +template +struct AlignedDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, std_aligned_free); } +}; + +template +struct AlignedArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, std_aligned_free); } +}; + +template +using AlignedPtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_aligned for single objects +template +std::enable_if_t, AlignedPtr> make_unique_aligned(Args&&... args) { + const auto func = [](size_t size) { return std_aligned_alloc(alignof(T), size); }; + T* obj = memory_allocator(func, std::forward(args)...); + + return AlignedPtr(obj); +} + +// make_unique_aligned for arrays of unknown bound +template +std::enable_if_t, AlignedPtr> make_unique_aligned(size_t num) { + using ElementType = std::remove_extent_t; + + const auto func = [](size_t size) { return std_aligned_alloc(alignof(ElementType), size); }; + ElementType* memory = memory_allocator(func, num); + + return AlignedPtr(memory); +} + + +// Get the first aligned element of an array. +// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, +// where N is the number of elements in the array. 
+template +T* align_ptr_up(T* ptr) { + static_assert(alignof(T) < Alignment); + + const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); + return reinterpret_cast( + reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); +} + + +} // namespace Stockfish + +#endif // #ifndef MEMORY_H_INCLUDED diff --git a/src/misc.cpp b/src/misc.cpp index aa22e61f..a8bb46ec 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -18,29 +18,6 @@ #include "misc.h" -#ifdef _WIN32 - #if _WIN32_WINNT < 0x0601 - #undef _WIN32_WINNT - #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes - #endif - - #ifndef NOMINMAX - #define NOMINMAX - #endif - - #include -// The needed Windows API for processor groups could be missed from old Windows -// versions, so instead of calling them directly (forcing the linker to resolve -// the calls at compile time), try to load them at runtime. To do this we need -// first to define the corresponding function pointers. -extern "C" { -using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE); -using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID); -using AdjustTokenPrivileges_t = - bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD); -} -#endif - #include #include #include @@ -48,25 +25,14 @@ using AdjustTokenPrivileges_t = #include #include #include -#include #include +#include #include #include #include #include "types.h" -#if defined(__linux__) && !defined(__ANDROID__) - #include -#endif - -#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \ - || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \ - || defined(__e2k__) - #define POSIXALIGNEDALLOC - #include -#endif - namespace Stockfish { namespace { @@ -427,169 +393,6 @@ void prefetch(const void* addr) { #endif - -// Wrapper for systems where the c++17 implementation -// does not guarantee the availability of aligned_alloc(). Memory allocated with -// std_aligned_alloc() must be freed with std_aligned_free(). -void* std_aligned_alloc(size_t alignment, size_t size) { - -#if defined(POSIXALIGNEDALLOC) - void* mem; - return posix_memalign(&mem, alignment, size) ? nullptr : mem; -#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) - return _mm_malloc(size, alignment); -#elif defined(_WIN32) - return _aligned_malloc(size, alignment); -#else - return std::aligned_alloc(alignment, size); -#endif -} - -void std_aligned_free(void* ptr) { - -#if defined(POSIXALIGNEDALLOC) - free(ptr); -#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) - _mm_free(ptr); -#elif defined(_WIN32) - _aligned_free(ptr); -#else - free(ptr); -#endif -} - -// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages. 
- -#if defined(_WIN32) - -static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) { - - #if !defined(_WIN64) - return nullptr; - #else - - HANDLE hProcessToken{}; - LUID luid{}; - void* mem = nullptr; - - const size_t largePageSize = GetLargePageMinimum(); - if (!largePageSize) - return nullptr; - - // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges - - HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll")); - - if (!hAdvapi32) - hAdvapi32 = LoadLibrary(TEXT("advapi32.dll")); - - auto OpenProcessToken_f = - OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken")); - if (!OpenProcessToken_f) - return nullptr; - auto LookupPrivilegeValueA_f = - LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA")); - if (!LookupPrivilegeValueA_f) - return nullptr; - auto AdjustTokenPrivileges_f = - AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges")); - if (!AdjustTokenPrivileges_f) - return nullptr; - - // We need SeLockMemoryPrivilege, so try to enable it for the process - if (!OpenProcessToken_f( // OpenProcessToken() - GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) - return nullptr; - - if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid)) - { - TOKEN_PRIVILEGES tp{}; - TOKEN_PRIVILEGES prevTp{}; - DWORD prevTpLen = 0; - - tp.PrivilegeCount = 1; - tp.Privileges[0].Luid = luid; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - - // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds, - // we still need to query GetLastError() to ensure that the privileges were actually obtained. - if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, - &prevTpLen) - && GetLastError() == ERROR_SUCCESS) - { - // Round up size to full pages and allocate - allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); - mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, - PAGE_READWRITE); - - // Privilege no longer needed, restore previous state - AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr); - } - } - - CloseHandle(hProcessToken); - - return mem; - - #endif -} - -void* aligned_large_pages_alloc(size_t allocSize) { - - // Try to allocate large pages - void* mem = aligned_large_pages_alloc_windows(allocSize); - - // Fall back to regular, page-aligned, allocation if necessary - if (!mem) - mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); - - return mem; -} - -#else - -void* aligned_large_pages_alloc(size_t allocSize) { - - #if defined(__linux__) - constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size - #else - constexpr size_t alignment = 4096; // assumed small page size - #endif - - // Round up to multiples of alignment - size_t size = ((allocSize + alignment - 1) / alignment) * alignment; - void* mem = std_aligned_alloc(alignment, size); - #if defined(MADV_HUGEPAGE) - madvise(mem, size, MADV_HUGEPAGE); - #endif - return mem; -} - -#endif - - -// aligned_large_pages_free() will free the previously allocated ttmem - -#if defined(_WIN32) - -void aligned_large_pages_free(void* mem) { - - if (mem && !VirtualFree(mem, 0, MEM_RELEASE)) - { - DWORD err = GetLastError(); - std::cerr << "Failed to free large page memory. 
Error code: 0x" << std::hex << err - << std::dec << std::endl; - exit(EXIT_FAILURE); - } -} - -#else - -void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } - -#endif - - #ifdef _WIN32 #include #define GETCWD _getcwd diff --git a/src/misc.h b/src/misc.h index 5c0bde44..557a4d8c 100644 --- a/src/misc.h +++ b/src/misc.h @@ -26,10 +26,9 @@ #include #include #include -#include +#include #include #include -#include #define stringify2(x) #x #define stringify(x) stringify2(x) @@ -44,39 +43,10 @@ std::string compiler_info(); // which can be quite slow. void prefetch(const void* addr); -void start_logger(const std::string& fname); -void* std_aligned_alloc(size_t alignment, size_t size); -void std_aligned_free(void* ptr); -// memory aligned by page size, min alignment: 4096 bytes -void* aligned_large_pages_alloc(size_t size); -// nop if mem == nullptr -void aligned_large_pages_free(void* mem); +void start_logger(const std::string& fname); size_t str_to_size_t(const std::string& s); -// Deleter for automating release of memory area -template -struct AlignedDeleter { - void operator()(T* ptr) const { - ptr->~T(); - std_aligned_free(ptr); - } -}; - -template -struct LargePageDeleter { - void operator()(T* ptr) const { - ptr->~T(); - aligned_large_pages_free(ptr); - } -}; - -template -using AlignedPtr = std::unique_ptr>; - -template -using LargePagePtr = std::unique_ptr>; - #if defined(__linux__) struct PipeDeleter { @@ -141,20 +111,6 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK - -// Get the first aligned element of an array. -// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, -// where N is the number of elements in the array. -template -T* align_ptr_up(T* ptr) { - static_assert(alignof(T) < Alignment); - - const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); - return reinterpret_cast( - reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); -} - - // True if and only if the binary is compiled on a little-endian machine static inline const union { uint32_t i; diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index db864fcd..71c384ff 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -30,6 +29,7 @@ #include "../evaluate.h" #include "../incbin/incbin.h" +#include "../memory.h" #include "../misc.h" #include "../position.h" #include "../types.h" @@ -86,23 +86,6 @@ namespace Stockfish::Eval::NNUE { namespace Detail { -// Initialize the evaluation function parameters -template -void initialize(AlignedPtr& pointer) { - - pointer.reset(reinterpret_cast(std_aligned_alloc(alignof(T), sizeof(T)))); - std::memset(pointer.get(), 0, sizeof(T)); -} - -template -void initialize(LargePagePtr& pointer) { - - static_assert(alignof(T) <= 4096, - "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); - pointer.reset(reinterpret_cast(aligned_large_pages_alloc(sizeof(T)))); - std::memset(pointer.get(), 0, sizeof(T)); -} - // Read evaluation function parameters template bool read_parameters(std::istream& stream, T& reference) { @@ -128,19 +111,17 @@ template Network::Network(const Network& other) : evalFile(other.evalFile), embeddedType(other.embeddedType) { + if (other.featureTransformer) - { - Detail::initialize(featureTransformer); - *featureTransformer = *other.featureTransformer; - } + featureTransformer = 
make_unique_large_page(*other.featureTransformer); + + network = make_unique_aligned(LayerStacks); + + if (!other.network) + return; + for (std::size_t i = 0; i < LayerStacks; ++i) - { - if (other.network[i]) - { - Detail::initialize(network[i]); - *(network[i]) = *(other.network[i]); - } - } + network[i] = other.network[i]; } template @@ -150,18 +131,15 @@ Network::operator=(const Network& other) { embeddedType = other.embeddedType; if (other.featureTransformer) - { - Detail::initialize(featureTransformer); - *featureTransformer = *other.featureTransformer; - } + featureTransformer = make_unique_large_page(*other.featureTransformer); + + network = make_unique_aligned(LayerStacks); + + if (!other.network) + return *this; + for (std::size_t i = 0; i < LayerStacks; ++i) - { - if (other.network[i]) - { - Detail::initialize(network[i]); - *(network[i]) = *(other.network[i]); - } - } + network[i] = other.network[i]; return *this; } @@ -253,7 +231,7 @@ Value Network::evaluate(const Position& const int bucket = (pos.count() - 1) / 4; const auto psqt = featureTransformer->transform(pos, cache, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto positional = network[bucket].propagate(transformedFeatures); if (complexity) *complexity = std::abs(psqt - positional) / OutputScale; @@ -292,11 +270,11 @@ void Network::verify(std::string evalfilePath) const { exit(EXIT_FAILURE); } - size_t size = sizeof(*featureTransformer) + sizeof(*network) * LayerStacks; + size_t size = sizeof(*featureTransformer) + sizeof(Arch) * LayerStacks; sync_cout << "info string NNUE evaluation using " << evalfilePath << " (" << size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", " - << network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS - << ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl; + << network[0].TransformedFeatureDimensions << ", " << network[0].FC_0_OUTPUTS << ", " + << network[0].FC_1_OUTPUTS << ", 1))" << sync_endl; } @@ -333,7 +311,7 @@ Network::trace_evaluate(const Position& { const auto materialist = featureTransformer->transform(pos, cache, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto positional = network[bucket].propagate(transformedFeatures); t.psqt[bucket] = static_cast(materialist / OutputScale); t.positional[bucket] = static_cast(positional / OutputScale); @@ -386,9 +364,8 @@ void Network::load_internal() { template void Network::initialize() { - Detail::initialize(featureTransformer); - for (std::size_t i = 0; i < LayerStacks; ++i) - Detail::initialize(network[i]); + featureTransformer = make_unique_large_page(); + network = make_unique_aligned(LayerStacks); } @@ -455,7 +432,7 @@ bool Network::read_parameters(std::istream& stream, return false; for (std::size_t i = 0; i < LayerStacks; ++i) { - if (!Detail::read_parameters(stream, *(network[i]))) + if (!Detail::read_parameters(stream, network[i])) return false; } return stream && stream.peek() == std::ios::traits_type::eof(); @@ -471,7 +448,7 @@ bool Network::write_parameters(std::ostream& stream, return false; for (std::size_t i = 0; i < LayerStacks; ++i) { - if (!Detail::write_parameters(stream, *(network[i]))) + if (!Detail::write_parameters(stream, network[i])) return false; } return bool(stream); diff --git a/src/nnue/network.h b/src/nnue/network.h index f0ccfafc..6ba3cfba 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -25,13 +25,13 @@ 
#include #include -#include "../misc.h" +#include "../memory.h" #include "../position.h" #include "../types.h" +#include "nnue_accumulator.h" #include "nnue_architecture.h" #include "nnue_feature_transformer.h" #include "nnue_misc.h" -#include "nnue_accumulator.h" namespace Stockfish::Eval::NNUE { @@ -91,7 +91,7 @@ class Network { LargePagePtr featureTransformer; // Evaluation function - AlignedPtr network[LayerStacks]; + AlignedPtr network; EvalFile evalFile; EmbeddedNNUEType embeddedType; diff --git a/src/numa.h b/src/numa.h index 5934a0cd..a56d7142 100644 --- a/src/numa.h +++ b/src/numa.h @@ -32,6 +32,7 @@ #include #include #include +#include // We support linux very well, but we explicitly do NOT support Android, because there's // no affected systems, not worth maintaining. diff --git a/src/thread.h b/src/thread.h index 102b2299..7416271b 100644 --- a/src/thread.h +++ b/src/thread.h @@ -23,15 +23,15 @@ #include #include #include +#include #include #include #include -#include +#include "numa.h" #include "position.h" #include "search.h" #include "thread_win32_osx.h" -#include "numa.h" namespace Stockfish { diff --git a/src/tt.cpp b/src/tt.cpp index f95170e9..f808106a 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -24,7 +24,7 @@ #include #include -#include "misc.h" +#include "memory.h" #include "syzygy/tbprobe.h" #include "thread.h" @@ -75,11 +75,10 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const { // measured in megabytes. Transposition table consists // of clusters and each cluster consists of ClusterSize number of TTEntry. void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) { - aligned_large_pages_free(table); - clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); - table = static_cast(aligned_large_pages_alloc(clusterCount * sizeof(Cluster))); + table = make_unique_large_page(clusterCount); + if (!table) { std::cerr << "Failed to allocate " << mbSize << "MB for transposition table." << std::endl; diff --git a/src/tt.h b/src/tt.h index 3b09ec4e..2dcfdd44 100644 --- a/src/tt.h +++ b/src/tt.h @@ -21,7 +21,9 @@ #include #include +#include +#include "memory.h" #include "misc.h" #include "types.h" @@ -94,8 +96,6 @@ class TranspositionTable { static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; public: - ~TranspositionTable() { aligned_large_pages_free(table); } - void new_search() { // increment by delta to keep lower bits as is generation8 += GENERATION_DELTA; @@ -115,9 +115,9 @@ class TranspositionTable { private: friend struct TTEntry; - size_t clusterCount; - Cluster* table = nullptr; - uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 + size_t clusterCount; + LargePagePtr table; + uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 }; } // namespace Stockfish
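Usage sketch (not part of the patch): tt.cpp now allocates its cluster table with make_unique_large_page, and nnue/network.cpp allocates the feature transformer and the layer-stack array with make_unique_large_page / make_unique_aligned. The minimal example below shows the same pattern; it assumes it is compiled inside the Stockfish source tree so that memory.h is available, and the Bucket type and element count are purely illustrative, not part of the patch.

#include <cstddef>
#include <cstdint>

#include "memory.h"  // make_unique_aligned, make_unique_large_page, AlignedPtr, LargePagePtr

using namespace Stockfish;

// Hypothetical cache-line-aligned element type, standing in for a real payload
// such as a TT cluster or a network layer stack.
struct alignas(64) Bucket {
    std::uint64_t key   = 0;
    std::uint64_t value = 0;
};

int main() {
    // Single object backed by large pages when the OS grants them, otherwise by
    // ordinary page-aligned memory; released automatically by the deleter (RAII).
    LargePagePtr<Bucket> one = make_unique_large_page<Bucket>();
    one->key = 1;

    // Array of unknown bound, value-initialized and aligned to alignof(Bucket).
    // This mirrors how the patch holds the TT cluster table and the evaluation
    // layers as a single array instead of an array of unique pointers.
    constexpr std::size_t N    = 1024;
    AlignedPtr<Bucket[]>  many = make_unique_aligned<Bucket[]>(N);
    many[0].key = 42;  // unique_ptr<T[]>-style indexing works as usual

    return 0;  // both deleters run the destructors and free the memory
}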
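For arrays of unknown bound, the helpers have to remember the element count so the deleter can run the destructors later: memory_allocator() stores the count in a header placed max(sizeof(size_t), alignof(T)) bytes before the first element, and memory_deleter_array() walks back to that header and destroys the elements in reverse order. The self-contained sketch below illustrates the same bookkeeping with plain malloc/free; the real code in memory.h uses the aligned or large-page allocators (so over-aligned types stay correctly aligned) and skips the destructor loop for trivially destructible types.

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <new>
#include <string>

// Simplified version of the size-header scheme used by memory_allocator() and
// memory_deleter_array() in src/memory.h, shown with malloc/free instead of the
// aligned allocation functions.
template<typename T>
T* alloc_array(std::size_t n) {
    const std::size_t offset = std::max(sizeof(std::size_t), alignof(T));
    char*             raw    = static_cast<char*>(std::malloc(offset + n * sizeof(T)));
    new (raw) std::size_t(n);  // remember the element count in the header
    T* first = reinterpret_cast<T*>(raw + offset);
    for (std::size_t i = 0; i < n; ++i)
        new (first + i) T();   // value-initialize each element in place
    return first;              // caller sees a pointer to the first element
}

template<typename T>
void free_array(T* first) {
    if (!first)
        return;
    const std::size_t offset = std::max(sizeof(std::size_t), alignof(T));
    char*             raw    = reinterpret_cast<char*>(first) - offset;
    const std::size_t n      = *reinterpret_cast<std::size_t*>(raw);
    for (std::size_t i = n; i-- > 0;)
        first[i].~T();         // destroy in reverse order
    std::free(raw);            // free the original allocation, header included
}

int main() {
    std::string* words = alloc_array<std::string>(3);
    words[1] = "large pages";
    free_array(words);
    return 0;
}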