1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-29 16:23:09 +00:00

Add helpers for managing aligned memory

Previously, we had two type aliases, LargePagePtr and AlignedPtr, which
required manually initializing the aligned memory for the pointer.

The new helpers:

- make_unique_aligned
- make_unique_large_page

are now available for allocating aligned memory (with large pages). They
behave similarly to std::make_unique, ensuring objects allocated with
these functions follow RAII.

The old approach had issues with initializing non-trivial types or
arrays of objects. The evaluation function of the network is now a
unique pointer to an array instead of an array of unique pointers.

Memory related functions have been moved into memory.h

Passed High Hash Pressure Test Non-Regression STC:
https://tests.stockfishchess.org/tests/view/665b2b36586058766677cfd2
LLR: 2.93 (-2.94,2.94) <-1.75,0.25>
Total: 476992 W: 122426 L: 122677 D: 231889
Ptnml(0-2): 1145, 51027, 134419, 50744, 1161

Failed Normal Non-Regression STC:
https://tests.stockfishchess.org/tests/view/665b2997586058766677cfc8
LLR: -2.94 (-2.94,2.94) <-1.75,0.25>
Total: 877312 W: 225233 L: 226395 D: 425684
Ptnml(0-2): 2110, 94642, 246239, 93630, 2035

Probably a fluke since there shouldn't be a real slowndown and it has also
passed the high hash pressure test.

closes https://github.com/official-stockfish/Stockfish/pull/5332

No functional change
This commit is contained in:
Disservin 2024-05-31 10:53:10 +02:00
parent a2a7edf4c8
commit 00a28ae325
11 changed files with 492 additions and 312 deletions

View file

@ -55,7 +55,7 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench
SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
misc.cpp movegen.cpp movepick.cpp position.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp
HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
@ -63,7 +63,7 @@ HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h
OBJS = $(notdir $(SRCS:.cpp=.o))
@ -489,8 +489,8 @@ ifeq ($(COMP),clang)
endif
ifeq ($(KERNEL),Darwin)
CXXFLAGS += -mmacosx-version-min=10.14
LDFLAGS += -mmacosx-version-min=10.14
CXXFLAGS += -mmacosx-version-min=10.15
LDFLAGS += -mmacosx-version-min=10.15
ifneq ($(arch),any)
CXXFLAGS += -arch $(arch)
LDFLAGS += -arch $(arch)

229
src/memory.cpp Normal file
View file

@ -0,0 +1,229 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "memory.h"
#include <cstdlib>
#if __has_include("features.h")
#include <features.h>
#endif
#if defined(__linux__) && !defined(__ANDROID__)
#include <sys/mman.h>
#endif
#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \
|| (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \
|| defined(__e2k__)
#define POSIXALIGNEDALLOC
#include <stdlib.h>
#endif
#ifdef _WIN32
#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 // Force to include needed API prototypes
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <ios> // std::hex, std::dec
#include <iostream> // std::cerr
#include <ostream> // std::endl
#include <windows.h>
// The needed Windows API for processor groups could be missed from old Windows
// versions, so instead of calling them directly (forcing the linker to resolve
// the calls at compile time), try to load them at runtime. To do this we need
// first to define the corresponding function pointers.
extern "C" {
using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE);
using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
using AdjustTokenPrivileges_t =
bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
}
#endif
namespace Stockfish {
// Wrapper for systems where the c++17 implementation
// does not guarantee the availability of aligned_alloc(). Memory allocated with
// std_aligned_alloc() must be freed with std_aligned_free().
void* std_aligned_alloc(size_t alignment, size_t size) {
// Apple requires 10.15, which is enforced in the makefile
#if defined(_ISOC11_SOURCE) || defined(__APPLE__)
return aligned_alloc(alignment, size);
#elif defined(POSIXALIGNEDALLOC)
void* mem;
return posix_memalign(&mem, alignment, size) ? nullptr : mem;
#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64)
return _mm_malloc(size, alignment);
#elif defined(_WIN32)
return _aligned_malloc(size, alignment);
#else
return std::aligned_alloc(alignment, size);
#endif
}
void std_aligned_free(void* ptr) {
#if defined(POSIXALIGNEDALLOC)
free(ptr);
#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64)
_mm_free(ptr);
#elif defined(_WIN32)
_aligned_free(ptr);
#else
free(ptr);
#endif
}
// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
#if defined(_WIN32)
static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) {
#if !defined(_WIN64)
return nullptr;
#else
HANDLE hProcessToken{};
LUID luid{};
void* mem = nullptr;
const size_t largePageSize = GetLargePageMinimum();
if (!largePageSize)
return nullptr;
// Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges
HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll"));
if (!hAdvapi32)
hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
auto OpenProcessToken_f =
OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
if (!OpenProcessToken_f)
return nullptr;
auto LookupPrivilegeValueA_f =
LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
if (!LookupPrivilegeValueA_f)
return nullptr;
auto AdjustTokenPrivileges_f =
AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
if (!AdjustTokenPrivileges_f)
return nullptr;
// We need SeLockMemoryPrivilege, so try to enable it for the process
if (!OpenProcessToken_f( // OpenProcessToken()
GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
return nullptr;
if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
{
TOKEN_PRIVILEGES tp{};
TOKEN_PRIVILEGES prevTp{};
DWORD prevTpLen = 0;
tp.PrivilegeCount = 1;
tp.Privileges[0].Luid = luid;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
// we still need to query GetLastError() to ensure that the privileges were actually obtained.
if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
&prevTpLen)
&& GetLastError() == ERROR_SUCCESS)
{
// Round up size to full pages and allocate
allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
PAGE_READWRITE);
// Privilege no longer needed, restore previous state
AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
}
}
CloseHandle(hProcessToken);
return mem;
#endif
}
void* aligned_large_pages_alloc(size_t allocSize) {
// Try to allocate large pages
void* mem = aligned_large_pages_alloc_windows(allocSize);
// Fall back to regular, page-aligned, allocation if necessary
if (!mem)
mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
return mem;
}
#else
void* aligned_large_pages_alloc(size_t allocSize) {
#if defined(__linux__)
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
#else
constexpr size_t alignment = 4096; // assumed small page size
#endif
// Round up to multiples of alignment
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
void* mem = std_aligned_alloc(alignment, size);
#if defined(MADV_HUGEPAGE)
madvise(mem, size, MADV_HUGEPAGE);
#endif
return mem;
}
#endif
// aligned_large_pages_free() will free the previously allocated ttmem
#if defined(_WIN32)
void aligned_large_pages_free(void* mem) {
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
{
DWORD err = GetLastError();
std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err
<< std::dec << std::endl;
exit(EXIT_FAILURE);
}
}
#else
void aligned_large_pages_free(void* mem) { std_aligned_free(mem); }
#endif
} // namespace Stockfish

215
src/memory.h Normal file
View file

@ -0,0 +1,215 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef MEMORY_H_INCLUDED
#define MEMORY_H_INCLUDED
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>
#include "types.h"
namespace Stockfish {
void* std_aligned_alloc(size_t alignment, size_t size);
void std_aligned_free(void* ptr);
// memory aligned by page size, min alignment: 4096 bytes
void* aligned_large_pages_alloc(size_t size);
// nop if mem == nullptr
void aligned_large_pages_free(void* mem);
// frees memory which was placed there with placement new.
// works for both single objects and arrays of unknown bound
template<typename T, typename FREE_FUNC>
void memory_deleter(T* ptr, FREE_FUNC free_func) {
if (!ptr)
return;
// Explicitly needed to call the destructor
if constexpr (!std::is_trivially_destructible_v<T>)
ptr->~T();
free_func(ptr);
return;
}
// frees memory which was placed there with placement new.
// works for both single objects and arrays of unknown bound
template<typename T, typename FREE_FUNC>
void memory_deleter_array(T* ptr, FREE_FUNC free_func) {
if (!ptr)
return;
// Move back on the pointer to where the size is allocated.
const size_t array_offset = std::max(sizeof(size_t), alignof(T));
char* raw_memory = reinterpret_cast<char*>(ptr) - array_offset;
if constexpr (!std::is_trivially_destructible_v<T>)
{
const size_t size = *reinterpret_cast<size_t*>(raw_memory);
// Explicitly call the destructor for each element in reverse order
for (size_t i = size; i-- > 0;)
ptr[i].~T();
}
free_func(raw_memory);
}
// Allocates memory for a single object and places it there with placement new.
template<typename T, typename ALLOC_FUNC, typename... Args>
inline std::enable_if_t<!std::is_array_v<T>, T*> memory_allocator(ALLOC_FUNC alloc_func,
Args&&... args) {
void* raw_memory = alloc_func(sizeof(T));
ASSERT_ALIGNED(raw_memory, alignof(T));
return new (raw_memory) T(std::forward<Args>(args)...);
}
// Allocates memory for an array of unknown bound and places it there with placement new.
template<typename T, typename ALLOC_FUNC>
inline std::enable_if_t<std::is_array_v<T>, std::remove_extent_t<T>*>
memory_allocator(ALLOC_FUNC alloc_func, size_t num) {
using ElementType = std::remove_extent_t<T>;
const size_t array_offset = std::max(sizeof(size_t), alignof(ElementType));
// save the array size in the memory location
char* raw_memory =
reinterpret_cast<char*>(alloc_func(array_offset + num * sizeof(ElementType)));
ASSERT_ALIGNED(raw_memory, alignof(T));
new (raw_memory) size_t(num);
for (size_t i = 0; i < num; ++i)
new (raw_memory + array_offset + i * sizeof(ElementType)) ElementType();
// Need to return the pointer at the start of the array so that the indexing in unique_ptr<T[]> works
return reinterpret_cast<ElementType*>(raw_memory + array_offset);
}
//
//
// aligned large page unique ptr
//
//
template<typename T>
struct LargePageDeleter {
void operator()(T* ptr) const { return memory_deleter<T>(ptr, aligned_large_pages_free); }
};
template<typename T>
struct LargePageArrayDeleter {
void operator()(T* ptr) const { return memory_deleter_array<T>(ptr, aligned_large_pages_free); }
};
template<typename T>
using LargePagePtr =
std::conditional_t<std::is_array_v<T>,
std::unique_ptr<T, LargePageArrayDeleter<std::remove_extent_t<T>>>,
std::unique_ptr<T, LargePageDeleter<T>>>;
// make_unique_large_page for single objects
template<typename T, typename... Args>
std::enable_if_t<!std::is_array_v<T>, LargePagePtr<T>> make_unique_large_page(Args&&... args) {
static_assert(alignof(T) <= 4096,
"aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
T* obj = memory_allocator<T>(aligned_large_pages_alloc, std::forward<Args>(args)...);
return LargePagePtr<T>(obj);
}
// make_unique_large_page for arrays of unknown bound
template<typename T>
std::enable_if_t<std::is_array_v<T>, LargePagePtr<T>> make_unique_large_page(size_t num) {
using ElementType = std::remove_extent_t<T>;
static_assert(alignof(ElementType) <= 4096,
"aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
ElementType* memory = memory_allocator<T>(aligned_large_pages_alloc, num);
return LargePagePtr<T>(memory);
}
//
//
// aligned unique ptr
//
//
template<typename T>
struct AlignedDeleter {
void operator()(T* ptr) const { return memory_deleter<T>(ptr, std_aligned_free); }
};
template<typename T>
struct AlignedArrayDeleter {
void operator()(T* ptr) const { return memory_deleter_array<T>(ptr, std_aligned_free); }
};
template<typename T>
using AlignedPtr =
std::conditional_t<std::is_array_v<T>,
std::unique_ptr<T, AlignedArrayDeleter<std::remove_extent_t<T>>>,
std::unique_ptr<T, AlignedDeleter<T>>>;
// make_unique_aligned for single objects
template<typename T, typename... Args>
std::enable_if_t<!std::is_array_v<T>, AlignedPtr<T>> make_unique_aligned(Args&&... args) {
const auto func = [](size_t size) { return std_aligned_alloc(alignof(T), size); };
T* obj = memory_allocator<T>(func, std::forward<Args>(args)...);
return AlignedPtr<T>(obj);
}
// make_unique_aligned for arrays of unknown bound
template<typename T>
std::enable_if_t<std::is_array_v<T>, AlignedPtr<T>> make_unique_aligned(size_t num) {
using ElementType = std::remove_extent_t<T>;
const auto func = [](size_t size) { return std_aligned_alloc(alignof(ElementType), size); };
ElementType* memory = memory_allocator<T>(func, num);
return AlignedPtr<T>(memory);
}
// Get the first aligned element of an array.
// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes,
// where N is the number of elements in the array.
template<uintptr_t Alignment, typename T>
T* align_ptr_up(T* ptr) {
static_assert(alignof(T) < Alignment);
const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
return reinterpret_cast<T*>(
reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
}
} // namespace Stockfish
#endif // #ifndef MEMORY_H_INCLUDED

View file

@ -18,29 +18,6 @@
#include "misc.h"
#ifdef _WIN32
#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 // Force to include needed API prototypes
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
// The needed Windows API for processor groups could be missed from old Windows
// versions, so instead of calling them directly (forcing the linker to resolve
// the calls at compile time), try to load them at runtime. To do this we need
// first to define the corresponding function pointers.
extern "C" {
using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE);
using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
using AdjustTokenPrivileges_t =
bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
}
#endif
#include <atomic>
#include <cctype>
#include <cmath>
@ -48,25 +25,14 @@ using AdjustTokenPrivileges_t =
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <iterator>
#include <limits>
#include <mutex>
#include <sstream>
#include <string_view>
#include "types.h"
#if defined(__linux__) && !defined(__ANDROID__)
#include <sys/mman.h>
#endif
#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \
|| (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \
|| defined(__e2k__)
#define POSIXALIGNEDALLOC
#include <stdlib.h>
#endif
namespace Stockfish {
namespace {
@ -427,169 +393,6 @@ void prefetch(const void* addr) {
#endif
// Wrapper for systems where the c++17 implementation
// does not guarantee the availability of aligned_alloc(). Memory allocated with
// std_aligned_alloc() must be freed with std_aligned_free().
void* std_aligned_alloc(size_t alignment, size_t size) {
#if defined(POSIXALIGNEDALLOC)
void* mem;
return posix_memalign(&mem, alignment, size) ? nullptr : mem;
#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64)
return _mm_malloc(size, alignment);
#elif defined(_WIN32)
return _aligned_malloc(size, alignment);
#else
return std::aligned_alloc(alignment, size);
#endif
}
void std_aligned_free(void* ptr) {
#if defined(POSIXALIGNEDALLOC)
free(ptr);
#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64)
_mm_free(ptr);
#elif defined(_WIN32)
_aligned_free(ptr);
#else
free(ptr);
#endif
}
// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
#if defined(_WIN32)
static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) {
#if !defined(_WIN64)
return nullptr;
#else
HANDLE hProcessToken{};
LUID luid{};
void* mem = nullptr;
const size_t largePageSize = GetLargePageMinimum();
if (!largePageSize)
return nullptr;
// Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges
HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll"));
if (!hAdvapi32)
hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
auto OpenProcessToken_f =
OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
if (!OpenProcessToken_f)
return nullptr;
auto LookupPrivilegeValueA_f =
LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
if (!LookupPrivilegeValueA_f)
return nullptr;
auto AdjustTokenPrivileges_f =
AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
if (!AdjustTokenPrivileges_f)
return nullptr;
// We need SeLockMemoryPrivilege, so try to enable it for the process
if (!OpenProcessToken_f( // OpenProcessToken()
GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
return nullptr;
if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
{
TOKEN_PRIVILEGES tp{};
TOKEN_PRIVILEGES prevTp{};
DWORD prevTpLen = 0;
tp.PrivilegeCount = 1;
tp.Privileges[0].Luid = luid;
tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
// we still need to query GetLastError() to ensure that the privileges were actually obtained.
if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
&prevTpLen)
&& GetLastError() == ERROR_SUCCESS)
{
// Round up size to full pages and allocate
allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
PAGE_READWRITE);
// Privilege no longer needed, restore previous state
AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
}
}
CloseHandle(hProcessToken);
return mem;
#endif
}
void* aligned_large_pages_alloc(size_t allocSize) {
// Try to allocate large pages
void* mem = aligned_large_pages_alloc_windows(allocSize);
// Fall back to regular, page-aligned, allocation if necessary
if (!mem)
mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
return mem;
}
#else
void* aligned_large_pages_alloc(size_t allocSize) {
#if defined(__linux__)
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
#else
constexpr size_t alignment = 4096; // assumed small page size
#endif
// Round up to multiples of alignment
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
void* mem = std_aligned_alloc(alignment, size);
#if defined(MADV_HUGEPAGE)
madvise(mem, size, MADV_HUGEPAGE);
#endif
return mem;
}
#endif
// aligned_large_pages_free() will free the previously allocated ttmem
#if defined(_WIN32)
void aligned_large_pages_free(void* mem) {
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
{
DWORD err = GetLastError();
std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err
<< std::dec << std::endl;
exit(EXIT_FAILURE);
}
}
#else
void aligned_large_pages_free(void* mem) { std_aligned_free(mem); }
#endif
#ifdef _WIN32
#include <direct.h>
#define GETCWD _getcwd

View file

@ -26,10 +26,9 @@
#include <cstdint>
#include <cstdio>
#include <iosfwd>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include <optional>
#define stringify2(x) #x
#define stringify(x) stringify2(x)
@ -44,39 +43,10 @@ std::string compiler_info();
// which can be quite slow.
void prefetch(const void* addr);
void start_logger(const std::string& fname);
void* std_aligned_alloc(size_t alignment, size_t size);
void std_aligned_free(void* ptr);
// memory aligned by page size, min alignment: 4096 bytes
void* aligned_large_pages_alloc(size_t size);
// nop if mem == nullptr
void aligned_large_pages_free(void* mem);
void start_logger(const std::string& fname);
size_t str_to_size_t(const std::string& s);
// Deleter for automating release of memory area
template<typename T>
struct AlignedDeleter {
void operator()(T* ptr) const {
ptr->~T();
std_aligned_free(ptr);
}
};
template<typename T>
struct LargePageDeleter {
void operator()(T* ptr) const {
ptr->~T();
aligned_large_pages_free(ptr);
}
};
template<typename T>
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
template<typename T>
using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
#if defined(__linux__)
struct PipeDeleter {
@ -141,20 +111,6 @@ std::ostream& operator<<(std::ostream&, SyncCout);
#define sync_cout std::cout << IO_LOCK
#define sync_endl std::endl << IO_UNLOCK
// Get the first aligned element of an array.
// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes,
// where N is the number of elements in the array.
template<uintptr_t Alignment, typename T>
T* align_ptr_up(T* ptr) {
static_assert(alignof(T) < Alignment);
const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
return reinterpret_cast<T*>(
reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
}
// True if and only if the binary is compiled on a little-endian machine
static inline const union {
uint32_t i;

View file

@ -20,7 +20,6 @@
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
@ -30,6 +29,7 @@
#include "../evaluate.h"
#include "../incbin/incbin.h"
#include "../memory.h"
#include "../misc.h"
#include "../position.h"
#include "../types.h"
@ -86,23 +86,6 @@ namespace Stockfish::Eval::NNUE {
namespace Detail {
// Initialize the evaluation function parameters
template<typename T>
void initialize(AlignedPtr<T>& pointer) {
pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
std::memset(pointer.get(), 0, sizeof(T));
}
template<typename T>
void initialize(LargePagePtr<T>& pointer) {
static_assert(alignof(T) <= 4096,
"aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
std::memset(pointer.get(), 0, sizeof(T));
}
// Read evaluation function parameters
template<typename T>
bool read_parameters(std::istream& stream, T& reference) {
@ -128,19 +111,17 @@ template<typename Arch, typename Transformer>
Network<Arch, Transformer>::Network(const Network<Arch, Transformer>& other) :
evalFile(other.evalFile),
embeddedType(other.embeddedType) {
if (other.featureTransformer)
{
Detail::initialize(featureTransformer);
*featureTransformer = *other.featureTransformer;
}
featureTransformer = make_unique_large_page<Transformer>(*other.featureTransformer);
network = make_unique_aligned<Arch[]>(LayerStacks);
if (!other.network)
return;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (other.network[i])
{
Detail::initialize(network[i]);
*(network[i]) = *(other.network[i]);
}
}
network[i] = other.network[i];
}
template<typename Arch, typename Transformer>
@ -150,18 +131,15 @@ Network<Arch, Transformer>::operator=(const Network<Arch, Transformer>& other) {
embeddedType = other.embeddedType;
if (other.featureTransformer)
{
Detail::initialize(featureTransformer);
*featureTransformer = *other.featureTransformer;
}
featureTransformer = make_unique_large_page<Transformer>(*other.featureTransformer);
network = make_unique_aligned<Arch[]>(LayerStacks);
if (!other.network)
return *this;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (other.network[i])
{
Detail::initialize(network[i]);
*(network[i]) = *(other.network[i]);
}
}
network[i] = other.network[i];
return *this;
}
@ -253,7 +231,7 @@ Value Network<Arch, Transformer>::evaluate(const Position&
const int bucket = (pos.count<ALL_PIECES>() - 1) / 4;
const auto psqt = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
const auto positional = network[bucket]->propagate(transformedFeatures);
const auto positional = network[bucket].propagate(transformedFeatures);
if (complexity)
*complexity = std::abs(psqt - positional) / OutputScale;
@ -292,11 +270,11 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
exit(EXIT_FAILURE);
}
size_t size = sizeof(*featureTransformer) + sizeof(*network) * LayerStacks;
size_t size = sizeof(*featureTransformer) + sizeof(Arch) * LayerStacks;
sync_cout << "info string NNUE evaluation using " << evalfilePath << " ("
<< size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", "
<< network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS
<< ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl;
<< network[0].TransformedFeatureDimensions << ", " << network[0].FC_0_OUTPUTS << ", "
<< network[0].FC_1_OUTPUTS << ", 1))" << sync_endl;
}
@ -333,7 +311,7 @@ Network<Arch, Transformer>::trace_evaluate(const Position&
{
const auto materialist =
featureTransformer->transform(pos, cache, transformedFeatures, bucket);
const auto positional = network[bucket]->propagate(transformedFeatures);
const auto positional = network[bucket].propagate(transformedFeatures);
t.psqt[bucket] = static_cast<Value>(materialist / OutputScale);
t.positional[bucket] = static_cast<Value>(positional / OutputScale);
@ -386,9 +364,8 @@ void Network<Arch, Transformer>::load_internal() {
template<typename Arch, typename Transformer>
void Network<Arch, Transformer>::initialize() {
Detail::initialize(featureTransformer);
for (std::size_t i = 0; i < LayerStacks; ++i)
Detail::initialize(network[i]);
featureTransformer = make_unique_large_page<Transformer>();
network = make_unique_aligned<Arch[]>(LayerStacks);
}
@ -455,7 +432,7 @@ bool Network<Arch, Transformer>::read_parameters(std::istream& stream,
return false;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (!Detail::read_parameters(stream, *(network[i])))
if (!Detail::read_parameters(stream, network[i]))
return false;
}
return stream && stream.peek() == std::ios::traits_type::eof();
@ -471,7 +448,7 @@ bool Network<Arch, Transformer>::write_parameters(std::ostream& stream,
return false;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (!Detail::write_parameters(stream, *(network[i])))
if (!Detail::write_parameters(stream, network[i]))
return false;
}
return bool(stream);

View file

@ -25,13 +25,13 @@
#include <string>
#include <utility>
#include "../misc.h"
#include "../memory.h"
#include "../position.h"
#include "../types.h"
#include "nnue_accumulator.h"
#include "nnue_architecture.h"
#include "nnue_feature_transformer.h"
#include "nnue_misc.h"
#include "nnue_accumulator.h"
namespace Stockfish::Eval::NNUE {
@ -91,7 +91,7 @@ class Network {
LargePagePtr<Transformer> featureTransformer;
// Evaluation function
AlignedPtr<Arch> network[LayerStacks];
AlignedPtr<Arch[]> network;
EvalFile evalFile;
EmbeddedNNUEType embeddedType;

View file

@ -32,6 +32,7 @@
#include <thread>
#include <utility>
#include <vector>
#include <cstring>
// We support linux very well, but we explicitly do NOT support Android, because there's
// no affected systems, not worth maintaining.

View file

@ -23,15 +23,15 @@
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>
#include <functional>
#include "numa.h"
#include "position.h"
#include "search.h"
#include "thread_win32_osx.h"
#include "numa.h"
namespace Stockfish {

View file

@ -24,7 +24,7 @@
#include <cstring>
#include <iostream>
#include "misc.h"
#include "memory.h"
#include "syzygy/tbprobe.h"
#include "thread.h"
@ -75,11 +75,10 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const {
// measured in megabytes. Transposition table consists
// of clusters and each cluster consists of ClusterSize number of TTEntry.
void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) {
aligned_large_pages_free(table);
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
table = make_unique_large_page<Cluster[]>(clusterCount);
if (!table)
{
std::cerr << "Failed to allocate " << mbSize << "MB for transposition table." << std::endl;

View file

@ -21,7 +21,9 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include "memory.h"
#include "misc.h"
#include "types.h"
@ -94,8 +96,6 @@ class TranspositionTable {
static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF;
public:
~TranspositionTable() { aligned_large_pages_free(table); }
void new_search() {
// increment by delta to keep lower bits as is
generation8 += GENERATION_DELTA;
@ -115,9 +115,9 @@ class TranspositionTable {
private:
friend struct TTEntry;
size_t clusterCount;
Cluster* table = nullptr;
uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8
size_t clusterCount;
LargePagePtr<Cluster[]> table;
uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8
};
} // namespace Stockfish