1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-29 16:23:09 +00:00

NUMA: Fix concurrency counting for windows systems

If there is more than 1 processor group, std:🧵:hardware_concurrency should not be used.

fixes #5307

closes https://github.com/official-stockfish/Stockfish/pull/5311

No functional change
This commit is contained in:
Disservin 2024-05-30 19:55:59 +02:00
parent 02eae52833
commit 596fb4842b
2 changed files with 41 additions and 25 deletions

View file

@ -34,16 +34,10 @@
// the calls at compile time), try to load them at runtime. To do this we need
// first to define the corresponding function pointers.
extern "C" {
using fun1_t = bool (*)(LOGICAL_PROCESSOR_RELATIONSHIP,
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX,
PDWORD);
using fun2_t = bool (*)(USHORT, PGROUP_AFFINITY);
using fun3_t = bool (*)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
using fun4_t = bool (*)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
using fun5_t = WORD (*)();
using fun6_t = bool (*)(HANDLE, DWORD, PHANDLE);
using fun7_t = bool (*)(LPCSTR, LPCSTR, PLUID);
using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE);
using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
using AdjustTokenPrivileges_t =
bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
}
#endif
@ -488,23 +482,25 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
if (!hAdvapi32)
hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
auto fun6 = fun6_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
if (!fun6)
auto OpenProcessToken_f =
OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
if (!OpenProcessToken_f)
return nullptr;
auto fun7 = fun7_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
if (!fun7)
auto LookupPrivilegeValueA_f =
LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
if (!LookupPrivilegeValueA_f)
return nullptr;
auto fun8 = fun8_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
if (!fun8)
auto AdjustTokenPrivileges_f =
AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
if (!AdjustTokenPrivileges_f)
return nullptr;
// We need SeLockMemoryPrivilege, so try to enable it for the process
if (!fun6( // OpenProcessToken()
if (!OpenProcessToken_f( // OpenProcessToken()
GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
return nullptr;
if (fun7( // LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, &luid)
nullptr, "SeLockMemoryPrivilege", &luid))
if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
{
TOKEN_PRIVILEGES tp{};
TOKEN_PRIVILEGES prevTp{};
@ -516,8 +512,8 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
// we still need to query GetLastError() to ensure that the privileges were actually obtained.
if (fun8( // AdjustTokenPrivileges()
hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen)
if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
&prevTpLen)
&& GetLastError() == ERROR_SUCCESS)
{
// Round up size to full pages and allocate
@ -526,8 +522,7 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
PAGE_READWRITE);
// Privilege no longer needed, restore previous state
fun8( // AdjustTokenPrivileges ()
hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
}
}

View file

@ -61,6 +61,7 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT)
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
using GetActiveProcessorCount_t = DWORD (*)(WORD);
#endif
#include "misc.h"
@ -70,8 +71,28 @@ namespace Stockfish {
using CpuIndex = size_t;
using NumaIndex = size_t;
inline const CpuIndex SYSTEM_THREADS_NB =
std::max<CpuIndex>(1, std::thread::hardware_concurrency());
inline CpuIndex get_hardware_concurrency() {
CpuIndex concurrency = std::thread::hardware_concurrency();
// Get all processors across all processor groups on windows, since ::hardware_concurrency
// only returns the number of processors in the first group, because only these
// are available to std::thread.
#ifdef _WIN64
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
auto GetActiveProcessorCount_f =
GetActiveProcessorCount_t((void (*)()) GetProcAddress(k32, "GetActiveProcessorCount"));
if (GetActiveProcessorCount_f != nullptr)
{
concurrency = GetActiveProcessorCount_f(ALL_PROCESSOR_GROUPS);
}
#endif
return concurrency;
}
inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());
// We want to abstract the purpose of storing the numa node index somewhat.
// Whoever is using this does not need to know the specifics of the replication