mirror of
https://github.com/sockspls/badfish
synced 2025-04-29 16:23:09 +00:00
NUMA: Fix concurrency counting for windows systems
If there is more than 1 processor group, std:🧵:hardware_concurrency should not be used. fixes #5307 closes https://github.com/official-stockfish/Stockfish/pull/5311 No functional change
This commit is contained in:
parent
02eae52833
commit
596fb4842b
2 changed files with 41 additions and 25 deletions
41
src/misc.cpp
41
src/misc.cpp
|
@ -34,16 +34,10 @@
|
||||||
// the calls at compile time), try to load them at runtime. To do this we need
|
// the calls at compile time), try to load them at runtime. To do this we need
|
||||||
// first to define the corresponding function pointers.
|
// first to define the corresponding function pointers.
|
||||||
extern "C" {
|
extern "C" {
|
||||||
using fun1_t = bool (*)(LOGICAL_PROCESSOR_RELATIONSHIP,
|
using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE);
|
||||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX,
|
using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
|
||||||
PDWORD);
|
using AdjustTokenPrivileges_t =
|
||||||
using fun2_t = bool (*)(USHORT, PGROUP_AFFINITY);
|
bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
|
||||||
using fun3_t = bool (*)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
|
|
||||||
using fun4_t = bool (*)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
|
|
||||||
using fun5_t = WORD (*)();
|
|
||||||
using fun6_t = bool (*)(HANDLE, DWORD, PHANDLE);
|
|
||||||
using fun7_t = bool (*)(LPCSTR, LPCSTR, PLUID);
|
|
||||||
using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -488,23 +482,25 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
|
||||||
if (!hAdvapi32)
|
if (!hAdvapi32)
|
||||||
hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
|
hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
|
||||||
|
|
||||||
auto fun6 = fun6_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
|
auto OpenProcessToken_f =
|
||||||
if (!fun6)
|
OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
|
||||||
|
if (!OpenProcessToken_f)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
auto fun7 = fun7_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
|
auto LookupPrivilegeValueA_f =
|
||||||
if (!fun7)
|
LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
|
||||||
|
if (!LookupPrivilegeValueA_f)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
auto fun8 = fun8_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
|
auto AdjustTokenPrivileges_f =
|
||||||
if (!fun8)
|
AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
|
||||||
|
if (!AdjustTokenPrivileges_f)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
// We need SeLockMemoryPrivilege, so try to enable it for the process
|
// We need SeLockMemoryPrivilege, so try to enable it for the process
|
||||||
if (!fun6( // OpenProcessToken()
|
if (!OpenProcessToken_f( // OpenProcessToken()
|
||||||
GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
|
GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
if (fun7( // LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, &luid)
|
if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
|
||||||
nullptr, "SeLockMemoryPrivilege", &luid))
|
|
||||||
{
|
{
|
||||||
TOKEN_PRIVILEGES tp{};
|
TOKEN_PRIVILEGES tp{};
|
||||||
TOKEN_PRIVILEGES prevTp{};
|
TOKEN_PRIVILEGES prevTp{};
|
||||||
|
@ -516,8 +512,8 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
|
||||||
|
|
||||||
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
|
// Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
|
||||||
// we still need to query GetLastError() to ensure that the privileges were actually obtained.
|
// we still need to query GetLastError() to ensure that the privileges were actually obtained.
|
||||||
if (fun8( // AdjustTokenPrivileges()
|
if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
|
||||||
hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen)
|
&prevTpLen)
|
||||||
&& GetLastError() == ERROR_SUCCESS)
|
&& GetLastError() == ERROR_SUCCESS)
|
||||||
{
|
{
|
||||||
// Round up size to full pages and allocate
|
// Round up size to full pages and allocate
|
||||||
|
@ -526,8 +522,7 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
|
||||||
PAGE_READWRITE);
|
PAGE_READWRITE);
|
||||||
|
|
||||||
// Privilege no longer needed, restore previous state
|
// Privilege no longer needed, restore previous state
|
||||||
fun8( // AdjustTokenPrivileges ()
|
AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
|
||||||
hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
25
src/numa.h
25
src/numa.h
|
@ -61,6 +61,7 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT)
|
||||||
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
|
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
|
||||||
using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
|
using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
|
||||||
|
|
||||||
|
using GetActiveProcessorCount_t = DWORD (*)(WORD);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
|
@ -70,8 +71,28 @@ namespace Stockfish {
|
||||||
using CpuIndex = size_t;
|
using CpuIndex = size_t;
|
||||||
using NumaIndex = size_t;
|
using NumaIndex = size_t;
|
||||||
|
|
||||||
inline const CpuIndex SYSTEM_THREADS_NB =
|
inline CpuIndex get_hardware_concurrency() {
|
||||||
std::max<CpuIndex>(1, std::thread::hardware_concurrency());
|
CpuIndex concurrency = std::thread::hardware_concurrency();
|
||||||
|
|
||||||
|
// Get all processors across all processor groups on windows, since ::hardware_concurrency
|
||||||
|
// only returns the number of processors in the first group, because only these
|
||||||
|
// are available to std::thread.
|
||||||
|
#ifdef _WIN64
|
||||||
|
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
|
||||||
|
auto GetActiveProcessorCount_f =
|
||||||
|
GetActiveProcessorCount_t((void (*)()) GetProcAddress(k32, "GetActiveProcessorCount"));
|
||||||
|
|
||||||
|
if (GetActiveProcessorCount_f != nullptr)
|
||||||
|
{
|
||||||
|
concurrency = GetActiveProcessorCount_f(ALL_PROCESSOR_GROUPS);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return concurrency;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());
|
||||||
|
|
||||||
|
|
||||||
// We want to abstract the purpose of storing the numa node index somewhat.
|
// We want to abstract the purpose of storing the numa node index somewhat.
|
||||||
// Whoever is using this does not need to know the specifics of the replication
|
// Whoever is using this does not need to know the specifics of the replication
|
||||||
|
|
Loading…
Add table
Reference in a new issue