NUMA: Fix concurrency counting for windows systems

If there is more than 1 processor group, std::thread::hardware_concurrency should not be used. Fixes #5307. Closes https://github.com/official-stockfish/Stockfish/pull/5311 (no functional change).
2025-04-29 16:23:09 +00:00 · 2024-05-30 19:55:59 +02:00 · 2024-05-30 19:55:59 +02:00 · 596fb4842b
commit 596fb4842b
parent 02eae52833
2 changed files with 41 additions and 25 deletions
--- a/src/misc.cpp
+++ b/src/misc.cpp
@ -34,16 +34,10 @@
 // the calls at compile time), try to load them at runtime. To do this we need
 // first to define the corresponding function pointers.
 extern "C" {
-using fun1_t = bool (*)(LOGICAL_PROCESSOR_RELATIONSHIP,
+using OpenProcessToken_t      = bool (*)(HANDLE, DWORD, PHANDLE);
-                        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX,
+using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
-                        PDWORD);
+using AdjustTokenPrivileges_t =
-using fun2_t = bool (*)(USHORT, PGROUP_AFFINITY);
+  bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
 using fun3_t = bool (*)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 using fun4_t = bool (*)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
 using fun5_t = WORD (*)();
 using fun6_t = bool (*)(HANDLE, DWORD, PHANDLE);
 using fun7_t = bool (*)(LPCSTR, LPCSTR, PLUID);
 using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
 }
 #endif
@ -488,23 +482,25 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
    if (!hAdvapi32)
        hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
-    auto fun6 = fun6_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
+    auto OpenProcessToken_f =
-    if (!fun6)
+      OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
    if (!OpenProcessToken_f)
        return nullptr;
-    auto fun7 = fun7_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
+    auto LookupPrivilegeValueA_f =
-    if (!fun7)
+      LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
    if (!LookupPrivilegeValueA_f)
        return nullptr;
-    auto fun8 = fun8_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
+    auto AdjustTokenPrivileges_f =
-    if (!fun8)
+      AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
    if (!AdjustTokenPrivileges_f)
        return nullptr;
    // We need SeLockMemoryPrivilege, so try to enable it for the process
-    if (!fun6(  // OpenProcessToken()
+    if (!OpenProcessToken_f(  // OpenProcessToken()
          GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
        return nullptr;
-    if (fun7(  // LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, &luid)
+    if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
          nullptr, "SeLockMemoryPrivilege", &luid))
    {
        TOKEN_PRIVILEGES tp{};
        TOKEN_PRIVILEGES prevTp{};
@ -516,8 +512,8 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
        // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
        // we still need to query GetLastError() to ensure that the privileges were actually obtained.
-        if (fun8(  // AdjustTokenPrivileges()
+        if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
-              hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen)
+                                    &prevTpLen)
            && GetLastError() == ERROR_SUCCESS)
        {
            // Round up size to full pages and allocate
@ -526,8 +522,7 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize
                                     PAGE_READWRITE);
            // Privilege no longer needed, restore previous state
-            fun8(  // AdjustTokenPrivileges ()
+            AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
              hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
        }
    }
--- a/src/numa.h
+++ b/src/numa.h
@ -61,6 +61,7 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT)
 // https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
 using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
 using GetActiveProcessorCount_t = DWORD (*)(WORD);
 #endif
 #include "misc.h"
@ -70,8 +71,28 @@ namespace Stockfish {
 using CpuIndex  = size_t;
 using NumaIndex = size_t;
-inline const CpuIndex SYSTEM_THREADS_NB =
+inline CpuIndex get_hardware_concurrency() {
-  std::max<CpuIndex>(1, std::thread::hardware_concurrency());
+    CpuIndex concurrency = std::thread::hardware_concurrency();
    // On Windows, std::thread::hardware_concurrency only returns the number of
    // processors in the first processor group, because only these are available
    // to std::thread. Query the count across all processor groups instead.
 #ifdef _WIN64
    // GetActiveProcessorCount is resolved at runtime through GetProcAddress
    // rather than being called directly.
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto    GetActiveProcessorCount_f =
      GetActiveProcessorCount_t((void (*)()) GetProcAddress(k32, "GetActiveProcessorCount"));
    // If the lookup failed, keep the std::thread::hardware_concurrency value.
    if (GetActiveProcessorCount_f != nullptr)
    {
        concurrency = GetActiveProcessorCount_f(ALL_PROCESSOR_GROUPS);
    }
 #endif
    return concurrency;
 }
 // Result of get_hardware_concurrency(), clamped to a minimum of 1.
 inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());
 // We want to abstract the purpose of storing the numa node index somewhat.
 // Whoever is using this does not need to know the specifics of the replication