mirror of
https://github.com/sockspls/badfish
synced 2025-04-29 16:23:09 +00:00
Add NumaPolicy "hardware" option that bypasses current processor affinity.
Can be used in case a GUI (e.g. ChessBase 17 see #5307) sets affinity to a single processor group, but the user would like to use the full capabilities of the hardware. Improves affinity handling on Windows in case of multiple available APIs and existing affinities. closes https://github.com/official-stockfish/Stockfish/pull/5353 No functional change
This commit is contained in:
parent
daaccd9fc9
commit
02ff76630b
2 changed files with 232 additions and 167 deletions
|
@ -133,6 +133,11 @@ void Engine::set_numa_config_from_option(const std::string& o) {
|
|||
{
|
||||
numaContext.set_numa_config(NumaConfig::from_system());
|
||||
}
|
||||
else if (o == "hardware")
|
||||
{
|
||||
// Don't respect affinity set in the system.
|
||||
numaContext.set_numa_config(NumaConfig::from_system(false));
|
||||
}
|
||||
else if (o == "none")
|
||||
{
|
||||
numaContext.set_numa_config(NumaConfig{});
|
||||
|
|
394
src/numa.h
394
src/numa.h
|
@ -19,6 +19,7 @@
|
|||
#ifndef NUMA_H_INCLUDED
|
||||
#define NUMA_H_INCLUDED
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
|
@ -63,21 +64,9 @@ static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64;
|
|||
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks
|
||||
using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
|
||||
using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
|
||||
using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getprocessaffinitymask
|
||||
using GetProcessAffinityMask_t = BOOL (*)(HANDLE, PDWORD_PTR, PDWORD_PTR);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-getprocessgroupaffinity
|
||||
using GetProcessGroupAffinity_t = BOOL (*)(HANDLE, PUSHORT, PUSHORT);
|
||||
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getactiveprocessorcount
|
||||
using GetActiveProcessorCount_t = DWORD (*)(WORD);
|
||||
|
||||
#endif
|
||||
|
||||
#include "misc.h"
|
||||
|
@ -94,14 +83,7 @@ inline CpuIndex get_hardware_concurrency() {
|
|||
// only returns the number of processors in the first group, because only these
|
||||
// are available to std::thread.
|
||||
#ifdef _WIN64
|
||||
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
|
||||
auto GetActiveProcessorCount_f =
|
||||
GetActiveProcessorCount_t((void (*)()) GetProcAddress(k32, "GetActiveProcessorCount"));
|
||||
|
||||
if (GetActiveProcessorCount_f != nullptr)
|
||||
{
|
||||
concurrency = GetActiveProcessorCount_f(ALL_PROCESSOR_GROUPS);
|
||||
}
|
||||
concurrency = std::max<CpuIndex>(concurrency, GetActiveProcessorCount(ALL_PROCESSOR_GROUPS));
|
||||
#endif
|
||||
|
||||
return concurrency;
|
||||
|
@ -109,6 +91,214 @@ inline CpuIndex get_hardware_concurrency() {
|
|||
|
||||
inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());
|
||||
|
||||
#if defined(_WIN64)
|
||||
|
||||
struct WindowsAffinity {
|
||||
std::optional<std::set<CpuIndex>> oldApi;
|
||||
std::optional<std::set<CpuIndex>> newApi;
|
||||
bool isDeterminate = true;
|
||||
|
||||
std::optional<std::set<CpuIndex>> get_combined() const {
|
||||
// When the affinity is not determinate we treat it as no affinity,
|
||||
// because otherwise we would have to set affinity to fewer
|
||||
// processors than we currently have affinity to.
|
||||
if (!isDeterminate)
|
||||
return std::nullopt;
|
||||
|
||||
if (!oldApi.has_value())
|
||||
return newApi;
|
||||
if (!newApi.has_value())
|
||||
return oldApi;
|
||||
|
||||
std::set<CpuIndex> intersect;
|
||||
std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(),
|
||||
std::inserter(intersect, intersect.begin()));
|
||||
return intersect;
|
||||
}
|
||||
};
|
||||
|
||||
// Queries, via the old GetProcessGroupAffinity API, which processor groups
// the current process has affinity for. Returns the raw API status (0 on
// failure) together with the list of group indices that were reported.
inline std::pair<BOOL, std::vector<USHORT>> get_process_group_affinity() {
    WORD numProcGroups = GetActiveProcessorGroupCount();

    // GetProcessGroupAffinity requires the GroupArray argument to be
    // aligned to 4 bytes instead of just 2.
    static constexpr size_t GroupArrayMinimumAlignment = 4;
    static_assert(GroupArrayMinimumAlignment >= alignof(USHORT));

    // Over-allocate so the stricter alignment can always be satisfied.
    // NOTE(review): GroupArray.get() is used directly without adjusting the
    // pointer; this relies on operator new[] returning memory aligned to at
    // least alignof(std::max_align_t) (>= 4) — confirm this is intentional.
    auto GroupArray = std::make_unique<USHORT[]>(
        numProcGroups + (GroupArrayMinimumAlignment / alignof(USHORT) - 1));

    // On input GroupCount is the buffer capacity; on success the API
    // overwrites it with the number of entries actually filled in.
    USHORT GroupCount = static_cast<USHORT>(numProcGroups);
    const BOOL status = GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, GroupArray.get());

    return std::make_pair(status, std::vector(GroupArray.get(), GroupArray.get() + GroupCount));
}
|
||||
|
||||
// Since Windows 11 and Windows Server 2022 thread affinities can span
// processor groups and can be set as such by a new WinAPI function.
// However, we may need to force using the old API if we detect
// that the process has affinity set by the old API already and we want to override that.
// Returns true when affinity must be managed through the old
// (single-processor-group) API.
inline bool use_old_affinity_api() {
    // Resolve the new API at runtime; it only exists on Windows 11 /
    // Windows Server 2022 and later.
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
        (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));

    // Without the new API there is no choice: only the old one is available.
    if (SetThreadSelectedCpuSetMasks_f == nullptr)
        return true;

    auto [status, groupAffinity] = get_process_group_affinity();

    // If GroupCount > 1 then we know old API was never used and we can stick
    // to the new API safely.
    if (status != 0 && groupAffinity.size() > 1)
        return false;

    // A single-group (or undeterminable) affinity may have been set with the
    // old API, so fall back to it in order to be able to override that affinity.
    return true;
};
|
||||
|
||||
// On Windows there are two ways to set affinity, and therefore 2 ways to get it.
// These are not consistent, so we have to check both.
// In some cases it is actually not possible to determine affinity.
// For example when two different threads have affinity on different processor groups,
// set using SetThreadAffinityMask, we can't retrieve the actual affinities.
// From documentation on GetProcessAffinityMask:
// > If the calling process contains threads in multiple groups,
// > the function returns zero for both affinity masks.
// In such cases we just give up and assume we have affinity for all processors.
// nullopt means no affinity is set, that is, all processors are allowed
inline WindowsAffinity get_process_affinity() {
    // The new CPU-set-masks API only exists on Windows 11 / Server 2022
    // onwards, so resolve it dynamically.
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
        (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));

    WindowsAffinity affinity;

    if (GetThreadSelectedCpuSetMasks_f != nullptr)
    {
        // First query the number of masks that were set.
        // RequiredMaskCount == 0 means these affinities were never set, but
        // the two APIs are not consistent, so GetProcessAffinityMask below
        // may still return some affinity.
        USHORT RequiredMaskCount;
        BOOL status =
            GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);

        // The query itself failed; we cannot tell what the new-API affinity
        // is, so report the whole result as indeterminate.
        if (status == 0)
        {
            affinity.isDeterminate = false;
            return affinity;
        }

        if (RequiredMaskCount > 0)
        {
            std::set<CpuIndex> cpus;

            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);

            GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
                                           RequiredMaskCount, &RequiredMaskCount);

            // Expand each (group, 64-bit mask) pair into flat CPU indices.
            for (USHORT i = 0; i < RequiredMaskCount; ++i)
            {
                const size_t procGroupIndex = groupAffinities[i].Group;

                for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                {
                    if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
                        cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                }
            }

            affinity.newApi = std::move(cpus);
        }
    }

    // Now gather what the old API reports, independently of the above.
    DWORD_PTR proc, sys;
    BOOL status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys);

    // If proc == 0 then we can't determine affinity because it spans processor groups.
    if (status == 0 || proc == 0)
    {
        affinity.isDeterminate = false;
        return affinity;
    }

    // If SetProcessAffinityMask was never called the affinity
    // must span all processor groups, but if it was called it must only span one.
    auto [status2, groupAffinity] = get_process_group_affinity();
    if (status2 == 0)
    {
        affinity.isDeterminate = false;
        return affinity;
    }

    // If we have affinity for more than 1 group then at this point we
    // can assume SetProcessAffinityMask has never been called and therefore
    // according to the old API we do not have any affinity set.
    // Otherwise we have to assume we have affinity set and gather the processor IDs.
    if (groupAffinity.size() == 1)
    {
        std::set<CpuIndex> cpus;

        const size_t procGroupIndex = groupAffinity[0];

        // proc is a 64-bit mask relative to the single group reported above.
        uint64_t mask = static_cast<uint64_t>(proc);
        for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
        {
            if (mask & (KAFFINITY(1) << j))
                cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
        }

        affinity.oldApi = std::move(cpus);
    }

    return affinity;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) && !defined(__ANDROID__)
|
||||
|
||||
inline std::set<CpuIndex> get_process_affinity() {
|
||||
|
||||
std::set<CpuIndex> cpus;
|
||||
|
||||
// For unsupported systems, or in case of a soft error, we may assume all processors
|
||||
// are available for use.
|
||||
[[maybe_unused]] auto set_to_all_cpus = [&]() {
|
||||
for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
|
||||
cpus.insert(c);
|
||||
};
|
||||
|
||||
// cpu_set_t by default holds 1024 entries. This may not be enough soon,
|
||||
// but there is no easy way to determine how many threads there actually is.
|
||||
// In this case we just choose a reasonable upper bound.
|
||||
static constexpr CpuIndex MaxNumCpus = 1024 * 64;
|
||||
|
||||
cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
|
||||
if (mask == nullptr)
|
||||
std::exit(EXIT_FAILURE);
|
||||
|
||||
const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);
|
||||
|
||||
CPU_ZERO_S(masksize, mask);
|
||||
|
||||
const int status = sched_getaffinity(0, masksize, mask);
|
||||
|
||||
if (status != 0)
|
||||
{
|
||||
CPU_FREE(mask);
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
for (CpuIndex c = 0; c < MaxNumCpus; ++c)
|
||||
if (CPU_ISSET_S(c, masksize, mask))
|
||||
cpus.insert(c);
|
||||
|
||||
CPU_FREE(mask);
|
||||
|
||||
return cpus;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// We want to abstract the purpose of storing the numa node index somewhat.
|
||||
// Whoever is using this does not need to know the specifics of the replication
|
||||
|
@ -224,7 +414,7 @@ class NumaConfig {
|
|||
std::optional<std::set<CpuIndex>> allowedCpus;
|
||||
|
||||
if (respectProcessAffinity)
|
||||
allowedCpus = get_process_affinity();
|
||||
allowedCpus = get_process_affinity().get_combined();
|
||||
|
||||
// The affinity can't be determined in all cases on Windows, but we at least guarantee
|
||||
// that the number of allowed processors is >= number of processors in the affinity mask.
|
||||
|
@ -233,15 +423,6 @@ class NumaConfig {
|
|||
return !allowedCpus.has_value() || allowedCpus->count(c) == 1;
|
||||
};
|
||||
|
||||
// Since Windows 11 and Windows Server 2022 thread affinities can span
|
||||
// processor groups and can be set as such by a new WinAPI function.
|
||||
static const bool CanAffinitySpanProcessorGroups = []() {
|
||||
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
|
||||
auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
|
||||
(void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));
|
||||
return SetThreadSelectedCpuSetMasks_f != nullptr;
|
||||
}();
|
||||
|
||||
WORD numProcGroups = GetActiveProcessorGroupCount();
|
||||
for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup)
|
||||
{
|
||||
|
@ -269,7 +450,8 @@ class NumaConfig {
|
|||
// the new NUMA allocation behaviour was introduced while there was
|
||||
// still no way to set thread affinity spanning multiple processor groups.
|
||||
// See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
|
||||
if (!CanAffinitySpanProcessorGroups)
|
||||
// We also do this is if need to force old API for some reason.
|
||||
if (use_old_affinity_api())
|
||||
{
|
||||
NumaConfig splitCfg = empty();
|
||||
|
||||
|
@ -307,6 +489,12 @@ class NumaConfig {
|
|||
// We have to ensure no empty NUMA nodes persist.
|
||||
cfg.remove_empty_numa_nodes();
|
||||
|
||||
// If the user explicitly opts out from respecting the current process affinity
|
||||
// then it may be inconsistent with the current affinity (obviously), so we
|
||||
// consider it custom.
|
||||
if (!respectProcessAffinity)
|
||||
cfg.customAffinity = true;
|
||||
|
||||
return cfg;
|
||||
}
|
||||
|
||||
|
@ -510,9 +698,11 @@ class NumaConfig {
|
|||
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
|
||||
auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
|
||||
(void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));
|
||||
auto SetThreadGroupAffinity_f =
|
||||
SetThreadGroupAffinity_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity"));
|
||||
|
||||
// We ALWAYS set affinity with the new API if available,
|
||||
// because there's no downsides, and we forcibly keep it consistent
|
||||
// with the old API should we need to use it. I.e. we always keep this as a superset
|
||||
// of what we set with SetThreadGroupAffinity.
|
||||
if (SetThreadSelectedCpuSetMasks_f != nullptr)
|
||||
{
|
||||
// Only available on Windows 11 and Windows Server 2022 onwards.
|
||||
|
@ -541,7 +731,9 @@ class NumaConfig {
|
|||
// This is defensive, allowed because this code is not performance critical.
|
||||
SwitchToThread();
|
||||
}
|
||||
else if (SetThreadGroupAffinity_f != nullptr)
|
||||
|
||||
// Sometimes we need to force the old API, but do not use it unless necessary.
|
||||
if (SetThreadSelectedCpuSetMasks_f == nullptr || use_old_affinity_api())
|
||||
{
|
||||
// On earlier windows version (since windows 7) we can't run a single thread
|
||||
// on multiple processor groups, so we need to restrict the group.
|
||||
|
@ -576,7 +768,7 @@ class NumaConfig {
|
|||
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
|
||||
const BOOL status = SetThreadGroupAffinity_f(hThread, &affinity, nullptr);
|
||||
const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr);
|
||||
if (status == 0)
|
||||
std::exit(EXIT_FAILURE);
|
||||
|
||||
|
@ -665,138 +857,6 @@ class NumaConfig {
|
|||
return true;
|
||||
}
|
||||
|
||||
#if defined(__linux__) && !defined(__ANDROID__)
|
||||
|
||||
static std::set<CpuIndex> get_process_affinity() {
|
||||
|
||||
std::set<CpuIndex> cpus;
|
||||
|
||||
// For unsupported systems, or in case of a soft error, we may assume all processors
|
||||
// are available for use.
|
||||
[[maybe_unused]] auto set_to_all_cpus = [&]() {
|
||||
for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
|
||||
cpus.insert(c);
|
||||
};
|
||||
|
||||
// cpu_set_t by default holds 1024 entries. This may not be enough soon,
|
||||
// but there is no easy way to determine how many threads there actually is.
|
||||
// In this case we just choose a reasonable upper bound.
|
||||
static constexpr CpuIndex MaxNumCpus = 1024 * 64;
|
||||
|
||||
cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
|
||||
if (mask == nullptr)
|
||||
std::exit(EXIT_FAILURE);
|
||||
|
||||
const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);
|
||||
|
||||
CPU_ZERO_S(masksize, mask);
|
||||
|
||||
const int status = sched_getaffinity(0, masksize, mask);
|
||||
|
||||
if (status != 0)
|
||||
{
|
||||
CPU_FREE(mask);
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
for (CpuIndex c = 0; c < MaxNumCpus; ++c)
|
||||
if (CPU_ISSET_S(c, masksize, mask))
|
||||
cpus.insert(c);
|
||||
|
||||
CPU_FREE(mask);
|
||||
|
||||
return cpus;
|
||||
}
|
||||
|
||||
#elif defined(_WIN64)
|
||||
|
||||
// On Windows there are two ways to set affinity, and therefore 2 ways to get it.
|
||||
// These are not consistent, so we have to check both.
|
||||
// In some cases it is actually not possible to determine affinity.
|
||||
// For example when two different threads have affinity on different processor groups,
|
||||
// set using SetThreadAffinityMask, we can't retrieve the actual affinities.
|
||||
// From documentation on GetProcessAffinityMask:
|
||||
// > If the calling process contains threads in multiple groups,
|
||||
// > the function returns zero for both affinity masks.
|
||||
// In such cases we just give up and assume we have affinity for all processors.
|
||||
// nullopt means no affinity is set, that is, all processors are allowed
|
||||
static std::optional<std::set<CpuIndex>> get_process_affinity() {
|
||||
HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
|
||||
auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
|
||||
(void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));
|
||||
auto GetProcessAffinityMask_f =
|
||||
GetProcessAffinityMask_t((void (*)()) GetProcAddress(k32, "GetProcessAffinityMask"));
|
||||
auto GetProcessGroupAffinity_f =
|
||||
GetProcessGroupAffinity_t((void (*)()) GetProcAddress(k32, "GetProcessGroupAffinity"));
|
||||
|
||||
if (GetThreadSelectedCpuSetMasks_f != nullptr)
|
||||
{
|
||||
std::set<CpuIndex> cpus;
|
||||
|
||||
USHORT RequiredMaskCount;
|
||||
GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);
|
||||
|
||||
// If RequiredMaskCount then these affinities were never set, but it's not consistent
|
||||
// so GetProcessAffinityMask may still return some affinity.
|
||||
if (RequiredMaskCount > 0)
|
||||
{
|
||||
auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);
|
||||
|
||||
GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
|
||||
RequiredMaskCount, &RequiredMaskCount);
|
||||
|
||||
for (USHORT i = 0; i < RequiredMaskCount; ++i)
|
||||
{
|
||||
const size_t procGroupIndex = groupAffinities[i].Group;
|
||||
|
||||
for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
|
||||
{
|
||||
if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
|
||||
cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
|
||||
}
|
||||
}
|
||||
|
||||
return cpus;
|
||||
}
|
||||
}
|
||||
|
||||
if (GetProcessAffinityMask_f != nullptr && GetProcessGroupAffinity_f != nullptr)
|
||||
{
|
||||
std::set<CpuIndex> cpus;
|
||||
|
||||
DWORD_PTR proc, sys;
|
||||
BOOL status = GetProcessAffinityMask_f(GetCurrentProcess(), &proc, &sys);
|
||||
if (status == 0)
|
||||
return std::nullopt;
|
||||
|
||||
// We can't determine affinity because it spans processor groups.
|
||||
if (proc == 0)
|
||||
return std::nullopt;
|
||||
|
||||
// We are expecting a single group.
|
||||
USHORT GroupCount = 1;
|
||||
alignas(4) USHORT GroupArray[1];
|
||||
status = GetProcessGroupAffinity_f(GetCurrentProcess(), &GroupCount, GroupArray);
|
||||
if (status == 0 || GroupCount != 1)
|
||||
return std::nullopt;
|
||||
|
||||
const size_t procGroupIndex = GroupArray[0];
|
||||
|
||||
uint64_t mask = static_cast<uint64_t>(proc);
|
||||
for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
|
||||
{
|
||||
if (mask & (KAFFINITY(1) << j))
|
||||
cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
|
||||
}
|
||||
|
||||
return cpus;
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static std::vector<size_t> indices_from_shortened_string(const std::string& s) {
|
||||
std::vector<size_t> indices;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue