Fix process' processor affinity determination on Windows.
Specialize and privatize NumaConfig::get_process_affinity. Only enable NUMA capability for 64-bit Windows.

Following #5307 and some further testing, it was determined that the way affinity was being determined on Windows was incorrect, because it relied on wrong assumptions about GetNumaProcessorNodeEx. This patch fixes the issue by attempting to retrieve the process's actual processor affinity through the Windows API.

One issue persists that cannot be addressed due to limitations of Windows and has to be accepted as a limitation: if affinities were set with SetThreadAffinityMask rather than SetThreadSelectedCpuSetMasks, and GetProcessGroupAffinity reports more than one group, it is NOT POSSIBLE to determine the affinity programmatically. In that case the implementation assumes no affinities are set and considers all processors available for execution.

closes https://github.com/official-stockfish/Stockfish/pull/5312

No functional change
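For readers unfamiliar with the Windows affinity APIs, the sketch below (not part of the patch) illustrates the retrieval order the new code follows on 64-bit Windows: first try GetThreadSelectedCpuSetMasks, which can report affinities spanning processor groups; otherwise fall back to GetProcessAffinityMask plus GetProcessGroupAffinity, which is only meaningful while the process occupies a single group; if neither path yields an answer, report "no affinity set". The function name query_process_affinity and the hard-coded group size of 64 are illustrative assumptions, and error handling is reduced to the minimum.

    // Minimal sketch, assuming 64-bit Windows, <windows.h> and C++17.
    #include <windows.h>
    #include <memory>
    #include <optional>
    #include <set>

    using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);

    // std::nullopt means "affinity could not be determined": treat all processors as allowed.
    std::optional<std::set<size_t>> query_process_affinity() {
        constexpr size_t GroupSize = 64;  // processors per Windows processor group

        // GetThreadSelectedCpuSetMasks only exists since Windows 11 / Server 2022,
        // so it has to be resolved at run time, as the patch does.
        HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
        auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
            (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));

        if (GetThreadSelectedCpuSetMasks_f != nullptr)
        {
            USHORT required = 0;
            GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &required);
            if (required > 0)  // zero means no CPU-set affinity was ever set this way
            {
                auto masks = std::make_unique<GROUP_AFFINITY[]>(required);
                GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), masks.get(), required, &required);

                std::set<size_t> cpus;
                for (USHORT i = 0; i < required; ++i)
                    for (size_t j = 0; j < GroupSize; ++j)
                        if (masks[i].Mask & (KAFFINITY(1) << j))
                            cpus.insert(masks[i].Group * GroupSize + j);
                return cpus;
            }
        }

        // Fallback: classic affinity masks, which cover a single processor group.
        DWORD_PTR proc = 0, sys = 0;
        USHORT    groupCount = 1, group[1] = {};
        if (!GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys) || proc == 0
            || !GetProcessGroupAffinity(GetCurrentProcess(), &groupCount, group) || groupCount != 1)
            return std::nullopt;  // spans multiple groups: cannot be determined

        std::set<size_t> cpus;
        for (size_t j = 0; j < GroupSize; ++j)
            if (proc & (DWORD_PTR(1) << j))
                cpus.insert(size_t(group[0]) * GroupSize + j);
        return cpus;
    }

Calling GetProcessGroupAffinity with room for exactly one group is deliberate: when the process spans several groups the call fails (or reports a count other than one), which is precisely the situation the commit message describes as undeterminable, so the sketch, like the patch, falls back to treating every processor as available.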
parent 596fb4842b
commit f1bb4164bf

1 changed file with 167 additions and 93 deletions

src/numa.h

@@ -41,7 +41,7 @@
        #define _GNU_SOURCE
    #endif
    #include <sched.h>
#elif defined(_WIN32)
#elif defined(_WIN64)

// On Windows each processor group can have up to 64 processors.
// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups

@@ -61,7 +61,18 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT)
// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);

// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);

// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getprocessaffinitymask
using GetProcessAffinityMask_t = BOOL (*)(HANDLE, PDWORD_PTR, PDWORD_PTR);

// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-getprocessgroupaffinity
using GetProcessGroupAffinity_t = BOOL (*)(HANDLE, PUSHORT, PUSHORT);

// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getactiveprocessorcount
using GetActiveProcessorCount_t = DWORD (*)(WORD);

#endif

#include "misc.h"

@@ -115,8 +126,6 @@ class NumaReplicatedAccessToken
// in a way that doesn't require recreating it completely, and it would be complex and expensive
// to maintain class invariants.
// The CPU (processor) numbers always correspond to the actual numbering used by the system.
// NOTE: the numbering is only valid within the process, as for example on Windows
// every process gets a "virtualized" set of processors that respects the current affinity
// The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes.
// In particular, empty nodes may be removed, or the user may create custom nodes.
// It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig

@@ -133,92 +142,21 @@ class NumaConfig
        add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1);
    }

    static std::set<CpuIndex> get_process_affinity() {
        std::set<CpuIndex> cpus;

        // For unsupported systems, or in case of a soft error, we may assume all processors
        // are available for use.
        [[maybe_unused]] auto set_to_all_cpus = [&]() {
            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
                cpus.insert(c);
        };

#if defined(__linux__) && !defined(__ANDROID__)

        // cpu_set_t by default holds 1024 entries. This may not be enough soon,
        // but there is no easy way to determine how many threads there actually is.
        // In this case we just choose a reasonable upper bound.
        static constexpr CpuIndex MaxNumCpus = 1024 * 64;

        cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
        if (mask == nullptr)
            std::exit(EXIT_FAILURE);

        const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);

        CPU_ZERO_S(masksize, mask);

        const int status = sched_getaffinity(0, masksize, mask);

        if (status != 0)
        {
            CPU_FREE(mask);
            std::exit(EXIT_FAILURE);
        }

        for (CpuIndex c = 0; c < MaxNumCpus; ++c)
            if (CPU_ISSET_S(c, masksize, mask))
                cpus.insert(c);

        CPU_FREE(mask);

#elif defined(_WIN32)

        // Windows is problematic and weird due to multiple ways of setting affinity, processor groups,
        // and behaviour changes between versions. It's unclear if we can support this feature
        // on Windows in the same way we do on Linux.
        // Apparently when affinity is set via either start /affinity or msys2 taskset
        // the function GetNumaProcessorNodeEx completely disregards the processors that we do not
        // have affinity for. Moreover, the indices are shifted to start from 0, indicating that Windows
        // is providing a whole new mapping of processors to this process. This is problematic in some cases
        // but it at least allows us to [probably] support this affinity restriction feature by default.
        // So overall, Windows appears to "virtualize" a set of processors and processor groups for every
        // process. It's unclear if this assignment can change while the process is running.
        // std::thread::hardware_concurrency() returns the number of processors that's consistent
        // with GetNumaProcessorNodeEx, so we can just add all of them.

        set_to_all_cpus();

#else

        // For other systems we assume the process is allowed to execute on all processors.
        set_to_all_cpus();

#endif

        return cpus;
    }

    // This function queries the system for the mapping of processors to NUMA nodes.
    // On Linux we utilize `lscpu` to avoid libnuma.
    // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see
    // comment for Windows implementation of get_process_affinity
    static NumaConfig from_system(bool respectProcessAffinity = true) {
    static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) {
        NumaConfig cfg = empty();

#if defined(__linux__) && !defined(__ANDROID__)

        std::set<CpuIndex> allowedCpus;

        if (respectProcessAffinity)
            allowedCpus = get_process_affinity();
        else
        {
            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
                allowedCpus.insert(c);
        }

        auto is_cpu_allowed = [&](CpuIndex c) { return allowedCpus.count(c) == 1; };

#if defined(__linux__) && !defined(__ANDROID__)
        auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) {
            return !respectProcessAffinity || allowedCpus.count(c) == 1;
        };

        // On Linux things are straightforward, since there's no processor groups and
        // any thread can be scheduled on all processors.

@@ -270,7 +208,19 @@ class NumaConfig
            cfg.add_cpu_to_node(NumaIndex{0}, c);
        }

#elif defined(_WIN32)
#elif defined(_WIN64)

        std::optional<std::set<CpuIndex>> allowedCpus;

        if (respectProcessAffinity)
            allowedCpus = get_process_affinity();

        // The affinity can't be determined in all cases on Windows, but we at least guarantee
        // that the number of allowed processors is >= number of processors in the affinity mask.
        // In case the user is not satisfied they must set the processor numbers explicitly.
        auto is_cpu_allowed = [&allowedCpus](CpuIndex c) {
            return !allowedCpus.has_value() || allowedCpus->count(c) == 1;
        };

        // Since Windows 11 and Windows Server 2022 thread affinities can span
        // processor groups and can be set as such by a new WinAPI function.

@@ -292,14 +242,6 @@ class NumaConfig
            procnum.Reserved = 0;
            USHORT nodeNumber;

            // When start /affinity or taskset was used to run this process with restricted affinity
            // GetNumaProcessorNodeEx will NOT correspond to the system's processor setup, instead
            // it appears to follow a completely new processor assignment, made specifically for this process,
            // in which processors that this process has affinity for are remapped, and only those are remapped,
            // to form a new set of processors. In other words, we can only get processors
            // which we have affinity for this way. This means that the behaviour for
            // `respectProcessAffinity == false` may be unexpected when affinity is set from outside,
            // while the behaviour for `respectProcessAffinity == true` is given by default.
            const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber);
            const CpuIndex c = static_cast<CpuIndex>(procGroup) * WIN_PROCESSOR_GROUP_SIZE
                             + static_cast<CpuIndex>(number);

@@ -347,8 +289,7 @@ class NumaConfig

        // Fallback for unsupported systems.
        for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
            if (is_cpu_allowed(c))
                cfg.add_cpu_to_node(NumaIndex{0}, c);
            cfg.add_cpu_to_node(NumaIndex{0}, c);

#endif

@@ -573,7 +514,7 @@ class NumaConfig
        // This is defensive, allowed because this code is not performance critical.
        sched_yield();

#elif defined(_WIN32)
#elif defined(_WIN64)

        // Requires Windows 11. No good way to set thread affinity spanning processor groups before that.
        HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));

@@ -627,9 +568,9 @@ class NumaConfig
            // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
            GROUP_AFFINITY affinity;
            std::memset(&affinity, 0, sizeof(GROUP_AFFINITY));
            affinity.Group = static_cast<WORD>(n);
            // We use an ordered set so we're guaranteed to get the smallest cpu number here.
            const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE;
            affinity.Group = static_cast<WORD>(forcedProcGroupIndex);
            for (CpuIndex c : nodes[n])
            {
                const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;

@@ -733,6 +674,139 @@ class NumaConfig

        return true;
    }

#if defined(__linux__) && !defined(__ANDROID__)

    static std::set<CpuIndex> get_process_affinity() {

        std::set<CpuIndex> cpus;

        // For unsupported systems, or in case of a soft error, we may assume all processors
        // are available for use.
        [[maybe_unused]] auto set_to_all_cpus = [&]() {
            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
                cpus.insert(c);
        };

        // cpu_set_t by default holds 1024 entries. This may not be enough soon,
        // but there is no easy way to determine how many threads there actually is.
        // In this case we just choose a reasonable upper bound.
        static constexpr CpuIndex MaxNumCpus = 1024 * 64;

        cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
        if (mask == nullptr)
            std::exit(EXIT_FAILURE);

        const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);

        CPU_ZERO_S(masksize, mask);

        const int status = sched_getaffinity(0, masksize, mask);

        if (status != 0)
        {
            CPU_FREE(mask);
            std::exit(EXIT_FAILURE);
        }

        for (CpuIndex c = 0; c < MaxNumCpus; ++c)
            if (CPU_ISSET_S(c, masksize, mask))
                cpus.insert(c);

        CPU_FREE(mask);

        return cpus;
    }

#elif defined(_WIN64)

    // On Windows there are two ways to set affinity, and therefore 2 ways to get it.
    // These are not consistent, so we have to check both.
    // In some cases it is actually not possible to determine affinity.
    // For example when two different threads have affinity on different processor groups,
    // set using SetThreadAffinityMask, we can't retrieve the actual affinities.
    // From documentation on GetProcessAffinityMask:
    // > If the calling process contains threads in multiple groups,
    // > the function returns zero for both affinity masks.
    // In such cases we just give up and assume we have affinity for all processors.
    // nullopt means no affinity is set, that is, all processors are allowed
    static std::optional<std::set<CpuIndex>> get_process_affinity() {
        HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
        auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
            (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));
        auto GetProcessAffinityMask_f =
            GetProcessAffinityMask_t((void (*)()) GetProcAddress(k32, "GetProcessAffinityMask"));
        auto GetProcessGroupAffinity_f =
            GetProcessGroupAffinity_t((void (*)()) GetProcAddress(k32, "GetProcessGroupAffinity"));

        if (GetThreadSelectedCpuSetMasks_f != nullptr)
        {
            std::set<CpuIndex> cpus;

            USHORT RequiredMaskCount;
            GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);

            // If RequiredMaskCount then these affinities were never set, but it's not consistent
            // so GetProcessAffinityMask may still return some affinity.
            if (RequiredMaskCount > 0)
            {
                auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);

                GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
                                               RequiredMaskCount, &RequiredMaskCount);

                for (USHORT i = 0; i < RequiredMaskCount; ++i)
                {
                    const size_t procGroupIndex = groupAffinities[i].Group;

                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                    {
                        if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                    }
                }

                return cpus;
            }
        }

        if (GetProcessAffinityMask_f != nullptr && GetProcessGroupAffinity_f != nullptr)
        {
            std::set<CpuIndex> cpus;

            DWORD_PTR proc, sys;
            BOOL      status = GetProcessAffinityMask_f(GetCurrentProcess(), &proc, &sys);
            if (status == 0)
                return std::nullopt;

            // We can't determine affinity because it spans processor groups.
            if (proc == 0)
                return std::nullopt;

            // We are expecting a single group.
            USHORT GroupCount = 1;
            USHORT GroupArray[1];
            status = GetProcessGroupAffinity_f(GetCurrentProcess(), &GroupCount, GroupArray);
            if (status == 0 || GroupCount != 1)
                return std::nullopt;

            const size_t procGroupIndex = GroupArray[0];

            uint64_t mask = static_cast<uint64_t>(proc);
            for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
            {
                if (mask & (KAFFINITY(1) << j))
                    cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
            }

            return cpus;
        }

        return std::nullopt;
    }

#endif
};

class NumaReplicationContext;