diff --git a/src/numa.h b/src/numa.h index 644f212e..3c9c823a 100644 --- a/src/numa.h +++ b/src/numa.h @@ -41,7 +41,7 @@ #define _GNU_SOURCE #endif #include -#elif defined(_WIN32) +#elif defined(_WIN64) // On Windows each processor group can have up to 64 processors. // https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups @@ -61,7 +61,18 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT) // https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks +using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getprocessaffinitymask +using GetProcessAffinityMask_t = BOOL (*)(HANDLE, PDWORD_PTR, PDWORD_PTR); + +// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-getprocessgroupaffinity +using GetProcessGroupAffinity_t = BOOL (*)(HANDLE, PUSHORT, PUSHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getactiveprocessorcount using GetActiveProcessorCount_t = DWORD (*)(WORD); + #endif #include "misc.h" @@ -115,8 +126,6 @@ class NumaReplicatedAccessToken { // in a way that doesn't require recreating it completely, and it would be complex and expensive // to maintain class invariants. // The CPU (processor) numbers always correspond to the actual numbering used by the system. -// NOTE: the numbering is only valid within the process, as for example on Windows -// every process gets a "virtualized" set of processors that respects the current affinity // The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes. // In particular, empty nodes may be removed, or the user may create custom nodes. // It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig @@ -133,92 +142,21 @@ class NumaConfig { add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1); } - static std::set get_process_affinity() { - std::set cpus; - - // For unsupported systems, or in case of a soft error, we may assume all processors - // are available for use. - [[maybe_unused]] auto set_to_all_cpus = [&]() { - for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) - cpus.insert(c); - }; - -#if defined(__linux__) && !defined(__ANDROID__) - - // cpu_set_t by default holds 1024 entries. This may not be enough soon, - // but there is no easy way to determine how many threads there actually is. - // In this case we just choose a reasonable upper bound. - static constexpr CpuIndex MaxNumCpus = 1024 * 64; - - cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); - if (mask == nullptr) - std::exit(EXIT_FAILURE); - - const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); - - CPU_ZERO_S(masksize, mask); - - const int status = sched_getaffinity(0, masksize, mask); - - if (status != 0) - { - CPU_FREE(mask); - std::exit(EXIT_FAILURE); - } - - for (CpuIndex c = 0; c < MaxNumCpus; ++c) - if (CPU_ISSET_S(c, masksize, mask)) - cpus.insert(c); - - CPU_FREE(mask); - -#elif defined(_WIN32) - - // Windows is problematic and weird due to multiple ways of setting affinity, processor groups, - // and behaviour changes between versions. It's unclear if we can support this feature - // on Windows in the same way we do on Linux. - // Apparently when affinity is set via either start /affinity or msys2 taskset - // the function GetNumaProcessorNodeEx completely disregards the processors that we do not - // have affinity more. Moreover, the indices are shifted to start from 0, indicating that Windows - // is providing a whole new mapping of processors to this process. This is problematic in some cases - // but it at least allows us to [probably] support this affinity restriction feature by default. - // So overall, Windows appears to "virtualize" a set of processors and processor groups for every - // process. It's unclear if this assignment can change while the process is running. - // std::thread::hardware_concurrency() returns the number of processors that's consistent - // with GetNumaProcessorNodeEx, so we can just add all of them. - - set_to_all_cpus(); - -#else - - // For other systems we assume the process is allowed to execute on all processors. - set_to_all_cpus(); - -#endif - - return cpus; - } - // This function queries the system for the mapping of processors to NUMA nodes. // On Linux we utilize `lscpu` to avoid libnuma. - // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see - // comment for Windows implementation of get_process_affinity - static NumaConfig from_system(bool respectProcessAffinity = true) { + static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) { NumaConfig cfg = empty(); +#if defined(__linux__) && !defined(__ANDROID__) + std::set allowedCpus; if (respectProcessAffinity) allowedCpus = get_process_affinity(); - else - { - for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) - allowedCpus.insert(c); - } - auto is_cpu_allowed = [&](CpuIndex c) { return allowedCpus.count(c) == 1; }; - -#if defined(__linux__) && !defined(__ANDROID__) + auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) { + return !respectProcessAffinity || allowedCpus.count(c) == 1; + }; // On Linux things are straightforward, since there's no processor groups and // any thread can be scheduled on all processors. @@ -270,7 +208,19 @@ class NumaConfig { cfg.add_cpu_to_node(NumaIndex{0}, c); } -#elif defined(_WIN32) +#elif defined(_WIN64) + + std::optional> allowedCpus; + + if (respectProcessAffinity) + allowedCpus = get_process_affinity(); + + // The affinity can't be determined in all cases on Windows, but we at least guarantee + // that the number of allowed processors is >= number of processors in the affinity mask. + // In case the user is not satisfied they must set the processor numbers explicitly. + auto is_cpu_allowed = [&allowedCpus](CpuIndex c) { + return !allowedCpus.has_value() || allowedCpus->count(c) == 1; + }; // Since Windows 11 and Windows Server 2022 thread affinities can span // processor groups and can be set as such by a new WinAPI function. @@ -292,14 +242,6 @@ class NumaConfig { procnum.Reserved = 0; USHORT nodeNumber; - // When start /affinity or taskset was used to run this process with restricted affinity - // GetNumaProcessorNodeEx will NOT correspond to the system's processor setup, instead - // it appears to follow a completely new processor assignment, made specifically for this process, - // in which processors that this process has affinity for are remapped, and only those are remapped, - // to form a new set of processors. In other words, we can only get processors - // which we have affinity for this way. This means that the behaviour for - // `respectProcessAffinity == false` may be unexpected when affinity is set from outside, - // while the behaviour for `respectProcessAffinity == true` is given by default. const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber); const CpuIndex c = static_cast(procGroup) * WIN_PROCESSOR_GROUP_SIZE + static_cast(number); @@ -347,8 +289,7 @@ class NumaConfig { // Fallback for unsupported systems. for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) - if (is_cpu_allowed(c)) - cfg.add_cpu_to_node(NumaIndex{0}, c); + cfg.add_cpu_to_node(NumaIndex{0}, c); #endif @@ -573,7 +514,7 @@ class NumaConfig { // This is defensive, allowed because this code is not performance critical. sched_yield(); -#elif defined(_WIN32) +#elif defined(_WIN64) // Requires Windows 11. No good way to set thread affinity spanning processor groups before that. HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); @@ -627,9 +568,9 @@ class NumaConfig { // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support GROUP_AFFINITY affinity; std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); - affinity.Group = static_cast(n); // We use an ordered set so we're guaranteed to get the smallest cpu number here. const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; + affinity.Group = static_cast(forcedProcGroupIndex); for (CpuIndex c : nodes[n]) { const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; @@ -733,6 +674,139 @@ class NumaConfig { return true; } + + +#if defined(__linux__) && !defined(__ANDROID__) + + static std::set get_process_affinity() { + + std::set cpus; + + // For unsupported systems, or in case of a soft error, we may assume all processors + // are available for use. + [[maybe_unused]] auto set_to_all_cpus = [&]() { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cpus.insert(c); + }; + + // cpu_set_t by default holds 1024 entries. This may not be enough soon, + // but there is no easy way to determine how many threads there actually is. + // In this case we just choose a reasonable upper bound. + static constexpr CpuIndex MaxNumCpus = 1024 * 64; + + cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); + + CPU_ZERO_S(masksize, mask); + + const int status = sched_getaffinity(0, masksize, mask); + + if (status != 0) + { + CPU_FREE(mask); + std::exit(EXIT_FAILURE); + } + + for (CpuIndex c = 0; c < MaxNumCpus; ++c) + if (CPU_ISSET_S(c, masksize, mask)) + cpus.insert(c); + + CPU_FREE(mask); + + return cpus; + } + +#elif defined(_WIN64) + + // On Windows there are two ways to set affinity, and therefore 2 ways to get it. + // These are not consistent, so we have to check both. + // In some cases it is actually not possible to determine affinity. + // For example when two different threads have affinity on different processor groups, + // set using SetThreadAffinityMask, we can't retrieve the actual affinities. + // From documentation on GetProcessAffinityMask: + // > If the calling process contains threads in multiple groups, + // > the function returns zero for both affinity masks. + // In such cases we just give up and assume we have affinity for all processors. + // nullopt means no affinity is set, that is, all processors are allowed + static std::optional> get_process_affinity() { + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks")); + auto GetProcessAffinityMask_f = + GetProcessAffinityMask_t((void (*)()) GetProcAddress(k32, "GetProcessAffinityMask")); + auto GetProcessGroupAffinity_f = + GetProcessGroupAffinity_t((void (*)()) GetProcAddress(k32, "GetProcessGroupAffinity")); + + if (GetThreadSelectedCpuSetMasks_f != nullptr) + { + std::set cpus; + + USHORT RequiredMaskCount; + GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount); + + // If RequiredMaskCount then these affinities were never set, but it's not consistent + // so GetProcessAffinityMask may still return some affinity. + if (RequiredMaskCount > 0) + { + auto groupAffinities = std::make_unique(RequiredMaskCount); + + GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(), + RequiredMaskCount, &RequiredMaskCount); + + for (USHORT i = 0; i < RequiredMaskCount; ++i) + { + const size_t procGroupIndex = groupAffinities[i].Group; + + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (groupAffinities[i].Mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + } + + return cpus; + } + } + + if (GetProcessAffinityMask_f != nullptr && GetProcessGroupAffinity_f != nullptr) + { + std::set cpus; + + DWORD_PTR proc, sys; + BOOL status = GetProcessAffinityMask_f(GetCurrentProcess(), &proc, &sys); + if (status == 0) + return std::nullopt; + + // We can't determine affinity because it spans processor groups. + if (proc == 0) + return std::nullopt; + + // We are expecting a single group. + USHORT GroupCount = 1; + USHORT GroupArray[1]; + status = GetProcessGroupAffinity_f(GetCurrentProcess(), &GroupCount, GroupArray); + if (status == 0 || GroupCount != 1) + return std::nullopt; + + const size_t procGroupIndex = GroupArray[0]; + + uint64_t mask = static_cast(proc); + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + + return cpus; + } + + return std::nullopt; + } + +#endif }; class NumaReplicationContext;