diff --git a/src/numa.h b/src/numa.h
index 644f212e..3c9c823a 100644
--- a/src/numa.h
+++ b/src/numa.h
@@ -41,7 +41,7 @@
         #define _GNU_SOURCE
     #endif
     #include <sched.h>
-#elif defined(_WIN32)
+#elif defined(_WIN64)
 
 // On Windows each processor group can have up to 64 processors.
 // https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
@@ -61,7 +61,18 @@ using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT)
 // https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-setthreadgroupaffinity
 using SetThreadGroupAffinity_t = BOOL (*)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
 
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
+using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);
+
+// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getprocessaffinitymask
+using GetProcessAffinityMask_t = BOOL (*)(HANDLE, PDWORD_PTR, PDWORD_PTR);
+
+// https://learn.microsoft.com/en-us/windows/win32/api/processtopologyapi/nf-processtopologyapi-getprocessgroupaffinity
+using GetProcessGroupAffinity_t = BOOL (*)(HANDLE, PUSHORT, PUSHORT);
+
+// https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-getactiveprocessorcount
 using GetActiveProcessorCount_t = DWORD (*)(WORD);
+
 #endif
 
 #include "misc.h"
@@ -115,8 +126,6 @@ class NumaReplicatedAccessToken {
 // in a way that doesn't require recreating it completely, and it would be complex and expensive
 // to maintain class invariants.
 // The CPU (processor) numbers always correspond to the actual numbering used by the system.
-//     NOTE: the numbering is only valid within the process, as for example on Windows
-//           every process gets a "virtualized" set of processors that respects the current affinity
 // The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes.
 // In particular, empty nodes may be removed, or the user may create custom nodes.
 // It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig
@@ -133,92 +142,21 @@ class NumaConfig {
         add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1);
     }
 
-    static std::set<CpuIndex> get_process_affinity() {
-        std::set<CpuIndex> cpus;
-
-        // For unsupported systems, or in case of a soft error, we may assume all processors
-        // are available for use.
-        [[maybe_unused]] auto set_to_all_cpus = [&]() {
-            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
-                cpus.insert(c);
-        };
-
-#if defined(__linux__) && !defined(__ANDROID__)
-
-        // cpu_set_t by default holds 1024 entries. This may not be enough soon,
-        // but there is no easy way to determine how many threads there actually is.
-        // In this case we just choose a reasonable upper bound.
-        static constexpr CpuIndex MaxNumCpus = 1024 * 64;
-
-        cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
-        if (mask == nullptr)
-            std::exit(EXIT_FAILURE);
-
-        const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);
-
-        CPU_ZERO_S(masksize, mask);
-
-        const int status = sched_getaffinity(0, masksize, mask);
-
-        if (status != 0)
-        {
-            CPU_FREE(mask);
-            std::exit(EXIT_FAILURE);
-        }
-
-        for (CpuIndex c = 0; c < MaxNumCpus; ++c)
-            if (CPU_ISSET_S(c, masksize, mask))
-                cpus.insert(c);
-
-        CPU_FREE(mask);
-
-#elif defined(_WIN32)
-
-        // Windows is problematic and weird due to multiple ways of setting affinity, processor groups,
-        // and behaviour changes between versions. It's unclear if we can support this feature
-        // on Windows in the same way we do on Linux.
-        // Apparently when affinity is set via either start /affinity or msys2 taskset
-        // the function GetNumaProcessorNodeEx completely disregards the processors that we do not
-        // have affinity more. Moreover, the indices are shifted to start from 0, indicating that Windows
-        // is providing a whole new mapping of processors to this process. This is problematic in some cases
-        // but it at least allows us to [probably] support this affinity restriction feature by default.
-        // So overall, Windows appears to "virtualize" a set of processors and processor groups for every
-        // process. It's unclear if this assignment can change while the process is running.
-        // std::thread::hardware_concurrency() returns the number of processors that's consistent
-        // with GetNumaProcessorNodeEx, so we can just add all of them.
-
-        set_to_all_cpus();
-
-#else
-
-        // For other systems we assume the process is allowed to execute on all processors.
-        set_to_all_cpus();
-
-#endif
-
-        return cpus;
-    }
-
     // This function queries the system for the mapping of processors to NUMA nodes.
     // On Linux we utilize `lscpu` to avoid libnuma.
-    // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see
-    // comment for Windows implementation of get_process_affinity
-    static NumaConfig from_system(bool respectProcessAffinity = true) {
+    static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) {
         NumaConfig cfg = empty();
 
+#if defined(__linux__) && !defined(__ANDROID__)
+
         std::set<CpuIndex> allowedCpus;
 
         if (respectProcessAffinity)
             allowedCpus = get_process_affinity();
-        else
-        {
-            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
-                allowedCpus.insert(c);
-        }
 
-        auto is_cpu_allowed = [&](CpuIndex c) { return allowedCpus.count(c) == 1; };
-
-#if defined(__linux__) && !defined(__ANDROID__)
+        auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) {
+            return !respectProcessAffinity || allowedCpus.count(c) == 1;
+        };
 
         // On Linux things are straightforward, since there's no processor groups and
         // any thread can be scheduled on all processors.
@@ -270,7 +208,19 @@ class NumaConfig {
                     cfg.add_cpu_to_node(NumaIndex{0}, c);
         }
 
-#elif defined(_WIN32)
+#elif defined(_WIN64)
+
+        std::optional<std::set<CpuIndex>> allowedCpus;
+
+        if (respectProcessAffinity)
+            allowedCpus = get_process_affinity();
+
+        // The affinity can't be determined in all cases on Windows, but we at least guarantee
+        // that the number of allowed processors is >= number of processors in the affinity mask.
+        // In case the user is not satisfied they must set the processor numbers explicitly.
+        auto is_cpu_allowed = [&allowedCpus](CpuIndex c) {
+            return !allowedCpus.has_value() || allowedCpus->count(c) == 1;
+        };
 
         // Since Windows 11 and Windows Server 2022 thread affinities can span
         // processor groups and can be set as such by a new WinAPI function.
@@ -292,14 +242,6 @@ class NumaConfig {
                 procnum.Reserved = 0;
                 USHORT nodeNumber;
 
-                // When start /affinity or taskset was used to run this process with restricted affinity
-                // GetNumaProcessorNodeEx will NOT correspond to the system's processor setup, instead
-                // it appears to follow a completely new processor assignment, made specifically for this process,
-                // in which processors that this process has affinity for are remapped, and only those are remapped,
-                // to form a new set of processors. In other words, we can only get processors
-                // which we have affinity for this way. This means that the behaviour for
-                // `respectProcessAffinity == false` may be unexpected when affinity is set from outside,
-                // while the behaviour for `respectProcessAffinity == true` is given by default.
                 const BOOL     status = GetNumaProcessorNodeEx(&procnum, &nodeNumber);
                 const CpuIndex c      = static_cast<CpuIndex>(procGroup) * WIN_PROCESSOR_GROUP_SIZE
                                  + static_cast<CpuIndex>(number);
@@ -347,8 +289,7 @@ class NumaConfig {
 
         // Fallback for unsupported systems.
         for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
-            if (is_cpu_allowed(c))
-                cfg.add_cpu_to_node(NumaIndex{0}, c);
+            cfg.add_cpu_to_node(NumaIndex{0}, c);
 
 #endif
 
@@ -573,7 +514,7 @@ class NumaConfig {
         // This is defensive, allowed because this code is not performance critical.
         sched_yield();
 
-#elif defined(_WIN32)
+#elif defined(_WIN64)
 
         // Requires Windows 11. No good way to set thread affinity spanning processor groups before that.
         HMODULE k32                            = GetModuleHandle(TEXT("Kernel32.dll"));
@@ -627,9 +568,9 @@ class NumaConfig {
             // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
             GROUP_AFFINITY affinity;
             std::memset(&affinity, 0, sizeof(GROUP_AFFINITY));
-            affinity.Group = static_cast<WORD>(n);
             // We use an ordered set so we're guaranteed to get the smallest cpu number here.
             const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE;
+            affinity.Group                    = static_cast<WORD>(forcedProcGroupIndex);
             for (CpuIndex c : nodes[n])
             {
                 const size_t procGroupIndex     = c / WIN_PROCESSOR_GROUP_SIZE;
@@ -733,6 +674,139 @@ class NumaConfig {
 
         return true;
     }
+
+
+#if defined(__linux__) && !defined(__ANDROID__)
+
+    static std::set<CpuIndex> get_process_affinity() {
+
+        std::set<CpuIndex> cpus;
+
+        // For unsupported systems, or in case of a soft error, we may assume all processors
+        // are available for use.
+        [[maybe_unused]] auto set_to_all_cpus = [&]() {
+            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
+                cpus.insert(c);
+        };
+
+        // cpu_set_t by default holds 1024 entries. This may not be enough soon,
+        // but there is no easy way to determine how many threads there actually is.
+        // In this case we just choose a reasonable upper bound.
+        static constexpr CpuIndex MaxNumCpus = 1024 * 64;
+
+        cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
+        if (mask == nullptr)
+            std::exit(EXIT_FAILURE);
+
+        const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);
+
+        CPU_ZERO_S(masksize, mask);
+
+        const int status = sched_getaffinity(0, masksize, mask);
+
+        if (status != 0)
+        {
+            CPU_FREE(mask);
+            std::exit(EXIT_FAILURE);
+        }
+
+        for (CpuIndex c = 0; c < MaxNumCpus; ++c)
+            if (CPU_ISSET_S(c, masksize, mask))
+                cpus.insert(c);
+
+        CPU_FREE(mask);
+
+        return cpus;
+    }
+
+#elif defined(_WIN64)
+
+    // On Windows there are two ways to set affinity, and therefore 2 ways to get it.
+    // These are not consistent, so we have to check both.
+    // In some cases it is actually not possible to determine affinity.
+    // For example when two different threads have affinity on different processor groups,
+    // set using SetThreadAffinityMask, we can't retrieve the actual affinities.
+    // From documentation on GetProcessAffinityMask:
+    //     > If the calling process contains threads in multiple groups,
+    //     > the function returns zero for both affinity masks.
+    // In such cases we just give up and assume we have affinity for all processors.
+    // nullopt means no affinity is set, that is, all processors are allowed
+    static std::optional<std::set<CpuIndex>> get_process_affinity() {
+        HMODULE k32                            = GetModuleHandle(TEXT("Kernel32.dll"));
+        auto    GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
+          (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));
+        auto GetProcessAffinityMask_f =
+          GetProcessAffinityMask_t((void (*)()) GetProcAddress(k32, "GetProcessAffinityMask"));
+        auto GetProcessGroupAffinity_f =
+          GetProcessGroupAffinity_t((void (*)()) GetProcAddress(k32, "GetProcessGroupAffinity"));
+
+        if (GetThreadSelectedCpuSetMasks_f != nullptr)
+        {
+            std::set<CpuIndex> cpus;
+
+            USHORT RequiredMaskCount;
+            GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);
+
+            // If RequiredMaskCount then these affinities were never set, but it's not consistent
+            // so GetProcessAffinityMask may still return some affinity.
+            if (RequiredMaskCount > 0)
+            {
+                auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);
+
+                GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
+                                               RequiredMaskCount, &RequiredMaskCount);
+
+                for (USHORT i = 0; i < RequiredMaskCount; ++i)
+                {
+                    const size_t procGroupIndex = groupAffinities[i].Group;
+
+                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
+                    {
+                        if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
+                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
+                    }
+                }
+
+                return cpus;
+            }
+        }
+
+        if (GetProcessAffinityMask_f != nullptr && GetProcessGroupAffinity_f != nullptr)
+        {
+            std::set<CpuIndex> cpus;
+
+            DWORD_PTR proc, sys;
+            BOOL      status = GetProcessAffinityMask_f(GetCurrentProcess(), &proc, &sys);
+            if (status == 0)
+                return std::nullopt;
+
+            // We can't determine affinity because it spans processor groups.
+            if (proc == 0)
+                return std::nullopt;
+
+            // We are expecting a single group.
+            USHORT GroupCount = 1;
+            USHORT GroupArray[1];
+            status = GetProcessGroupAffinity_f(GetCurrentProcess(), &GroupCount, GroupArray);
+            if (status == 0 || GroupCount != 1)
+                return std::nullopt;
+
+            const size_t procGroupIndex = GroupArray[0];
+
+            uint64_t mask = static_cast<uint64_t>(proc);
+            for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
+            {
+                if (mask & (KAFFINITY(1) << j))
+                    cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
+            }
+
+            return cpus;
+        }
+
+        return std::nullopt;
+    }
+
+#endif
 };
 
 class NumaReplicationContext;