1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-30 16:53:09 +00:00

Handle Windows Processors Groups

Under Windows it is not possible for a process to run on more than one
logical processor group. This usually means being limited to at most 64
cores. To overcome this, a platform-specific API must be called to set
the group affinity of each thread. Original code from Texel by
Peter Österlund.

Tested by Jean-Paul Vael on a Xeon E7-8890 v4 with 88 threads and confirmed
that the speed-up between 44 and 88 threads is about 30%, as expected.

No functional change.
This commit is contained in:
Marco Costalba 2016-11-22 07:41:46 +01:00
parent 6036303bb6
commit 0d9a9f5e98
3 changed files with 120 additions and 0 deletions

View file

@ -18,10 +18,19 @@
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#ifdef _WIN32
#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0601 // Force to include newest API (Win 7 or later)
#endif
#include <windows.h> // For processor groups
#endif
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <vector>
#include "misc.h" #include "misc.h"
#include "thread.h" #include "thread.h"
@ -185,3 +194,101 @@ void prefetch(void* addr) {
} }
#endif #endif
namespace WinProcGroup {

#ifndef _WIN32

/// bindThisThread() does nothing on platforms other than Windows: processor
/// groups are a Windows-only concept.
void bindThisThread(size_t) {}

#else

namespace {

// Signatures of the Kernel32 entry points used below. They are resolved at
// runtime and called through pointers: calling them by name would create
// static imports and prevent the executable from loading at all on a Windows
// version that lacks them, making the runtime availability check useless.
typedef BOOL (WINAPI *GetLogicalProcessorInformationEx_t)(
    LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
typedef BOOL (WINAPI *GetNumaNodeProcessorMaskEx_t)(USHORT, PGROUP_AFFINITY);
typedef BOOL (WINAPI *SetThreadGroupAffinity_t)(
    HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);

/// kernel32_proc() looks up the function 'name' exported by Kernel32.dll and
/// returns it converted to the function pointer type Fn, or nullptr if either
/// the module or the export cannot be found.
template<typename Fn>
Fn kernel32_proc(const char* name) {

  // Explicit ANSI variant so that the narrow literal also builds with UNICODE
  HMODULE k32 = GetModuleHandleA("Kernel32.dll");
  if (!k32)
      return nullptr;

  // Convert through a generic function pointer type to keep compilers quiet
  // about casting between incompatible function types.
  return reinterpret_cast<Fn>(
         reinterpret_cast<void (*)()>(GetProcAddress(k32, name)));
}

} // namespace


/// get_group() retrieves logical processor information using Windows specific
/// API and returns the best group id for the thread with index idx, or -1 if
/// the information is not available or idx exceeds the number of logical
/// processors found (in which case the OS is left to decide). Original code
/// from Texel by Peter Österlund.

int get_group(size_t idx) {

  // Early exit if the needed API are not available at runtime
  const GetLogicalProcessorInformationEx_t getInfo =
      kernel32_proc<GetLogicalProcessorInformationEx_t>("GetLogicalProcessorInformationEx");

  if (   !getInfo
      || !kernel32_proc<GetNumaNodeProcessorMaskEx_t>("GetNumaNodeProcessorMaskEx")
      || !kernel32_proc<SetThreadGroupAffinity_t>("SetThreadGroupAffinity"))
      return -1;

  // First call to get returnLength. We expect it to fail due to null buffer
  DWORD returnLength = 0;
  if (getInfo(RelationAll, nullptr, &returnLength) || returnLength == 0)
      return -1;

  // Once we know returnLength, allocate the buffer
  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* buffer =
      static_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(malloc(returnLength));
  if (!buffer)
      return -1;

  // Second call, now we expect to succeed
  if (!getInfo(RelationAll, buffer, &returnLength))
  {
      free(buffer);
      return -1;
  }

  int threads = 0;
  int nodes = 0;
  int cores = 0;
  DWORD byteOffset = 0;
  const char* base = reinterpret_cast<const char*>(buffer);

  // Walk the variable-sized records. The offset is checked before a record
  // is dereferenced, so nothing past the end of the buffer is ever read.
  while (byteOffset < returnLength)
  {
      const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* rec =
          reinterpret_cast<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(base + byteOffset);

      // Stop on a malformed record: empty, or running past the buffer end
      if (rec->Size == 0 || rec->Size > returnLength - byteOffset)
          break;

      if (rec->Relationship == RelationNumaNode)
          nodes++;

      else if (rec->Relationship == RelationProcessorCore)
      {
          cores++;
          threads += (rec->Processor.Flags == LTP_PC_SMT) ? 2 : 1;
      }

      byteOffset += rec->Size;
  }

  free(buffer);

  // Without any node there is nothing to bind to (and we must not divide by
  // zero below), so let the OS decide.
  if (nodes == 0)
      return -1;

  std::vector<int> groups;
  const int coresPerNode = cores / nodes;

  // Run as many threads as possible on the same node until core limit is
  // reached, then move on filling the next node.
  for (int n = 0; n < nodes; n++)
      for (int i = 0; i < coresPerNode; i++)
          groups.push_back(n);

  // In case a core has more than one logical processor (we assume 2) and we
  // have still threads to allocate, then spread them evenly across available
  // nodes.
  for (int t = 0; t < threads - cores; t++)
      groups.push_back(t % nodes);

  // If we still have more threads than the total number of logical processors
  // then return -1 and let the OS to decide what to do.
  return idx < groups.size() ? groups[idx] : -1;
}


/// bindThisThread() sets the group affinity of the current thread, using the
/// node chosen by get_group() for thread index idx. It silently does nothing
/// when no node could be chosen or the needed API are missing.

void bindThisThread(size_t idx) {

  // Use a local variable instead of a static: slower but thread-safe
  int group = get_group(idx);

  if (group == -1)
      return;

  const GetNumaNodeProcessorMaskEx_t getNodeMask =
      kernel32_proc<GetNumaNodeProcessorMaskEx_t>("GetNumaNodeProcessorMaskEx");
  const SetThreadGroupAffinity_t setAffinity =
      kernel32_proc<SetThreadGroupAffinity_t>("SetThreadGroupAffinity");

  if (!getNodeMask || !setAffinity)
      return;

  GROUP_AFFINITY mask;
  if (getNodeMask(USHORT(group), &mask))
      setAffinity(GetCurrentThread(), &mask, nullptr);
}

#endif

} // namespace WinProcGroup

View file

@ -97,4 +97,15 @@ public:
{ return T(rand64() & rand64() & rand64()); } { return T(rand64() & rand64() & rand64()); }
}; };
/// Under Windows it is not possible for a process to run on more than one
/// logical processor group. This usually means being limited to at most 64
/// cores. To overcome this, a platform-specific API must be called to set
/// the group affinity of each thread. Original code from Texel by
/// Peter Österlund.
///
/// bindThisThread() sets the group affinity of the calling thread; idx is
/// the index of that thread. On non-Windows builds it does nothing.
namespace WinProcGroup {
void bindThisThread(size_t idx);
}
#endif // #ifndef MISC_H_INCLUDED #endif // #ifndef MISC_H_INCLUDED

View file

@ -96,6 +96,8 @@ void Thread::start_searching(bool resume) {
void Thread::idle_loop() { void Thread::idle_loop() {
WinProcGroup::bindThisThread(idx);
while (!exit) while (!exit)
{ {
std::unique_lock<Mutex> lk(mutex); std::unique_lock<Mutex> lk(mutex);