BadFish/src/thread.cpp
Tomasz Sobczyk 8e560c4fd3 Replicate network weights only to used NUMA nodes
On a system with multiple NUMA nodes, this patch avoids unneeded replication
(e.g. 8x for a single-threaded run), reducing memory use in that case.

Lazy initialization forced before search.

Passed STC:
https://tests.stockfishchess.org/tests/view/66a28c524ff211be9d4ecdd4
LLR: 2.96 (-2.94,2.94) <-1.75,0.25>
Total: 691776 W: 179429 L: 179927 D: 332420
Ptnml(0-2): 2573, 79370, 182547, 78778, 2620

closes https://github.com/official-stockfish/Stockfish/pull/5515

No functional change
2024-08-03 09:41:37 +02:00

/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "thread.h"
#include <algorithm>
#include <cassert>
#include <deque>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "movegen.h"
#include "search.h"
#include "syzygy/tbprobe.h"
#include "timeman.h"
#include "types.h"
#include "uci.h"
#include "ucioption.h"
namespace Stockfish {
// Constructor launches the thread and waits until it goes to sleep
// in idle_loop(). Note that 'searching' and 'exit' should be already set.
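// The Worker itself is constructed from within the new thread via
// run_custom_job(), after the NUMA binder has run, so that its allocations
// are made from the bound node (relying on first-touch placement, which is
// the usual policy on NUMA systems).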
Thread::Thread(Search::SharedState& sharedState,
std::unique_ptr<Search::ISearchManager> sm,
size_t n,
OptionalThreadToNumaNodeBinder binder) :
idx(n),
nthreads(sharedState.options["Threads"]),
stdThread(&Thread::idle_loop, this) {
wait_for_search_finished();
run_custom_job([this, &binder, &sharedState, &sm, n]() {
// Use the binder to [maybe] bind the threads to a NUMA node before doing
// the Worker allocation. Ideally we would also allocate the SearchManager
// here, but that's minor.
this->numaAccessToken = binder();
this->worker =
std::make_unique<Search::Worker>(sharedState, std::move(sm), n, this->numaAccessToken);
});
wait_for_search_finished();
}
// Destructor wakes up the thread in idle_loop() and waits
// for its termination. Thread should be already waiting.
Thread::~Thread() {
assert(!searching);
exit = true;
start_searching();
stdThread.join();
}
// Wakes up the thread that will start the search
void Thread::start_searching() {
assert(worker != nullptr);
run_custom_job([this]() { worker->start_searching(); });
}
// Clears the histories for the thread worker (usually before a new game)
void Thread::clear_worker() {
assert(worker != nullptr);
run_custom_job([this]() { worker->clear(); });
}
// Blocks on the condition variable until the thread has finished searching
void Thread::wait_for_search_finished() {
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [&] { return !searching; });
}
// Launching a function in the thread
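// Handshake with idle_loop(): wait until any previous job has finished
// (searching == false), store the new job, raise 'searching' and notify the
// worker thread parked on the condition variable. Callers pair it with
// wait_for_search_finished(), e.g.
// th.run_custom_job([] { /* work */ }); th.wait_for_search_finished();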
void Thread::run_custom_job(std::function<void()> f) {
{
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [&] { return !searching; });
jobFunc = std::move(f);
searching = true;
}
cv.notify_one();
}
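// Forces the lazy replication of the network weights onto this thread's
// NUMA node before the search starts, so weights are only copied to nodes
// that actually run a thread (see the commit message above).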
void Thread::ensure_network_replicated() { worker->ensure_network_replicated(); }
// Thread gets parked here, blocked on the condition variable
// when the thread has no work to do.
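// This is the receiving end of run_custom_job(): the loop clears 'searching',
// notifies anyone blocked in wait_for_search_finished(), then waits until a
// new job (or the exit flag) arrives.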
void Thread::idle_loop() {
while (true)
{
std::unique_lock<std::mutex> lk(mutex);
searching = false;
cv.notify_one(); // Wake up anyone waiting for search finished
cv.wait(lk, [&] { return searching; });
if (exit)
return;
std::function<void()> job = std::move(jobFunc);
jobFunc = nullptr;
lk.unlock();
if (job)
job();
}
}
Search::SearchManager* ThreadPool::main_manager() { return main_thread()->worker->main_manager(); }
uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); }
uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); }
// Creates/destroys threads to match the requested number.
// Created and launched threads will immediately go to sleep in idle_loop.
// Upon resizing, threads are recreated to allow for binding if necessary.
void ThreadPool::set(const NumaConfig& numaConfig,
Search::SharedState sharedState,
const Search::SearchManager::UpdateContext& updateContext) {
if (threads.size() > 0) // destroy any existing thread(s)
{
main_thread()->wait_for_search_finished();
threads.clear();
boundThreadToNumaNode.clear();
}
const size_t requested = sharedState.options["Threads"];
if (requested > 0) // create new thread(s)
{
// Binding threads may be problematic when there are multiple NUMA nodes and
// multiple Stockfish instances running. In particular, if each instance
// runs a single thread then they would all be mapped to the first NUMA node.
// This is undesirable, and so the default behaviour (i.e. when the user does not
// change the NumaConfig UCI setting) is to not bind the threads to processors
// unless we know for sure that we span NUMA nodes and replication is required.
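// NumaPolicy summary: "none" never binds, "auto" binds only when the
// detected topology suggests it for the requested thread count, and any
// other value (e.g. "system" or an explicit mapping) always binds.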
const std::string numaPolicy(sharedState.options["NumaPolicy"]);
const bool doBindThreads = [&]() {
if (numaPolicy == "none")
return false;
if (numaPolicy == "auto")
return numaConfig.suggests_binding_threads(requested);
// numaPolicy == "system", or explicitly set by the user
return true;
}();
boundThreadToNumaNode = doBindThreads
? numaConfig.distribute_threads_among_numa_nodes(requested)
: std::vector<NumaIndex>{};
while (threads.size() < requested)
{
const size_t threadId = threads.size();
const NumaIndex numaId = doBindThreads ? boundThreadToNumaNode[threadId] : 0;
auto manager = threadId == 0 ? std::unique_ptr<Search::ISearchManager>(
std::make_unique<Search::SearchManager>(updateContext))
: std::make_unique<Search::NullSearchManager>();
// When not binding threads we still force all accesses to go through the
// same NUMA node (index 0), so that with NUMA-replicated memory the threads
// share a single replica and do not thrash the cache by touching different
// copies if they end up scheduled on the same node.
auto binder = doBindThreads ? OptionalThreadToNumaNodeBinder(numaConfig, numaId)
: OptionalThreadToNumaNodeBinder(numaId);
threads.emplace_back(
std::make_unique<Thread>(sharedState, std::move(manager), threadId, binder));
}
clear();
main_thread()->wait_for_search_finished();
}
}
// Sets threadPool data to initial values
void ThreadPool::clear() {
if (threads.size() == 0)
return;
for (auto&& th : threads)
th->clear_worker();
for (auto&& th : threads)
th->wait_for_search_finished();
// These two affect the time taken on the first move of a game:
main_manager()->bestPreviousAverageScore = VALUE_INFINITE;
main_manager()->previousTimeReduction = 0.85;

main_manager()->callsCnt = 0;
main_manager()->bestPreviousScore = VALUE_INFINITE;
main_manager()->originalTimeAdjust = -1;
main_manager()->tm.clear();
}
void ThreadPool::run_on_thread(size_t threadId, std::function<void()> f) {
assert(threads.size() > threadId);
threads[threadId]->run_custom_job(std::move(f));
}
void ThreadPool::wait_on_thread(size_t threadId) {
assert(threads.size() > threadId);
threads[threadId]->wait_for_search_finished();
}
size_t ThreadPool::num_threads() const { return threads.size(); }
// Wakes up main thread waiting in idle_loop() and returns immediately.
// Main thread will wake up other threads and start the search.
void ThreadPool::start_thinking(const OptionsMap& options,
Position& pos,
StateListPtr& states,
Search::LimitsType limits) {
main_thread()->wait_for_search_finished();
main_manager()->stopOnPonderhit = stop = abortedSearch = false;
main_manager()->ponder = limits.ponderMode;
increaseDepth = true;
Search::RootMoves rootMoves;
const auto legalmoves = MoveList<LEGAL>(pos);
for (const auto& uciMove : limits.searchmoves)
{
auto move = UCIEngine::to_move(pos, uciMove);
if (std::find(legalmoves.begin(), legalmoves.end(), move) != legalmoves.end())
rootMoves.emplace_back(move);
}
if (rootMoves.empty())
for (const auto& m : legalmoves)
rootMoves.emplace_back(m);
Tablebases::Config tbConfig = Tablebases::rank_root_moves(options, pos, rootMoves);
// After ownership transfer 'states' becomes empty, so if we stop the search
// and call 'go' again without setting a new position, states.get() == nullptr.
assert(states.get() || setupStates.get());
if (states.get())
setupStates = std::move(states); // Ownership transfer, states is now empty
// We use Position::set() to set root position across threads. But there are
// some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot
// be deduced from a fen string, so set() clears them and they are set from
// setupStates->back() later. The rootState is per thread, earlier states are
// shared since they are read-only.
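// The per-thread copies are made from within each worker thread
// (run_custom_job), which with bound threads should also place this data on
// the thread's own NUMA node.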
for (auto&& th : threads)
{
th->run_custom_job([&]() {
th->worker->limits = limits;
th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
th->worker->bestMoveChanges = 0;
th->worker->rootDepth = th->worker->completedDepth = 0;
th->worker->rootMoves = rootMoves;
th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
th->worker->rootState = setupStates->back();
th->worker->tbConfig = tbConfig;
});
}
for (auto&& th : threads)
th->wait_for_search_finished();
main_thread()->start_searching();
}
Thread* ThreadPool::get_best_thread() const {
Thread* bestThread = threads.front().get();
Value minScore = VALUE_NONE;
std::unordered_map<Move, int64_t, Move::MoveHash> votes(
2 * std::min(size(), bestThread->worker->rootMoves.size()));
// Find the minimum score of all threads
for (auto&& th : threads)
minScore = std::min(minScore, th->worker->rootMoves[0].score);
// Vote according to score and depth, and select the best thread
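// A thread's vote weight is its margin over the worst score (plus a small
// constant) scaled by the depth it completed, so deeper and better-scoring
// threads carry more weight.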
auto thread_voting_value = [minScore](Thread* th) {
return (th->worker->rootMoves[0].score - minScore + 14) * int(th->worker->completedDepth);
};
for (auto&& th : threads)
votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th.get());
for (auto&& th : threads)
{
const auto bestThreadScore = bestThread->worker->rootMoves[0].score;
const auto newThreadScore = th->worker->rootMoves[0].score;
const auto& bestThreadPV = bestThread->worker->rootMoves[0].pv;
const auto& newThreadPV = th->worker->rootMoves[0].pv;
const auto bestThreadMoveVote = votes[bestThreadPV[0]];
const auto newThreadMoveVote = votes[newThreadPV[0]];
const bool bestThreadInProvenWin = bestThreadScore >= VALUE_TB_WIN_IN_MAX_PLY;
const bool newThreadInProvenWin = newThreadScore >= VALUE_TB_WIN_IN_MAX_PLY;
const bool bestThreadInProvenLoss =
bestThreadScore != -VALUE_INFINITE && bestThreadScore <= VALUE_TB_LOSS_IN_MAX_PLY;
const bool newThreadInProvenLoss =
newThreadScore != -VALUE_INFINITE && newThreadScore <= VALUE_TB_LOSS_IN_MAX_PLY;
// We make sure not to pick a thread with truncated principal variation
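// (a PV of two or fewer moves counts as truncated and contributes no
// voting value in this comparison)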
const bool betterVotingValue =
thread_voting_value(th.get()) * int(newThreadPV.size() > 2)
> thread_voting_value(bestThread) * int(bestThreadPV.size() > 2);
if (bestThreadInProvenWin)
{
// Make sure we pick the shortest mate / TB conversion
if (newThreadScore > bestThreadScore)
bestThread = th.get();
}
else if (bestThreadInProvenLoss)
{
// Make sure we pick the shortest mated / TB conversion
if (newThreadInProvenLoss && newThreadScore < bestThreadScore)
bestThread = th.get();
}
else if (newThreadInProvenWin || newThreadInProvenLoss
|| (newThreadScore > VALUE_TB_LOSS_IN_MAX_PLY
&& (newThreadMoveVote > bestThreadMoveVote
|| (newThreadMoveVote == bestThreadMoveVote && betterVotingValue))))
bestThread = th.get();
}
return bestThread;
}
// Start non-main threads.
// Will be invoked by main thread after it has started searching.
void ThreadPool::start_searching() {
for (auto&& th : threads)
if (th != threads.front())
th->start_searching();
}
// Wait for non-main threads
void ThreadPool::wait_for_search_finished() const {
for (auto&& th : threads)
if (th != threads.front())
th->wait_for_search_finished();
}
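// Returns, for each NUMA node index, how many threads are bound to it.
// The result is empty when threads are not bound to NUMA nodes.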
std::vector<size_t> ThreadPool::get_bound_thread_count_by_numa_node() const {
std::vector<size_t> counts;
if (!boundThreadToNumaNode.empty())
{
NumaIndex highestNumaNode = 0;
for (NumaIndex n : boundThreadToNumaNode)
if (n > highestNumaNode)
highestNumaNode = n;
counts.resize(highestNumaNode + 1, 0);
for (NumaIndex n : boundThreadToNumaNode)
counts[n] += 1;
}
return counts;
}
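// Triggers network replication on every thread, so each NUMA node that runs
// at least one thread ends up with its own copy of the weights before the
// search starts.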
void ThreadPool::ensure_network_replicated() {
for (auto&& th : threads)
th->ensure_network_replicated();
}
} // namespace Stockfish