mirror of
https://github.com/sockspls/badfish
synced 2025-07-12 03:59:15 +00:00
Implement proper stop signalling from root node
The previous behavior was to wait for every node to finish its search under its own time management (TM) and then aggregate the results on the root node via a blocking MPI_Allreduce call. This seems to be problematic. This commit implements a proper non-blocking signalling barrier so that the root node's TM controls the whole cluster search, and disables TM on all non-root nodes. It also includes a cosmetic fix to the nodes/NPS display.
This commit is contained in:
parent
3b7b632aa5
commit
8a95d269eb
6 changed files with 48 additions and 6 deletions
|
@ -37,10 +37,13 @@ namespace Cluster {
|
||||||
|
|
||||||
static int world_rank = MPI_PROC_NULL;
|
static int world_rank = MPI_PROC_NULL;
|
||||||
static int world_size = 0;
|
static int world_size = 0;
|
||||||
|
static bool stop_signal = false;
|
||||||
|
static MPI_Request reqStop = MPI_REQUEST_NULL;
|
||||||
|
|
||||||
static MPI_Comm InputComm = MPI_COMM_NULL;
|
static MPI_Comm InputComm = MPI_COMM_NULL;
|
||||||
static MPI_Comm TTComm = MPI_COMM_NULL;
|
static MPI_Comm TTComm = MPI_COMM_NULL;
|
||||||
static MPI_Comm MoveComm = MPI_COMM_NULL;
|
static MPI_Comm MoveComm = MPI_COMM_NULL;
|
||||||
|
static MPI_Comm StopComm = MPI_COMM_NULL;
|
||||||
|
|
||||||
static MPI_Datatype TTEntryDatatype = MPI_DATATYPE_NULL;
|
static MPI_Datatype TTEntryDatatype = MPI_DATATYPE_NULL;
|
||||||
static std::vector<TTEntry> TTBuff;
|
static std::vector<TTEntry> TTBuff;
|
||||||
|
@ -104,6 +107,7 @@ void init() {
|
||||||
MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
|
MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
|
||||||
MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
|
MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
|
||||||
MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
|
MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
|
||||||
|
MPI_Comm_dup(MPI_COMM_WORLD, &StopComm);
|
||||||
}
|
}
|
||||||
|
|
||||||
void finalize() {
|
void finalize() {
|
||||||
|
@ -131,6 +135,32 @@ bool getline(std::istream& input, std::string& str) {
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void sync_start() {
|
||||||
|
stop_signal = false;
|
||||||
|
|
||||||
|
// Start listening to stop signal
|
||||||
|
if (!is_root())
|
||||||
|
MPI_Ibarrier(StopComm, &reqStop);
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_stop() {
|
||||||
|
if (is_root()) {
|
||||||
|
if (!stop_signal && Threads.stop) {
|
||||||
|
// Signal the cluster about stopping
|
||||||
|
stop_signal = true;
|
||||||
|
MPI_Ibarrier(StopComm, &reqStop);
|
||||||
|
MPI_Wait(&reqStop, MPI_STATUS_IGNORE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int flagStop;
|
||||||
|
// Check if we've received any stop signal
|
||||||
|
MPI_Test(&reqStop, &flagStop, MPI_STATUS_IGNORE);
|
||||||
|
if (flagStop)
|
||||||
|
Threads.stop = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int size() {
|
int size() {
|
||||||
return world_size;
|
return world_size;
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,6 +69,8 @@ inline bool is_root() { return rank() == 0; }
|
||||||
void save(Thread* thread, TTEntry* tte,
|
void save(Thread* thread, TTEntry* tte,
|
||||||
Key k, Value v, Bound b, Depth d, Move m, Value ev);
|
Key k, Value v, Bound b, Depth d, Move m, Value ev);
|
||||||
void reduce_moves(MoveInfo& mi);
|
void reduce_moves(MoveInfo& mi);
|
||||||
|
void sync_start();
|
||||||
|
void sync_stop();
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -86,6 +88,8 @@ inline void save(Thread* thread, TTEntry* tte,
|
||||||
tte->save(k, v, b, d, m, ev);
|
tte->save(k, v, b, d, m, ev);
|
||||||
}
|
}
|
||||||
inline void reduce_moves(MoveInfo&) { }
|
inline void reduce_moves(MoveInfo&) { }
|
||||||
|
// Stub in the #else branch of the USE_MPI conditional: does nothing.
inline void sync_start() { }
|
||||||
|
// Stub in the #else branch of the USE_MPI conditional: does nothing.
inline void sync_stop() { }
|
||||||
|
|
||||||
#endif /* USE_MPI */
|
#endif /* USE_MPI */
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,6 @@
|
||||||
#include "tt.h"
|
#include "tt.h"
|
||||||
#include "uci.h"
|
#include "uci.h"
|
||||||
#include "syzygy/tbprobe.h"
|
#include "syzygy/tbprobe.h"
|
||||||
#include "cluster.h"
|
|
||||||
|
|
||||||
namespace PSQT {
|
namespace PSQT {
|
||||||
void init();
|
void init();
|
||||||
|
|
|
@ -234,12 +234,15 @@ void MainThread::search() {
|
||||||
Threads.stopOnPonderhit = true;
|
Threads.stopOnPonderhit = true;
|
||||||
|
|
||||||
while (!Threads.stop && (Threads.ponder || Limits.infinite))
|
while (!Threads.stop && (Threads.ponder || Limits.infinite))
|
||||||
{} // Busy wait for a stop or a ponder reset
|
{ } // Busy wait for a stop or a ponder reset
|
||||||
|
|
||||||
// Stop the threads if not already stopped (also raise the stop if
|
// Stop the threads if not already stopped (also raise the stop if
|
||||||
// "ponderhit" just reset Threads.ponder).
|
// "ponderhit" just reset Threads.ponder).
|
||||||
Threads.stop = true;
|
Threads.stop = true;
|
||||||
|
|
||||||
|
// Finish any outstanding barriers.
|
||||||
|
Cluster::sync_stop();
|
||||||
|
|
||||||
// Wait until all threads have finished
|
// Wait until all threads have finished
|
||||||
for (Thread* th : Threads)
|
for (Thread* th : Threads)
|
||||||
if (th != this)
|
if (th != this)
|
||||||
|
@ -292,8 +295,8 @@ void MainThread::search() {
|
||||||
|
|
||||||
previousScore = static_cast<Value>(mi.score);
|
previousScore = static_cast<Value>(mi.score);
|
||||||
|
|
||||||
// Send again PV info if we have a new best thread
|
|
||||||
if (Cluster::is_root()) {
|
if (Cluster::is_root()) {
|
||||||
|
// Send again PV info if we have a new best thread
|
||||||
if (bestThread != this)
|
if (bestThread != this)
|
||||||
sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
|
sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
|
||||||
|
|
||||||
|
@ -1608,6 +1611,9 @@ void MainThread::check_time() {
|
||||||
if (Threads.ponder)
|
if (Threads.ponder)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
// Check if root has reached a stop barrier
|
||||||
|
Cluster::sync_stop();
|
||||||
|
|
||||||
if ( (Limits.use_time_management() && elapsed > Time.maximum() - 10)
|
if ( (Limits.use_time_management() && elapsed > Time.maximum() - 10)
|
||||||
|| (Limits.movetime && elapsed >= Limits.movetime)
|
|| (Limits.movetime && elapsed >= Limits.movetime)
|
||||||
|| (Limits.nodes && Threads.nodes_searched() >= (uint64_t)Limits.nodes))
|
|| (Limits.nodes && Threads.nodes_searched() >= (uint64_t)Limits.nodes))
|
||||||
|
@ -1653,8 +1659,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
|
||||||
if (!tb && i == pvIdx)
|
if (!tb && i == pvIdx)
|
||||||
ss << (v >= beta ? " lowerbound" : v <= alpha ? " upperbound" : "");
|
ss << (v >= beta ? " lowerbound" : v <= alpha ? " upperbound" : "");
|
||||||
|
|
||||||
ss << " nodes " << nodesSearched
|
ss << " nodes " << nodesSearched * Cluster::size()
|
||||||
<< " nps " << nodesSearched * 1000 / elapsed;
|
<< " nps " << nodesSearched * Cluster::size() * 1000 / elapsed;
|
||||||
|
|
||||||
if (elapsed > 1000) // Earlier makes little sense
|
if (elapsed > 1000) // Earlier makes little sense
|
||||||
ss << " hashfull " << TT.hashfull();
|
ss << " hashfull " << TT.hashfull();
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "movepick.h"
|
#include "movepick.h"
|
||||||
#include "types.h"
|
#include "types.h"
|
||||||
|
#include "cluster.h"
|
||||||
|
|
||||||
class Position;
|
class Position;
|
||||||
|
|
||||||
|
@ -89,7 +90,7 @@ struct LimitsType {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool use_time_management() const {
|
bool use_time_management() const {
|
||||||
return !(mate | movetime | depth | nodes | perft | infinite);
|
return Cluster::is_root() && !(mate | movetime | depth | nodes | perft | infinite);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Move> searchmoves;
|
std::vector<Move> searchmoves;
|
||||||
|
|
|
@ -163,6 +163,8 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
|
||||||
main()->wait_for_search_finished();
|
main()->wait_for_search_finished();
|
||||||
|
|
||||||
stopOnPonderhit = stop = false;
|
stopOnPonderhit = stop = false;
|
||||||
|
Cluster::sync_start();
|
||||||
|
|
||||||
ponder = ponderMode;
|
ponder = ponderMode;
|
||||||
Search::Limits = limits;
|
Search::Limits = limits;
|
||||||
Search::RootMoves rootMoves;
|
Search::RootMoves rootMoves;
|
||||||
|
|
Loading…
Add table
Reference in a new issue