mirror of
https://github.com/sockspls/badfish
synced 2025-07-12 03:59:15 +00:00
[cluster] Improve user documentation
- add cluster info line - provides basic info on positions received/stored in a cluster run, useful to judge performance. - document most cluster functionality in the readme.md No functional change
This commit is contained in:
parent
21819b7bf8
commit
10a920d7d7
6 changed files with 153 additions and 31 deletions
24
Readme.md
24
Readme.md
|
@ -129,6 +129,30 @@ more compact than Nalimov tablebases, while still storing all information
|
||||||
needed for optimal play and in addition being able to take into account
|
needed for optimal play and in addition being able to take into account
|
||||||
the 50-move rule.
|
the 50-move rule.
|
||||||
|
|
||||||
|
## Stockfish on distributed memory systems
|
||||||
|
|
||||||
|
The cluster branch allows for running Stockfish on a cluster of servers (nodes)
|
||||||
|
that are connected with a high-speed and low-latency network, using the message
|
||||||
|
passing interface (MPI). In this case, one MPI process should be run per node,
|
||||||
|
and UCI options can be used to set the number of threads/hash per node as usual.
|
||||||
|
Typically, the engine will be invoked as
|
||||||
|
```
|
||||||
|
mpirun -np N /path/to/stockfish
|
||||||
|
```
|
||||||
|
where ```N``` stands for the number of MPI processes used. To build the cluster
|
||||||
|
branch, it is sufficient to specify ```COMPILER=mpicxx``` on the make command line,
|
||||||
|
and do a clean build:
|
||||||
|
```
|
||||||
|
make -j ARCH=x86-64-modern clean build COMPILER=mpicxx
|
||||||
|
```
|
||||||
|
If the name of the compiler wrapper (typically mpicxx, but sometimes e.g. CC) does
|
||||||
|
not match ```mpi``` an edit to the Makefile is required. Make sure that the MPI
|
||||||
|
installation is configured to support ```MPI_THREAD_MULTIPLE```, this might require
|
||||||
|
adding system specific compiler options to the Makefile. Stockfish employs
|
||||||
|
non-blocking (asynchronous) communication, and benefits from an MPI
|
||||||
|
implementation that efficiently supports this. Some MPI implentations might benefit
|
||||||
|
from leaving 1 core/thread free for these asynchronous communications, and might require
|
||||||
|
setting additional environment variables. Refer to your MPI documentation for more info.
|
||||||
|
|
||||||
## Compiling Stockfish yourself from the sources
|
## Compiling Stockfish yourself from the sources
|
||||||
|
|
||||||
|
|
125
src/cluster.cpp
125
src/cluster.cpp
|
@ -2,7 +2,7 @@
|
||||||
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
||||||
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
|
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
|
||||||
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
|
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
|
||||||
Copyright (C) 2015-2018 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
|
Copyright (C) 2015-2019 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
|
||||||
|
|
||||||
Stockfish is free software: you can redistribute it and/or modify
|
Stockfish is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -32,37 +32,46 @@
|
||||||
#include "cluster.h"
|
#include "cluster.h"
|
||||||
#include "thread.h"
|
#include "thread.h"
|
||||||
#include "tt.h"
|
#include "tt.h"
|
||||||
|
#include "timeman.h"
|
||||||
|
|
||||||
namespace Cluster {
|
namespace Cluster {
|
||||||
|
|
||||||
|
// Total number of ranks and rank within the communicator
|
||||||
static int world_rank = MPI_PROC_NULL;
|
static int world_rank = MPI_PROC_NULL;
|
||||||
static int world_size = 0;
|
static int world_size = 0;
|
||||||
|
|
||||||
|
// Signals between ranks exchange basic info using a dedicated communicator
|
||||||
|
static MPI_Comm signalsComm = MPI_COMM_NULL;
|
||||||
static MPI_Request reqSignals = MPI_REQUEST_NULL;
|
static MPI_Request reqSignals = MPI_REQUEST_NULL;
|
||||||
static uint64_t signalsCallCounter = 0;
|
static uint64_t signalsCallCounter = 0;
|
||||||
|
|
||||||
enum Signals : int { SIG_NODES = 0, SIG_STOP = 1, SIG_TB = 2, SIG_NB = 3};
|
// Signals are the number of nodes searched, stop, table base hits, transposition table saves
|
||||||
|
enum Signals : int { SIG_NODES = 0, SIG_STOP = 1, SIG_TB = 2, SIG_TTS = 3, SIG_NB = 4};
|
||||||
static uint64_t signalsSend[SIG_NB] = {};
|
static uint64_t signalsSend[SIG_NB] = {};
|
||||||
static uint64_t signalsRecv[SIG_NB] = {};
|
static uint64_t signalsRecv[SIG_NB] = {};
|
||||||
|
|
||||||
static uint64_t nodesSearchedOthers = 0;
|
static uint64_t nodesSearchedOthers = 0;
|
||||||
static uint64_t tbHitsOthers = 0;
|
static uint64_t tbHitsOthers = 0;
|
||||||
|
static uint64_t TTsavesOthers = 0;
|
||||||
static uint64_t stopSignalsPosted = 0;
|
static uint64_t stopSignalsPosted = 0;
|
||||||
|
|
||||||
|
// The UCI threads of each rank exchange use a dedicated communicator
|
||||||
static MPI_Comm InputComm = MPI_COMM_NULL;
|
static MPI_Comm InputComm = MPI_COMM_NULL;
|
||||||
static MPI_Comm TTComm = MPI_COMM_NULL;
|
|
||||||
|
// bestMove requires MoveInfo communicators and data types
|
||||||
static MPI_Comm MoveComm = MPI_COMM_NULL;
|
static MPI_Comm MoveComm = MPI_COMM_NULL;
|
||||||
static MPI_Comm signalsComm = MPI_COMM_NULL;
|
|
||||||
|
|
||||||
static std::vector<KeyedTTEntry> TTRecvBuff;
|
|
||||||
static MPI_Request reqGather = MPI_REQUEST_NULL;
|
|
||||||
static uint64_t gathersPosted = 0;
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> TTCacheCounter = {};
|
|
||||||
|
|
||||||
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
|
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
|
||||||
|
|
||||||
|
// TT entries are communicated with a dedicated communicator.
|
||||||
|
// The receive buffer is used to gather information from all ranks.
|
||||||
|
// THe TTCacheCounter tracks the number of local elements that are ready to be sent.
|
||||||
|
static MPI_Comm TTComm = MPI_COMM_NULL;
|
||||||
|
static MPI_Request reqGather = MPI_REQUEST_NULL;
|
||||||
|
static uint64_t gathersPosted = 0;
|
||||||
|
static std::vector<KeyedTTEntry> TTRecvBuff;
|
||||||
|
static std::atomic<uint64_t> TTCacheCounter = {};
|
||||||
|
|
||||||
|
/// Initialize MPI and associated data types. Note that the MPI library must be configured
|
||||||
|
/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
|
||||||
void init() {
|
void init() {
|
||||||
|
|
||||||
int thread_support;
|
int thread_support;
|
||||||
|
@ -91,10 +100,9 @@ void init() {
|
||||||
MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
|
MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Finalize MPI and free the associated data types.
|
||||||
void finalize() {
|
void finalize() {
|
||||||
|
|
||||||
|
|
||||||
// free data tyes and communicators
|
|
||||||
MPI_Type_free(&MIDatatype);
|
MPI_Type_free(&MIDatatype);
|
||||||
|
|
||||||
MPI_Comm_free(&InputComm);
|
MPI_Comm_free(&InputComm);
|
||||||
|
@ -105,16 +113,19 @@ void finalize() {
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the total number of ranks
|
||||||
int size() {
|
int size() {
|
||||||
|
|
||||||
return world_size;
|
return world_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the rank (index) of the process
|
||||||
int rank() {
|
int rank() {
|
||||||
|
|
||||||
return world_rank;
|
return world_rank;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The receive buffer depends on the number of MPI ranks and threads, resize as needed
|
||||||
void ttRecvBuff_resize(size_t nThreads) {
|
void ttRecvBuff_resize(size_t nThreads) {
|
||||||
|
|
||||||
TTRecvBuff.resize(TTCacheSize * world_size * nThreads);
|
TTRecvBuff.resize(TTCacheSize * world_size * nThreads);
|
||||||
|
@ -122,7 +133,10 @@ void ttRecvBuff_resize(size_t nThreads) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
|
||||||
|
/// to the UCI threads of all ranks, in order to setup the position, etc. We do this with a
|
||||||
|
/// dedicated getline implementation, where the root broadcasts to all other ranks the received
|
||||||
|
/// information.
|
||||||
bool getline(std::istream& input, std::string& str) {
|
bool getline(std::istream& input, std::string& str) {
|
||||||
|
|
||||||
int size;
|
int size;
|
||||||
|
@ -136,7 +150,8 @@ bool getline(std::istream& input, std::string& str) {
|
||||||
size = vec.size();
|
size = vec.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Some MPI implementations use busy-wait polling, while we need yielding
|
// Some MPI implementations use busy-wait polling, while we need yielding as otherwise
|
||||||
|
// the UCI thread on the non-root ranks would be consuming resources.
|
||||||
static MPI_Request reqInput = MPI_REQUEST_NULL;
|
static MPI_Request reqInput = MPI_REQUEST_NULL;
|
||||||
MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
|
MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
|
||||||
if (is_root())
|
if (is_root())
|
||||||
|
@ -154,6 +169,7 @@ bool getline(std::istream& input, std::string& str) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Broadcast received string
|
||||||
if (!is_root())
|
if (!is_root())
|
||||||
vec.resize(size);
|
vec.resize(size);
|
||||||
MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
|
MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
|
||||||
|
@ -164,6 +180,7 @@ bool getline(std::istream& input, std::string& str) {
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Sending part of the signal communication loop
|
||||||
void signals_send() {
|
void signals_send() {
|
||||||
|
|
||||||
signalsSend[SIG_NODES] = Threads.nodes_searched();
|
signalsSend[SIG_NODES] = Threads.nodes_searched();
|
||||||
|
@ -174,23 +191,33 @@ void signals_send() {
|
||||||
++signalsCallCounter;
|
++signalsCallCounter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Processing part of the signal communication loop.
|
||||||
|
/// For some counters (e.g. nodes) we only keep their sum on the other nodes
|
||||||
|
/// allowing to add local counters at any time for more fine grained process,
|
||||||
|
/// which is useful to indicate progress during early iterations, and to have
|
||||||
|
/// node counts that exactly match the non-MPI code in the single rank case.
|
||||||
|
/// This call also propagates the stop signal between ranks.
|
||||||
void signals_process() {
|
void signals_process() {
|
||||||
|
|
||||||
nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
|
nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
|
||||||
tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
|
tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
|
||||||
|
TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
|
||||||
stopSignalsPosted = signalsRecv[SIG_STOP];
|
stopSignalsPosted = signalsRecv[SIG_STOP];
|
||||||
if (signalsRecv[SIG_STOP] > 0)
|
if (signalsRecv[SIG_STOP] > 0)
|
||||||
Threads.stop = true;
|
Threads.stop = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// During search, most message passing is asynchronous, but at the end of
|
||||||
|
/// search it makes sense to bring them to a common, finalized state.
|
||||||
void signals_sync() {
|
void signals_sync() {
|
||||||
|
|
||||||
while(stopSignalsPosted < uint64_t(size()))
|
while(stopSignalsPosted < uint64_t(size()))
|
||||||
signals_poll();
|
signals_poll();
|
||||||
|
|
||||||
// finalize outstanding messages of the signal loops. We might have issued one call less than needed on some ranks.
|
// Finalize outstanding messages of the signal loops.
|
||||||
|
// We might have issued one call less than needed on some ranks.
|
||||||
uint64_t globalCounter;
|
uint64_t globalCounter;
|
||||||
MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm); // MoveComm needed
|
MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
|
||||||
if (signalsCallCounter < globalCounter)
|
if (signalsCallCounter < globalCounter)
|
||||||
{
|
{
|
||||||
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
|
MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
|
||||||
|
@ -201,7 +228,7 @@ void signals_sync() {
|
||||||
|
|
||||||
signals_process();
|
signals_process();
|
||||||
|
|
||||||
// finalize outstanding messages in the gather loop
|
// Finalize outstanding messages in the gather loop
|
||||||
MPI_Allreduce(&gathersPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
|
MPI_Allreduce(&gathersPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
|
||||||
if (gathersPosted < globalCounter)
|
if (gathersPosted < globalCounter)
|
||||||
{
|
{
|
||||||
|
@ -217,16 +244,19 @@ void signals_sync() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Initialize signal counters to zero.
|
||||||
void signals_init() {
|
void signals_init() {
|
||||||
|
|
||||||
stopSignalsPosted = tbHitsOthers = nodesSearchedOthers = 0;
|
stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
|
||||||
|
|
||||||
signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
|
signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
|
||||||
signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
|
signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
|
||||||
|
signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
|
||||||
signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
|
signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Poll the signal loop, and start next round as needed.
|
||||||
void signals_poll() {
|
void signals_poll() {
|
||||||
|
|
||||||
int flag;
|
int flag;
|
||||||
|
@ -238,14 +268,39 @@ void signals_poll() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Provide basic info related the cluster performance, in particular, the number of signals send,
|
||||||
|
/// signals per sounds (sps), the number of gathers, the number of positions gathered (per node and per second, gpps)
|
||||||
|
/// The number of TT saves and TT saves per second. If gpps equals approximately TTSavesps the gather loop has enough bandwidth.
|
||||||
|
void cluster_info(Depth depth) {
|
||||||
|
|
||||||
|
TimePoint elapsed = Time.elapsed() + 1;
|
||||||
|
uint64_t TTSaves = TT_saves();
|
||||||
|
|
||||||
|
sync_cout << "info depth " << depth / ONE_PLY << " cluster "
|
||||||
|
<< " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
|
||||||
|
<< " gathers " << gathersPosted << " gpps " << TTRecvBuff.size() * gathersPosted * 1000 / elapsed
|
||||||
|
<< " TTSaves " << TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed
|
||||||
|
<< sync_endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
|
||||||
|
/// If sufficient entries has been collected, a communication is initiated.
|
||||||
|
/// If a communication has been completed, the received results are saved to the TT.
|
||||||
void save(Thread* thread, TTEntry* tte,
|
void save(Thread* thread, TTEntry* tte,
|
||||||
Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) {
|
Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) {
|
||||||
|
|
||||||
|
// Standard save to the TT
|
||||||
tte->save(k, v, PvHit, b, d, m, ev);
|
tte->save(k, v, PvHit, b, d, m, ev);
|
||||||
|
|
||||||
|
// If the entry is of sufficient depth to be worth communicating, take action.
|
||||||
if (d > 3 * ONE_PLY)
|
if (d > 3 * ONE_PLY)
|
||||||
{
|
{
|
||||||
// Try to add to thread's send buffer
|
// count the TTsaves to information: this should be relatively similar
|
||||||
|
// to the number of entries we can send/recv.
|
||||||
|
thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
|
||||||
|
|
||||||
|
// Add to thread's send buffer, the locking here avoids races when the master thread
|
||||||
|
// prepares the send buffer.
|
||||||
{
|
{
|
||||||
std::lock_guard<Mutex> lk(thread->ttCache.mutex);
|
std::lock_guard<Mutex> lk(thread->ttCache.mutex);
|
||||||
thread->ttCache.buffer.replace(KeyedTTEntry(k,*tte));
|
thread->ttCache.buffer.replace(KeyedTTEntry(k,*tte));
|
||||||
|
@ -254,7 +309,8 @@ void save(Thread* thread, TTEntry* tte,
|
||||||
|
|
||||||
size_t recvBuffPerRankSize = Threads.size() * TTCacheSize;
|
size_t recvBuffPerRankSize = Threads.size() * TTCacheSize;
|
||||||
|
|
||||||
// Communicate on main search thread
|
// Communicate on main search thread, as soon the threads combined have collected
|
||||||
|
// sufficient data to fill the send buffers.
|
||||||
if (thread == Threads.main() && TTCacheCounter > size() * recvBuffPerRankSize)
|
if (thread == Threads.main() && TTCacheCounter > size() * recvBuffPerRankSize)
|
||||||
{
|
{
|
||||||
// Test communication status
|
// Test communication status
|
||||||
|
@ -267,7 +323,7 @@ void save(Thread* thread, TTEntry* tte,
|
||||||
// Save all received entries to TT, and store our TTCaches, ready for the next round of communication
|
// Save all received entries to TT, and store our TTCaches, ready for the next round of communication
|
||||||
for (size_t irank = 0; irank < size_t(size()) ; ++irank)
|
for (size_t irank = 0; irank < size_t(size()) ; ++irank)
|
||||||
{
|
{
|
||||||
if (irank == size_t(rank()))
|
if (irank == size_t(rank())) // this is our part, fill the part of the buffer for sending
|
||||||
{
|
{
|
||||||
// Copy from the thread caches to the right spot in the buffer
|
// Copy from the thread caches to the right spot in the buffer
|
||||||
size_t i = irank * recvBuffPerRankSize;
|
size_t i = irank * recvBuffPerRankSize;
|
||||||
|
@ -284,7 +340,7 @@ void save(Thread* thread, TTEntry* tte,
|
||||||
|
|
||||||
TTCacheCounter = 0;
|
TTCacheCounter = 0;
|
||||||
}
|
}
|
||||||
else
|
else // process data received from the corresponding rank.
|
||||||
for (size_t i = irank * recvBuffPerRankSize; i < (irank + 1) * recvBuffPerRankSize; ++i)
|
for (size_t i = irank * recvBuffPerRankSize; i < (irank + 1) * recvBuffPerRankSize; ++i)
|
||||||
{
|
{
|
||||||
auto&& e = TTRecvBuff[i];
|
auto&& e = TTRecvBuff[i];
|
||||||
|
@ -302,7 +358,7 @@ void save(Thread* thread, TTEntry* tte,
|
||||||
TTComm, &reqGather);
|
TTComm, &reqGather);
|
||||||
++gathersPosted;
|
++gathersPosted;
|
||||||
|
|
||||||
// Force check of time on the next occasion.
|
// Force check of time on the next occasion, the above actions might have taken some time.
|
||||||
static_cast<MainThread*>(thread)->callsCnt = 0;
|
static_cast<MainThread*>(thread)->callsCnt = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -310,8 +366,9 @@ void save(Thread* thread, TTEntry* tte,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Picks the bestMove across ranks, and send the associated info and PV to the root of the cluster.
|
||||||
// TODO update to the scheme in master.. can this use aggregation of votes?
|
/// Note that this bestMove and PV must be output by the root, the guarantee proper ordering of output.
|
||||||
|
/// TODO update to the scheme in master.. can this use aggregation of votes?
|
||||||
void pick_moves(MoveInfo& mi, std::string& PVLine) {
|
void pick_moves(MoveInfo& mi, std::string& PVLine) {
|
||||||
|
|
||||||
MoveInfo* pMoveInfo = NULL;
|
MoveInfo* pMoveInfo = NULL;
|
||||||
|
@ -369,16 +426,25 @@ void pick_moves(MoveInfo& mi, std::string& PVLine) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return nodes searched (lazily updated cluster wide in the signal loop)
|
||||||
uint64_t nodes_searched() {
|
uint64_t nodes_searched() {
|
||||||
|
|
||||||
return nodesSearchedOthers + Threads.nodes_searched();
|
return nodesSearchedOthers + Threads.nodes_searched();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return table base hits (lazily updated cluster wide in the signal loop)
|
||||||
uint64_t tb_hits() {
|
uint64_t tb_hits() {
|
||||||
|
|
||||||
return tbHitsOthers + Threads.tb_hits();
|
return tbHitsOthers + Threads.tb_hits();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the number of saves to the TT buffers, (lazily updated cluster wide in the signal loop)
|
||||||
|
uint64_t TT_saves() {
|
||||||
|
|
||||||
|
return TTsavesOthers + Threads.TT_saves();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -398,6 +464,11 @@ uint64_t tb_hits() {
|
||||||
return Threads.tb_hits();
|
return Threads.tb_hits();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t TT_saves() {
|
||||||
|
|
||||||
|
return Threads.TT_saves();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // USE_MPI
|
#endif // USE_MPI
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
|
||||||
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
|
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
|
||||||
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
|
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
|
||||||
Copyright (C) 2015-2017 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
|
Copyright (C) 2015-2019 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
|
||||||
|
|
||||||
Stockfish is free software: you can redistribute it and/or modify
|
Stockfish is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -30,8 +30,19 @@
|
||||||
|
|
||||||
class Thread;
|
class Thread;
|
||||||
|
|
||||||
|
/// The Cluster namespace contains functionality required to run on distributed
|
||||||
|
/// memory architectures using MPI as the message passing interface. On a high level,
|
||||||
|
/// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
|
||||||
|
/// collected on each rank and distributed to, and used by, all other ranks,
|
||||||
|
/// which search essentially independently. The root (MPI rank 0) of the cluster
|
||||||
|
/// is responsible for all I/O and time management, communicating this info to
|
||||||
|
/// the other ranks as needed. UCI options such as Threads and Hash specify these
|
||||||
|
/// quantities per MPI rank. It is recommended to have one rank (MPI process) per node.
|
||||||
|
/// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
|
||||||
|
|
||||||
namespace Cluster {
|
namespace Cluster {
|
||||||
|
|
||||||
|
/// Basic info to find the cluster-wide bestMove
|
||||||
struct MoveInfo {
|
struct MoveInfo {
|
||||||
int move;
|
int move;
|
||||||
int ponder;
|
int ponder;
|
||||||
|
@ -41,9 +52,12 @@ struct MoveInfo {
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef USE_MPI
|
#ifdef USE_MPI
|
||||||
using KeyedTTEntry = std::pair<Key, TTEntry>;
|
|
||||||
|
|
||||||
|
// store the TTEntry with its full key, so it can be saved on the receiver side
|
||||||
|
using KeyedTTEntry = std::pair<Key, TTEntry>;
|
||||||
constexpr std::size_t TTCacheSize = 16;
|
constexpr std::size_t TTCacheSize = 16;
|
||||||
|
|
||||||
|
// Threads locally cache their high-depth TT entries till a batch can be send by MPI
|
||||||
template <std::size_t N> class TTCache : public std::array<KeyedTTEntry, N> {
|
template <std::size_t N> class TTCache : public std::array<KeyedTTEntry, N> {
|
||||||
|
|
||||||
struct Compare {
|
struct Compare {
|
||||||
|
@ -54,6 +68,8 @@ template <std::size_t N> class TTCache : public std::array<KeyedTTEntry, N> {
|
||||||
Compare compare;
|
Compare compare;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
// Keep a heap of entries replacing low depth with high depth entries
|
||||||
bool replace(const KeyedTTEntry& value) {
|
bool replace(const KeyedTTEntry& value) {
|
||||||
|
|
||||||
if (compare(value, this->front()))
|
if (compare(value, this->front()))
|
||||||
|
@ -78,6 +94,8 @@ void pick_moves(MoveInfo& mi, std::string& PVLine);
|
||||||
void ttRecvBuff_resize(size_t nThreads);
|
void ttRecvBuff_resize(size_t nThreads);
|
||||||
uint64_t nodes_searched();
|
uint64_t nodes_searched();
|
||||||
uint64_t tb_hits();
|
uint64_t tb_hits();
|
||||||
|
uint64_t TT_saves();
|
||||||
|
void cluster_info(Depth depth);
|
||||||
void signals_init();
|
void signals_init();
|
||||||
void signals_poll();
|
void signals_poll();
|
||||||
void signals_sync();
|
void signals_sync();
|
||||||
|
@ -95,6 +113,8 @@ inline void pick_moves(MoveInfo&, std::string&) { }
|
||||||
inline void ttRecvBuff_resize(size_t) { }
|
inline void ttRecvBuff_resize(size_t) { }
|
||||||
uint64_t nodes_searched();
|
uint64_t nodes_searched();
|
||||||
uint64_t tb_hits();
|
uint64_t tb_hits();
|
||||||
|
uint64_t TT_saves();
|
||||||
|
inline void cluster_info(Depth) { }
|
||||||
inline void signals_init() { }
|
inline void signals_init() { }
|
||||||
inline void signals_poll() { }
|
inline void signals_poll() { }
|
||||||
inline void signals_sync() { }
|
inline void signals_sync() { }
|
||||||
|
|
|
@ -466,7 +466,10 @@ void Thread::search() {
|
||||||
&& multiPV == 1
|
&& multiPV == 1
|
||||||
&& (bestValue <= alpha || bestValue >= beta)
|
&& (bestValue <= alpha || bestValue >= beta)
|
||||||
&& Time.elapsed() > 3000)
|
&& Time.elapsed() > 3000)
|
||||||
|
{
|
||||||
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
|
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
|
||||||
|
Cluster::cluster_info(rootDepth);
|
||||||
|
}
|
||||||
|
|
||||||
// In case of failing low/high increase aspiration window and
|
// In case of failing low/high increase aspiration window and
|
||||||
// re-search, otherwise exit the loop.
|
// re-search, otherwise exit the loop.
|
||||||
|
@ -501,7 +504,10 @@ void Thread::search() {
|
||||||
|
|
||||||
if ( Cluster::is_root() && mainThread
|
if ( Cluster::is_root() && mainThread
|
||||||
&& (Threads.stop || pvIdx + 1 == multiPV || Time.elapsed() > 3000))
|
&& (Threads.stop || pvIdx + 1 == multiPV || Time.elapsed() > 3000))
|
||||||
|
{
|
||||||
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
|
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
|
||||||
|
Cluster::cluster_info(rootDepth);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!Threads.stop)
|
if (!Threads.stop)
|
||||||
|
|
|
@ -195,7 +195,7 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
|
||||||
|
|
||||||
for (Thread* th : *this)
|
for (Thread* th : *this)
|
||||||
{
|
{
|
||||||
th->nodes = th->tbHits = th->nmpMinPly = 0;
|
th->nodes = th->tbHits = th->TTsaves = th->nmpMinPly = 0;
|
||||||
th->rootDepth = th->completedDepth = DEPTH_ZERO;
|
th->rootDepth = th->completedDepth = DEPTH_ZERO;
|
||||||
th->rootMoves = rootMoves;
|
th->rootMoves = rootMoves;
|
||||||
th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th);
|
th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th);
|
||||||
|
|
|
@ -64,7 +64,7 @@ public:
|
||||||
size_t pvIdx, pvLast;
|
size_t pvIdx, pvLast;
|
||||||
int selDepth, nmpMinPly;
|
int selDepth, nmpMinPly;
|
||||||
Color nmpColor;
|
Color nmpColor;
|
||||||
std::atomic<uint64_t> nodes, tbHits;
|
std::atomic<uint64_t> nodes, tbHits, TTsaves;
|
||||||
|
|
||||||
Position rootPos;
|
Position rootPos;
|
||||||
Search::RootMoves rootMoves;
|
Search::RootMoves rootMoves;
|
||||||
|
@ -112,6 +112,7 @@ struct ThreadPool : public std::vector<Thread*> {
|
||||||
MainThread* main() const { return static_cast<MainThread*>(front()); }
|
MainThread* main() const { return static_cast<MainThread*>(front()); }
|
||||||
uint64_t nodes_searched() const { return accumulate(&Thread::nodes); }
|
uint64_t nodes_searched() const { return accumulate(&Thread::nodes); }
|
||||||
uint64_t tb_hits() const { return accumulate(&Thread::tbHits); }
|
uint64_t tb_hits() const { return accumulate(&Thread::tbHits); }
|
||||||
|
uint64_t TT_saves() const { return accumulate(&Thread::TTsaves); }
|
||||||
|
|
||||||
std::atomic_bool stop, ponder, stopOnPonderhit;
|
std::atomic_bool stop, ponder, stopOnPonderhit;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue