1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-07-11 19:49:14 +00:00

MPI/Cluster implementation for Stockfish

Based on Peter Österlund's "Lazy Cluster" algorithm,
but with some simplifications.
To compile, point COMPCXX to the MPI C++ compiler wrapper (mpicxx).
This commit is contained in:
Omri Mor 2017-12-07 17:33:28 -06:00 committed by Stéphane Nicolet
parent 800031c94c
commit 29c166a072
9 changed files with 388 additions and 41 deletions

View file

@ -36,7 +36,7 @@ BINDIR = $(PREFIX)/bin
PGOBENCH = ./$(EXE) bench
### Object files
OBJS = benchmark.o bitbase.o bitboard.o endgame.o evaluate.o main.o \
OBJS = benchmark.o bitbase.o bitboard.o cluster.o endgame.o evaluate.o main.o \
material.o misc.o movegen.o movepick.o pawns.o position.o psqt.o \
search.o thread.o timeman.o tt.o uci.o ucioption.o syzygy/tbprobe.o
@ -64,6 +64,7 @@ endif
# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction
# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions
# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
# mpi = yes/no --- -DUSE_MPI --- Use Message Passing Interface
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
@ -78,6 +79,7 @@ prefetch = no
popcnt = no
sse = no
pext = no
mpi = no
### 2.2 Architecture specific
@ -354,6 +356,15 @@ ifeq ($(OS), Android)
LDFLAGS += -fPIE -pie
endif
### 3.10 MPI
ifeq ($(CXX),$(filter $(CXX),mpicxx mpic++ mpiCC))
mpi = yes
endif
ifeq ($(mpi),yes)
CXXFLAGS += -DUSE_MPI
endif
### ==========================================================================
### Section 4. Public targets
@ -472,6 +483,7 @@ config-sanity:
@echo "popcnt: '$(popcnt)'"
@echo "sse: '$(sse)'"
@echo "pext: '$(pext)'"
@echo "mpi: '$(mpi)'"
@echo ""
@echo "Flags:"
@echo "CXX: $(CXX)"

197
src/cluster.cpp Normal file
View file

@ -0,0 +1,197 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
Copyright (C) 2015-2018 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef USE_MPI
#include <array>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <istream>
#include <mpi.h>
#include <string>
#include <vector>
#include "cluster.h"
#include "thread.h"
#include "tt.h"
namespace Cluster {
// Rank of this process in MPI_COMM_WORLD and the total number of processes.
// Both hold placeholder values until init() queries MPI for the real ones.
static int world_rank = MPI_PROC_NULL;
static int world_size = 0;
// Private duplicates of MPI_COMM_WORLD created by init(), one per kind of
// traffic: InputComm is used by getline(), TTComm by save() and MoveComm by
// reduce_moves().
static MPI_Comm InputComm = MPI_COMM_NULL;
static MPI_Comm TTComm = MPI_COMM_NULL;
static MPI_Comm MoveComm = MPI_COMM_NULL;
// MPI description of a TTEntry, built and committed in init().
static MPI_Datatype TTEntryDatatype = MPI_DATATYPE_NULL;
// Receive buffer for the all-gather started in save(); init() sizes it to
// TTSendBufferSize entries per rank.
static std::vector<TTEntry> TTBuff;
// Reduction operator wrapping BestMove(), and the MPI description of the
// MoveInfo elements it operates on. Both are created in init().
static MPI_Op BestMoveOp = MPI_OP_NULL;
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
static void BestMove(void* in, void* inout, int* len, MPI_Datatype* datatype) {
if (*datatype != MIDatatype)
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
MoveInfo* l = static_cast<MoveInfo*>(in);
MoveInfo* r = static_cast<MoveInfo*>(inout);
for (int i=0; i < *len; ++i)
{
if ( l[i].depth > r[i].depth
&& (l[i].score >= r[i].score || l[i].score >= VALUE_MATE_IN_MAX_PLY))
r[i] = l[i];
}
}
void init() {
int thread_support;
constexpr std::array<int, 6> TTblocklens = {1, 1, 1, 1, 1, 1};
const std::array<MPI_Aint, 6> TTdisps = {offsetof(TTEntry, key16),
offsetof(TTEntry, move16),
offsetof(TTEntry, value16),
offsetof(TTEntry, eval16),
offsetof(TTEntry, genBound8),
offsetof(TTEntry, depth8)};
const std::array<MPI_Datatype, 6> TTtypes = {MPI_UINT16_T,
MPI_UINT16_T,
MPI_INT16_T,
MPI_INT16_T,
MPI_UINT8_T,
MPI_INT8_T};
const std::array<MPI_Aint, 3> MIdisps = {offsetof(MoveInfo, depth),
offsetof(MoveInfo, score),
offsetof(MoveInfo, rank)};
MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
if (thread_support < MPI_THREAD_MULTIPLE)
{
std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE."
<< std::endl;
std::exit(EXIT_FAILURE);
}
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
TTBuff.resize(TTSendBufferSize * world_size);
MPI_Type_create_struct(6, TTblocklens.data(), TTdisps.data(), TTtypes.data(),
&TTEntryDatatype);
MPI_Type_commit(&TTEntryDatatype);
MPI_Type_create_hindexed_block(3, 1, MIdisps.data(), MPI_INT, &MIDatatype);
MPI_Type_commit(&MIDatatype);
MPI_Op_create(BestMove, true, &BestMoveOp);
MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
}
/// Shuts MPI down. The only thing done here is the MPI_Finalize() call.
void finalize() {

  MPI_Finalize();
}
bool getline(std::istream& input, std::string& str) {
int size;
std::vector<char> vec;
bool state;
if (is_root())
{
state = static_cast<bool>(std::getline(input, str));
vec.assign(str.begin(), str.end());
size = vec.size();
}
MPI_Bcast(&size, 1, MPI_UNSIGNED_LONG, 0, InputComm);
if (!is_root())
vec.resize(size);
MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
if (!is_root())
str.assign(vec.begin(), vec.end());
MPI_Bcast(&state, 1, MPI_CXX_BOOL, 0, InputComm);
return state;
}
/// Number of processes in the job, as recorded by init().
int size() {

  return world_size;
}

/// Rank of this process within the job, as recorded by init().
int rank() {

  return world_rank;
}
/// Writes an entry into the local transposition table and queues it for
/// sharing with the other ranks.
///
/// Every thread offers the freshly saved entry to its own send buffer. In
/// addition, when called from the main search thread, this function polls the
/// outstanding all-gather; once it has completed, the entries received from
/// all ranks are stored in the local table, the per-thread buffers are merged
/// into a new send buffer, and the next non-blocking all-gather is started.
///
/// The parameter list matches the declaration in cluster.h, and TTEntry::save()
/// is called with the same six arguments used everywhere else in the engine.
void save(Thread* thread, TTEntry* tte,
          Key k, Value v, Bound b, Depth d, Move m, Value ev) {

  tte->save(k, v, b, d, m, ev);

  // Try to add to thread's send buffer
  {
      std::lock_guard<Mutex> lk(thread->ttBuffer.mutex);
      thread->ttBuffer.buffer.replace(*tte);
  }

  // Communicate on main search thread
  if (thread == Threads.main())
  {
      // Both persist across calls: the request tracks the all-gather that is
      // in flight, and the send buffer must stay untouched until it completes.
      static MPI_Request req = MPI_REQUEST_NULL;
      static TTSendBuffer<TTSendBufferSize> send_buff = {};
      int flag;

      // Test communication status
      MPI_Test(&req, &flag, MPI_STATUS_IGNORE);

      // Current communication is complete
      if (flag)
      {
          // Save all received entries
          // NOTE(review): e.key() carries only the 16 key bits stored in the
          // entry; confirm TT.probe() locates a sensible slot from such a key.
          for (auto&& e : TTBuff)
          {
              bool found;
              TTEntry* replace_tte = TT.probe(e.key(), found);
              replace_tte->save(e.key(), e.value(), e.bound(), e.depth(),
                                e.move(), e.eval());
          }

          // Reset send buffer
          send_buff = {};

          // Build up new send buffer: best 16 found across all threads
          for (auto&& th : Threads)
          {
              std::lock_guard<Mutex> lk(th->ttBuffer.mutex);
              for (auto&& e : th->ttBuffer.buffer)
                  send_buff.replace(e);
              // Reset thread's send buffer
              th->ttBuffer.buffer = {};
          }

          // Start next communication
          MPI_Iallgather(send_buff.data(), send_buff.size(), TTEntryDatatype,
                         TTBuff.data(), TTSendBufferSize, TTEntryDatatype,
                         TTComm, &req);
      }
  }
}
/// Combines the MoveInfo of every rank with the BestMoveOp reduction over
/// MoveComm. The reduction is done in place, so `mi` holds the combined
/// result on return.
void reduce_moves(MoveInfo& mi) {

  MPI_Allreduce(MPI_IN_PLACE, &mi, 1, MIDatatype, BestMoveOp, MoveComm);
}
}
#endif // USE_MPI

94
src/cluster.h Normal file
View file

@ -0,0 +1,94 @@
/*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
Copyright (C) 2015-2017 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CLUSTER_H_INCLUDED
#define CLUSTER_H_INCLUDED
#include <algorithm>
#include <array>
#include <istream>
#include <string>
#include "tt.h"
class Thread;
namespace Cluster {
// Summary of one rank's search result, exchanged through reduce_moves().
// The caller fills it by aggregate initialization in member order, so the
// order of the fields below must not change.
struct MoveInfo {
// Completed search depth of the rank's best thread
int depth;
// Score of that thread's first root move
int score;
// Rank of the process that produced this result
int rank;
};
#ifdef USE_MPI
/// Number of TT entries each rank contributes to one exchange
constexpr std::size_t TTSendBufferSize = 16;

/// Fixed-size array of N TT entries maintained with the standard heap
/// algorithms so that the shallowest entry sits at front(). replace() swaps
/// that entry out whenever a deeper one is offered. The contents are assumed
/// to already satisfy the heap property (e.g. a value-initialized buffer).
template <std::size_t N> class TTSendBuffer : public std::array<TTEntry, N> {

  // Heap ordering: an entry ranks below another when it is deeper, which
  // makes the heap's top the entry with the smallest depth.
  struct Compare {
      inline bool operator()(const TTEntry& lhs, const TTEntry& rhs) {
          return lhs.depth() > rhs.depth();
      }
  };
  Compare deeperFirst;

public:
  /// Offers `value` to the buffer. Returns true if it replaced the
  /// shallowest stored entry, false if it was not deeper than that entry.
  bool replace(const TTEntry& value) {

      if (!deeperFirst(value, this->front()))
          return false;

      // Move the shallowest entry to the back, overwrite it, then restore
      // the heap property.
      std::pop_heap(this->begin(), this->end(), deeperFirst);
      this->back() = value;
      std::push_heap(this->begin(), this->end(), deeperFirst);

      return true;
  }
};
// MPI build: these functions are implemented in cluster.cpp.

// Initializes MPI and the datatypes, operator and communicators used below.
void init();
// Shuts MPI down.
void finalize();
// Reads a line on the root rank and broadcasts it, together with the stream
// state that is returned, to every rank.
bool getline(std::istream& input, std::string& str);
// Number of processes in the job.
int size();
// Rank of this process within the job.
int rank();
// True on rank 0 only.
inline bool is_root() { return rank() == 0; }
// Saves an entry in the local transposition table and queues it for exchange
// with the other ranks.
void save(Thread* thread, TTEntry* tte,
Key k, Value v, Bound b, Depth d, Move m, Value ev);
// Reduces mi across all ranks in place.
void reduce_moves(MoveInfo& mi);
#else
// Single-process build: there is nothing to start up or shut down.
inline void init() {}
inline void finalize() {}

// Single-process build: plain std::getline(), reporting whether the read
// left the stream in a good (non-failed) state.
inline bool getline(std::istream& input, std::string& str) {
  std::getline(input, str);
  return !input.fail();
}

// Single-process build: a "cluster" of exactly one process, which is rank 0
// and therefore also the root.
constexpr int size() { return 1; }
constexpr int rank() { return 0; }
constexpr bool is_root() { return rank() == 0; }
inline void save(Thread* thread, TTEntry* tte,
Key k, Value v, Bound b, Depth d, Move m, Value ev) {
(void)thread;
tte->save(k, v, b, d, m, ev);
}
inline void reduce_moves(MoveInfo&) { }
#endif /* USE_MPI */
}
#endif // #ifndef CLUSTER_H_INCLUDED

View file

@ -27,6 +27,7 @@
#include "tt.h"
#include "uci.h"
#include "syzygy/tbprobe.h"
#include "cluster.h"
namespace PSQT {
void init();
@ -34,7 +35,9 @@ namespace PSQT {
int main(int argc, char* argv[]) {
std::cout << engine_info() << std::endl;
Cluster::init();
if (Cluster::is_root())
std::cout << engine_info() << std::endl;
UCI::init(Options);
PSQT::init();
@ -49,5 +52,6 @@ int main(int argc, char* argv[]) {
UCI::loop(argc, argv);
Threads.set(0);
Cluster::finalize();
return 0;
}

View file

@ -25,6 +25,7 @@
#include <iostream>
#include <sstream>
#include "cluster.h"
#include "evaluate.h"
#include "misc.h"
#include "movegen.h"
@ -143,7 +144,7 @@ namespace {
nodes += cnt;
pos.undo_move(m);
}
if (Root)
if (Root && Cluster::is_root())
sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
}
return nodes;
@ -199,7 +200,8 @@ void MainThread::search() {
if (Limits.perft)
{
nodes = perft<true>(rootPos, Limits.perft * ONE_PLY);
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
if (Cluster::is_root())
sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
return;
}
@ -210,9 +212,10 @@ void MainThread::search() {
if (rootMoves.empty())
{
rootMoves.emplace_back(MOVE_NONE);
sync_cout << "info depth 0 score "
<< UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
<< sync_endl;
if (Cluster::is_root())
sync_cout << "info depth 0 score "
<< UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
<< sync_endl;
}
else
{
@ -282,18 +285,25 @@ void MainThread::search() {
}
}
previousScore = bestThread->rootMoves[0].score;
Cluster::MoveInfo mi{bestThread->completedDepth,
bestThread->rootMoves[0].score,
Cluster::rank()};
Cluster::reduce_moves(mi);
previousScore = static_cast<Value>(mi.score);
// Send again PV info if we have a new best thread
if (bestThread != this)
sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
if (mi.rank == Cluster::rank()) {
if (bestThread != this)
sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
std::cout << sync_endl;
std::cout << sync_endl;
}
}
@ -358,9 +368,9 @@ void Thread::search() {
&& !(Limits.depth && mainThread && rootDepth / ONE_PLY > Limits.depth))
{
// Distribute search depths across the helper threads
if (idx > 0)
if (idx + Cluster::rank() > 0)
{
int i = (idx - 1) % 20;
int i = (idx + Cluster::rank() - 1) % 20;
if (((rootDepth / ONE_PLY + SkipPhase[i]) / SkipSize[i]) % 2)
continue; // Retry with an incremented rootDepth
}
@ -431,7 +441,8 @@ void Thread::search() {
// When failing high/low give some update (without cluttering
// the UI) before a re-search.
if ( mainThread
if ( Cluster::is_root()
&& mainThread
&& multiPV == 1
&& (bestValue <= alpha || bestValue >= beta)
&& Time.elapsed() > 3000)
@ -468,7 +479,7 @@ void Thread::search() {
// Sort the PV lines searched so far and update the GUI
std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
if ( mainThread
if ( Cluster::is_root() && mainThread
&& (Threads.stop || pvIdx + 1 == multiPV || Time.elapsed() > 3000))
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
}
@ -709,9 +720,10 @@ namespace {
if ( b == BOUND_EXACT
|| (b == BOUND_LOWER ? value >= beta : value <= alpha))
{
tte->save(posKey, value_to_tt(value, ss->ply), b,
std::min(DEPTH_MAX - ONE_PLY, depth + 6 * ONE_PLY),
MOVE_NONE, VALUE_NONE);
Cluster::save(thisThread, tte,
posKey, value_to_tt(value, ss->ply), b,
std::min(DEPTH_MAX - ONE_PLY, depth + 6 * ONE_PLY),
MOVE_NONE, VALUE_NONE);
return value;
}
@ -760,7 +772,9 @@ namespace {
else
ss->staticEval = eval = pureStaticEval = -(ss-1)->staticEval + 2 * Eval::Tempo;
tte->save(posKey, VALUE_NONE, BOUND_NONE, DEPTH_NONE, MOVE_NONE, pureStaticEval);
Cluster::save(thisThread, tte,
posKey, VALUE_NONE, BOUND_NONE, DEPTH_NONE, MOVE_NONE,
pureStaticEval);
}
// Step 7. Razoring (~2 Elo)
@ -912,7 +926,7 @@ moves_loop: // When in check, search starts from here
ss->moveCount = ++moveCount;
if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000)
if (rootNode && Cluster::is_root() && thisThread == Threads.main() && Time.elapsed() > 3000)
sync_cout << "info depth " << depth / ONE_PLY
<< " currmove " << UCI::move(move, pos.is_chess960())
<< " currmovenumber " << moveCount + thisThread->pvIdx << sync_endl;
@ -1209,10 +1223,11 @@ moves_loop: // When in check, search starts from here
bestValue = std::min(bestValue, maxValue);
if (!excludedMove)
tte->save(posKey, value_to_tt(bestValue, ss->ply),
bestValue >= beta ? BOUND_LOWER :
PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
depth, bestMove, pureStaticEval);
Cluster::save(thisThread, tte,
posKey, value_to_tt(bestValue, ss->ply),
bestValue >= beta ? BOUND_LOWER :
PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
depth, bestMove, pureStaticEval);
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
@ -1310,8 +1325,9 @@ moves_loop: // When in check, search starts from here
if (bestValue >= beta)
{
if (!ttHit)
tte->save(posKey, value_to_tt(bestValue, ss->ply), BOUND_LOWER,
DEPTH_NONE, MOVE_NONE, ss->staticEval);
Cluster::save(thisThread, tte,
posKey, value_to_tt(bestValue, ss->ply), BOUND_LOWER,
DEPTH_NONE, MOVE_NONE, ss->staticEval);
return bestValue;
}
@ -1421,10 +1437,11 @@ moves_loop: // When in check, search starts from here
if (inCheck && bestValue == -VALUE_INFINITE)
return mated_in(ss->ply); // Plies to mate from the root
tte->save(posKey, value_to_tt(bestValue, ss->ply),
bestValue >= beta ? BOUND_LOWER :
PvNode && bestValue > oldAlpha ? BOUND_EXACT : BOUND_UPPER,
ttDepth, bestMove, ss->staticEval);
Cluster::save(thisThread, tte,
posKey, value_to_tt(bestValue, ss->ply),
bestValue >= beta ? BOUND_LOWER :
PvNode && bestValue > oldAlpha ? BOUND_EXACT : BOUND_UPPER,
ttDepth, bestMove, ss->staticEval);
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);

View file

@ -29,6 +29,7 @@
#include <type_traits>
#include "../bitboard.h"
#include "../cluster.h"
#include "../movegen.h"
#include "../position.h"
#include "../search.h"
@ -1373,7 +1374,8 @@ void Tablebases::init(const std::string& paths) {
}
}
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
if (Cluster::is_root())
sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
}
// Probe the WDL table for a particular position.

View file

@ -27,6 +27,7 @@
#include <thread>
#include <vector>
#include "cluster.h"
#include "material.h"
#include "movepick.h"
#include "pawns.h"
@ -73,6 +74,13 @@ public:
CapturePieceToHistory captureHistory;
ContinuationHistory continuationHistory;
Score contempt;
#ifdef USE_MPI
struct {
Mutex mutex;
Cluster::TTSendBuffer<Cluster::TTSendBufferSize> buffer = {};
} ttBuffer;
#endif
};

View file

@ -24,6 +24,11 @@
#include "misc.h"
#include "types.h"
namespace Cluster {
void init();
}
//void Cluster::init();
/// TTEntry struct is the 10 bytes transposition table entry, defined as below:
///
/// key 16 bit
@ -36,6 +41,7 @@
struct TTEntry {
Key key() const { return (Key )(key16) << 48; }
Move move() const { return (Move )move16; }
Value value() const { return (Value)value16; }
Value eval() const { return (Value)eval16; }
@ -45,6 +51,7 @@ struct TTEntry {
private:
friend class TranspositionTable;
friend void Cluster::init();
uint16_t key16;
uint16_t move16;
@ -64,6 +71,8 @@ private:
class TranspositionTable {
friend void Cluster::init();
static constexpr int CacheLineSize = 64;
static constexpr int ClusterSize = 3;

View file

@ -24,6 +24,7 @@
#include <string>
#include "evaluate.h"
#include "cluster.h"
#include "movegen.h"
#include "position.h"
#include "search.h"
@ -97,7 +98,7 @@ namespace {
if (Options.count(name))
Options[name] = value;
else
else if (Cluster::is_root())
sync_cout << "No such option: " << name << sync_endl;
}
@ -199,7 +200,7 @@ void UCI::loop(int argc, char* argv[]) {
cmd += std::string(argv[i]) + " ";
do {
if (argc == 1 && !getline(cin, cmd)) // Block here waiting for input or EOF
if (argc == 1 && !Cluster::getline(cin, cmd)) // Block here waiting for input or EOF
cmd = "quit";
istringstream is(cmd);
@ -220,7 +221,7 @@ void UCI::loop(int argc, char* argv[]) {
else if (token == "ponderhit")
Threads.ponder = false; // Switch to normal search
else if (token == "uci")
else if (token == "uci" && Cluster::is_root())
sync_cout << "id name " << engine_info(true)
<< "\n" << Options
<< "\nuciok" << sync_endl;
@ -229,14 +230,17 @@ void UCI::loop(int argc, char* argv[]) {
else if (token == "go") go(pos, is, states);
else if (token == "position") position(pos, is, states);
else if (token == "ucinewgame") Search::clear();
else if (token == "isready") sync_cout << "readyok" << sync_endl;
else if (token == "isready" && Cluster::is_root())
sync_cout << "readyok" << sync_endl;
// Additional custom non-UCI commands, mainly for debugging
else if (token == "flip") pos.flip();
else if (token == "bench") bench(pos, is, states);
else if (token == "d") sync_cout << pos << sync_endl;
else if (token == "eval") sync_cout << Eval::trace(pos) << sync_endl;
else
else if (token == "d" && Cluster::is_root())
sync_cout << pos << sync_endl;
else if (token == "eval" && Cluster::is_root())
sync_cout << Eval::trace(pos) << sync_endl;
else if (Cluster::is_root())
sync_cout << "Unknown command: " << cmd << sync_endl;
} while (token != "quit" && argc == 1); // Command line args are one-shot