From c8213ba0d047569141ed58f5eb86579d976b5614 Mon Sep 17 00:00:00 2001 From: Dubslow Date: Mon, 10 Jun 2024 18:03:36 -0500 Subject: [PATCH] Simplify TT interface and avoid changing TT info This commit builds on the work and ideas of #5345, #5348, and #5364. Place as much as possible of the TT implementation in tt.cpp, rather than in the header. Some commentary is added to better document the public interface. Fix the search read-TT races, or at least contain them to within TT methods only. Passed SMP STC: https://tests.stockfishchess.org/tests/view/666134ab91e372763104b443 LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 512552 W: 132387 L: 132676 D: 247489 Ptnml(0-2): 469, 58429, 138771, 58136, 471 The unmerged version has bench identical to the other PR (see also #5348) and therefore those same-functionality tests: SMP LTC: https://tests.stockfishchess.org/tests/view/665c7021fd45fb0f907c214a SMP LTC: https://tests.stockfishchess.org/tests/view/665d28a7fd45fb0f907c5495 closes https://github.com/official-stockfish/Stockfish/pull/5369 bench 1205675 --- src/search.cpp | 199 +++++++++++++++++++++--------------------- src/tt.cpp | 148 +++++++++++++++++++++++++------ src/tt.h | 119 ++++++++++--------------- tests/instrumented.sh | 7 +- 4 files changed, 265 insertions(+), 208 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 3dbdfd47..9c3f915d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -546,16 +546,15 @@ Value Search::Worker::search( StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); - TTEntry* tte; - Key posKey; - Move ttMove, move, excludedMove, bestMove; - Depth extension, newDepth; - Value bestValue, value, ttValue, eval, maxValue, probCutBeta, singularValue; - bool givesCheck, improving, priorCapture, opponentWorsening; - bool capture, moveCountPruning, ttCapture; - Piece movedPiece; - int moveCount, captureCount, quietCount; - Bound singularBound; + Key posKey; + Move move, excludedMove, bestMove; + Depth extension, newDepth; + Value 
bestValue, value, eval, maxValue, probCutBeta, singularValue; + bool givesCheck, improving, priorCapture, opponentWorsening; + bool capture, moveCountPruning, ttCapture; + Piece movedPiece; + int moveCount, captureCount, quietCount; + Bound singularBound; // Step 1. Initialize node Worker* thisThread = this; @@ -605,31 +604,32 @@ Value Search::Worker::search( ss->statScore = 0; // Step 4. Transposition table lookup. - excludedMove = ss->excludedMove; - posKey = pos.key(); - tte = tt.probe(posKey, ss->ttHit); - ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; - ttMove = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0] - : ss->ttHit ? tte->move() - : Move::none(); - ttCapture = ttMove && pos.capture_stage(ttMove); + excludedMove = ss->excludedMove; + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0] + : ttHit ? ttData.move + : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + ss->ttPv = excludedMove ? ss->ttPv : PvNode || (ttHit && ttData.is_pv); + ttCapture = ttData.move && pos.capture_stage(ttData.move); // At this point, if excluded, skip straight to step 6, static eval. However, // to save indentation, we list the condition in all code between here and there. - if (!excludedMove) - ss->ttPv = PvNode || (ss->ttHit && tte->is_pv()); // At non-PV nodes we check for an early TT cutoff - if (!PvNode && !excludedMove && tte->depth() > depth - (ttValue <= beta) - && ttValue != VALUE_NONE // Possible in case of TT access race or if !ttHit - && (tte->bound() & (ttValue >= beta ? 
BOUND_LOWER : BOUND_UPPER))) + if (!PvNode && !excludedMove && ttData.depth > depth - (ttData.value <= beta) + && ttData.value != VALUE_NONE // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER))) { // If ttMove is quiet, update move sorting heuristics on TT hit (~2 Elo) - if (ttMove && ttValue >= beta) + if (ttData.move && ttData.value >= beta) { // Bonus for a quiet ttMove that fails high (~2 Elo) if (!ttCapture) - update_quiet_stats(pos, ss, *this, ttMove, stat_bonus(depth)); + update_quiet_stats(pos, ss, *this, ttData.move, stat_bonus(depth)); // Extra penalty for early quiet moves of // the previous ply (~1 Elo on STC, ~2 Elo on LTC) @@ -641,7 +641,7 @@ Value Search::Worker::search( // Partial workaround for the graph history interaction problem // For high rule50 counts don't produce transposition table cutoffs. if (pos.rule50_count() < 90) - return ttValue; + return ttData.value; } // Step 5. Tablebases probe @@ -679,9 +679,9 @@ Value Search::Worker::search( if (b == BOUND_EXACT || (b == BOUND_LOWER ? 
value >= beta : value <= alpha)) { - tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b, - std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE, - tt.generation()); + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, b, + std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE, + tt.generation()); return value; } @@ -716,7 +716,7 @@ Value Search::Worker::search( else if (ss->ttHit) { // Never assume anything about values stored in TT - unadjustedStaticEval = tte->eval(); + unadjustedStaticEval = ttData.eval; if (unadjustedStaticEval == VALUE_NONE) unadjustedStaticEval = evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); @@ -726,8 +726,9 @@ Value Search::Worker::search( ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // ttValue can be used as a better position evaluation (~7 Elo) - if (ttValue != VALUE_NONE && (tte->bound() & (ttValue > eval ? BOUND_LOWER : BOUND_UPPER))) - eval = ttValue; + if (ttData.value != VALUE_NONE + && (ttData.bound & (ttData.value > eval ? 
BOUND_LOWER : BOUND_UPPER))) + eval = ttData.value; } else { @@ -736,8 +737,8 @@ Value Search::Worker::search( ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // Static evaluation is saved as it was before adjustment by correction history - tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_UNSEARCHED, Move::none(), - unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_UNSEARCHED, Move::none(), + unadjustedStaticEval, tt.generation()); } // Use static evaluation difference to improve quiet move ordering (~9 Elo) @@ -778,7 +779,7 @@ Value Search::Worker::search( && eval - futility_margin(depth, cutNode && !ss->ttHit, improving, opponentWorsening) - (ss - 1)->statScore / 263 >= beta - && eval >= beta && eval < VALUE_TB_WIN_IN_MAX_PLY && (!ttMove || ttCapture)) + && eval >= beta && eval < VALUE_TB_WIN_IN_MAX_PLY && (!ttData.move || ttCapture)) return beta > VALUE_TB_LOSS_IN_MAX_PLY ? beta + (eval - beta) / 3 : eval; // Step 9. Null move search with verification search (~35 Elo) @@ -824,7 +825,7 @@ Value Search::Worker::search( // Step 10. Internal iterative reductions (~9 Elo) // For PV nodes without a ttMove, we decrease depth by 3. - if (PvNode && !ttMove) + if (PvNode && !ttData.move) depth -= 3; // Use qsearch if depth <= 0. @@ -833,8 +834,8 @@ Value Search::Worker::search( // For cutNodes, if depth is high enough, decrease depth by 2 if there is no ttMove, or // by 1 if there is a ttMove with an upper bound. - if (cutNode && depth >= 8 && (!ttMove || tte->bound() == BOUND_UPPER)) - depth -= 1 + !ttMove; + if (cutNode && depth >= 8 && (!ttData.move || ttData.bound == BOUND_UPPER)) + depth -= 1 + !ttData.move; // Step 11. 
ProbCut (~10 Elo) // If we have a good enough capture (or queen promotion) and a reduced search returns a value @@ -847,11 +848,11 @@ Value Search::Worker::search( // there and in further interactions with transposition table cutoff depth is set to depth - 3 // because probCut search has depth set to depth - 4 but we also do a move before it // So effective depth is equal to depth - 3 - && !(tte->depth() >= depth - 3 && ttValue != VALUE_NONE && ttValue < probCutBeta)) + && !(ttData.depth >= depth - 3 && ttData.value != VALUE_NONE && ttData.value < probCutBeta)) { assert(probCutBeta < VALUE_INFINITE && probCutBeta > beta); - MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &thisThread->captureHistory); + MovePicker mp(pos, ttData.move, probCutBeta - ss->staticEval, &thisThread->captureHistory); while ((move = mp.next_move()) != Move::none()) if (move != excludedMove && pos.legal(move)) @@ -882,8 +883,8 @@ Value Search::Worker::search( if (value >= probCutBeta) { // Save ProbCut data into transposition table - tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, depth - 3, - move, unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, + depth - 3, move, unadjustedStaticEval, tt.generation()); return std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY ? value - (probCutBeta - beta) : value; } @@ -896,9 +897,10 @@ moves_loop: // When in check, search starts here // Step 12. 
A small Probcut idea, when we are in check (~4 Elo) probCutBeta = beta + 388; - if (ss->inCheck && !PvNode && ttCapture && (tte->bound() & BOUND_LOWER) - && tte->depth() >= depth - 4 && ttValue >= probCutBeta - && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY) + if (ss->inCheck && !PvNode && ttCapture && (ttData.bound & BOUND_LOWER) + && ttData.depth >= depth - 4 && ttData.value >= probCutBeta + && std::abs(ttData.value) < VALUE_TB_WIN_IN_MAX_PLY + && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY) return probCutBeta; const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory, @@ -911,7 +913,7 @@ moves_loop: // When in check, search starts here Move countermove = prevSq != SQ_NONE ? thisThread->counterMoves[pos.piece_on(prevSq)][prevSq] : Move::none(); - MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory, &thisThread->captureHistory, + MovePicker mp(pos, ttData.move, depth, &thisThread->mainHistory, &thisThread->captureHistory, contHist, &thisThread->pawnHistory, countermove, ss->killers); value = bestValue; @@ -1046,12 +1048,12 @@ moves_loop: // When in check, search starts here // Generally, higher singularBeta (i.e closer to ttValue) and lower extension // margins scale well. 
- if (!rootNode && move == ttMove && !excludedMove + if (!rootNode && move == ttData.move && !excludedMove && depth >= 4 - (thisThread->completedDepth > 35) + ss->ttPv - && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && (tte->bound() & BOUND_LOWER) - && tte->depth() >= depth - 3) + && std::abs(ttData.value) < VALUE_TB_WIN_IN_MAX_PLY && (ttData.bound & BOUND_LOWER) + && ttData.depth >= depth - 3) { - Value singularBeta = ttValue - (52 + 80 * (ss->ttPv && !PvNode)) * depth / 64; + Value singularBeta = ttData.value - (52 + 80 * (ss->ttPv && !PvNode)) * depth / 64; Depth singularDepth = newDepth / 2; ss->excludedMove = move; @@ -1086,7 +1088,7 @@ moves_loop: // When in check, search starts here // so we reduce the ttMove in favor of other moves based on some conditions: // If the ttMove is assumed to fail high over current beta (~7 Elo) - else if (ttValue >= beta) + else if (ttData.value >= beta) extension = -3; // If we are on a cutNode but the ttMove is not assumed to fail high over current beta (~1 Elo) @@ -1126,7 +1128,7 @@ moves_loop: // When in check, search starts here // Decrease reduction if position is or has been on the PV (~7 Elo) if (ss->ttPv) - r -= 1 + (ttValue > alpha) + (tte->depth() >= depth); + r -= 1 + (ttData.value > alpha) + (ttData.depth >= depth); // Decrease reduction for PvNodes (~0 Elo on STC, ~2 Elo on LTC) if (PvNode) @@ -1136,8 +1138,8 @@ moves_loop: // When in check, search starts here // Increase reduction for cut nodes (~4 Elo) if (cutNode) - r += 2 - (tte->depth() >= depth && ss->ttPv) - + (!ss->ttPv && move != ttMove && move != ss->killers[0]); + r += 2 - (ttData.depth >= depth && ss->ttPv) + + (!ss->ttPv && move != ttData.move && move != ss->killers[0]); // Increase reduction if ttMove is a capture (~3 Elo) if (ttCapture) @@ -1149,7 +1151,7 @@ moves_loop: // When in check, search starts here // For first picked move (ttMove) reduce reduction // but never allow it to go below 0 (~3 Elo) - else if (move == ttMove) + else if (move == 
ttData.move) r = std::max(0, r - 2); ss->statScore = 2 * thisThread->mainHistory[us][move.from_to()] @@ -1197,7 +1199,7 @@ moves_loop: // When in check, search starts here else if (!PvNode || moveCount > 1) { // Increase reduction if ttMove is not present (~6 Elo) - if (!ttMove) + if (!ttData.move) r += 2; // Note that if expected reduction is high, we reduce search depth by 1 here (~9 Elo) @@ -1287,7 +1289,7 @@ moves_loop: // When in check, search starts here if (value >= beta) { - ss->cutoffCnt += 1 + !ttMove - (extension >= 2); + ss->cutoffCnt += 1 + !ttData.move - (extension >= 2); assert(value >= beta); // Fail high break; } @@ -1363,11 +1365,11 @@ moves_loop: // When in check, search starts here // Write gathered information in transposition table // Static evaluation is saved as it was before correction history if (!excludedMove && !(rootNode && thisThread->pvIdx)) - tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, - bestValue >= beta ? BOUND_LOWER - : PvNode && bestMove ? BOUND_EXACT - : BOUND_UPPER, - depth, bestMove, unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, + bestValue >= beta ? BOUND_LOWER + : PvNode && bestMove ? BOUND_EXACT + : BOUND_UPPER, + depth, bestMove, unadjustedStaticEval, tt.generation()); // Adjust correction history if (!ss->inCheck && (!bestMove || !pos.capture(bestMove)) @@ -1414,14 +1416,12 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); - TTEntry* tte; - Key posKey; - Move ttMove, move, bestMove; - Depth ttDepth; - Value bestValue, value, ttValue, futilityBase; - bool pvHit, givesCheck, capture; - int moveCount; - Color us = pos.side_to_move(); + Key posKey; + Move move, bestMove; + Value bestValue, value, futilityBase; + bool pvHit, givesCheck, capture; + int moveCount; + Color us = pos.side_to_move(); // Step 1. 
Initialize node if (PvNode) @@ -1447,23 +1447,25 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, assert(0 <= ss->ply && ss->ply < MAX_PLY); - // Note that unlike regular search, which stores literal depth, in QS we only store the - // current movegen stage. If in check, we search all evasions and thus store - // DEPTH_QS_CHECKS. (Evasions may be quiet, and _CHECKS includes quiets.) - ttDepth = ss->inCheck || depth >= DEPTH_QS_CHECKS ? DEPTH_QS_CHECKS : DEPTH_QS_NORMAL; + // Note that unlike regular search, which stores the literal depth into the TT, from QS we + // only store the current movegen stage as "depth". If in check, we search all evasions and + // thus store DEPTH_QS_CHECKS. (Evasions may be quiet, and _CHECKS includes quiets.) + Depth qsTtDepth = ss->inCheck || depth >= DEPTH_QS_CHECKS ? DEPTH_QS_CHECKS : DEPTH_QS_NORMAL; // Step 3. Transposition table lookup - posKey = pos.key(); - tte = tt.probe(posKey, ss->ttHit); - ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; - ttMove = ss->ttHit ? tte->move() : Move::none(); - pvHit = ss->ttHit && tte->is_pv(); + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = ttHit ? ttData.move : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + pvHit = ttHit && ttData.is_pv; // At non-PV nodes we check for an early TT cutoff - if (!PvNode && tte->depth() >= ttDepth - && ttValue != VALUE_NONE // Only in case of TT access race or if !ttHit - && (tte->bound() & (ttValue >= beta ? BOUND_LOWER : BOUND_UPPER))) - return ttValue; + if (!PvNode && ttData.depth >= qsTtDepth + && ttData.value != VALUE_NONE // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER))) + return ttData.value; // Step 4. 
Static evaluation of the position Value unadjustedStaticEval = VALUE_NONE; @@ -1474,7 +1476,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, if (ss->ttHit) { // Never assume anything about values stored in TT - unadjustedStaticEval = tte->eval(); + unadjustedStaticEval = ttData.eval; if (unadjustedStaticEval == VALUE_NONE) unadjustedStaticEval = evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); @@ -1482,9 +1484,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); // ttValue can be used as a better position evaluation (~13 Elo) - if (std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY - && (tte->bound() & (ttValue > bestValue ? BOUND_LOWER : BOUND_UPPER))) - bestValue = ttValue; + if (std::abs(ttData.value) < VALUE_TB_WIN_IN_MAX_PLY + && (ttData.bound & (ttData.value > bestValue ? BOUND_LOWER : BOUND_UPPER))) + bestValue = ttData.value; } else { @@ -1503,9 +1505,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, if (std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY && !PvNode) bestValue = (3 * bestValue + beta) / 4; if (!ss->ttHit) - tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, - DEPTH_UNSEARCHED, Move::none(), unadjustedStaticEval, tt.generation()); - + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, + DEPTH_UNSEARCHED, Move::none(), unadjustedStaticEval, + tt.generation()); return bestValue; } @@ -1524,7 +1526,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // (Presently, having the checks stage is worth only 1 Elo, and may be removable in the near future, // which would result in only a single stage of QS movegen.) Square prevSq = ((ss - 1)->currentMove).is_ok() ? 
((ss - 1)->currentMove).to_sq() : SQ_NONE; - MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory, &thisThread->captureHistory, + MovePicker mp(pos, ttData.move, depth, &thisThread->mainHistory, &thisThread->captureHistory, contHist, &thisThread->pawnHistory); // Step 5. Loop through all pseudo-legal moves until no moves remain or a beta cutoff occurs. @@ -1643,9 +1645,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Save gathered info in transposition table // Static evaluation is saved as it was before adjustment by correction history - tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit, - bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove, - unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), pvHit, + bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, qsTtDepth, bestMove, + unadjustedStaticEval, tt.generation()); assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE); @@ -1986,20 +1988,17 @@ bool RootMove::extract_ponder_from_tt(const TranspositionTable& tt, Position& po StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); - bool ttHit; - assert(pv.size() == 1); if (pv[0] == Move::none()) return false; pos.do_move(pv[0], st); - TTEntry* tte = tt.probe(pos.key(), ttHit); + auto [ttHit, ttData, ttWriter] = tt.probe(pos.key()); if (ttHit) { - Move m = tte->move(); // Local copy to be SMP safe - if (MoveList(pos).contains(m)) - pv.push_back(m); + if (MoveList(pos).contains(ttData.move)) + pv.push_back(ttData.move); } pos.undo_move(pv[0]); diff --git a/src/tt.cpp b/src/tt.cpp index 5a44759e..763e2c9b 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -25,11 +25,63 @@ #include #include "memory.h" +#include "misc.h" #include "syzygy/tbprobe.h" #include "thread.h" namespace Stockfish { + +// TTEntry struct is the 10 bytes transposition table entry, defined as below: +// +// key 16 bit +// depth 8 bit +// generation 5 bit +// pv node 1 bit +// bound 
type 2 bit +// move 16 bit +// value 16 bit +// evaluation 16 bit +// +// These fields are in the same order as accessed by TT::probe(), since memory is fastest sequentially. +// Equally, the store order in save() matches this order. + +struct TTEntry { + + // Convert internal bitfields to external types + TTData read() const { + return TTData{Move(move16), Value(value16), + Value(eval16), Depth(depth8 + DEPTH_ENTRY_OFFSET), + Bound(genBound8 & 0x3), bool(genBound8 & 0x4)}; + } + + void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + // The returned age is a multiple of TranspositionTable::GENERATION_DELTA + uint8_t relative_age(const uint8_t generation8) const; + + private: + friend class TranspositionTable; + + uint16_t key16; + uint8_t depth8; + uint8_t genBound8; + Move move16; + int16_t value16; + int16_t eval16; +}; + +// `genBound8` is where most of the details are. We use the following constants to manipulate 5 leading generation bits +// and 3 trailing miscellaneous bits. + +// These bits are reserved for other things. +static constexpr unsigned GENERATION_BITS = 3; +// increment for generation field +static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); +// cycle length +static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; +// mask to pull out generation number +static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; + // DEPTH_ENTRY_OFFSET exists because 1) we use `bool(depth8)` as the occupancy check, but // 2) we need to store negative depths for QS. (`depth8` is the only field with "spare bits": // we sacrifice the ability to store depths greater than 1<<8 less the offset, as asserted below.) @@ -65,12 +117,34 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const { // is needed to keep the unrelated lowest n bits from affecting // the result) to calculate the entry age correctly even after // generation8 overflows into the next cycle. 
- - return (TranspositionTable::GENERATION_CYCLE + generation8 - genBound8) - & TranspositionTable::GENERATION_MASK; + return (GENERATION_CYCLE + generation8 - genBound8) & GENERATION_MASK; } +// TTWriter is but a very thin wrapper around the pointer +TTWriter::TTWriter(TTEntry* tte) : + entry(tte) {} + +void TTWriter::write( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + entry->save(k, v, pv, b, d, m, ev, generation8); +} + + +// A TranspositionTable is an array of Cluster, of size clusterCount. Each cluster consists of ClusterSize number +// of TTEntry. Each non-empty TTEntry contains information on exactly one position. The size of a Cluster should +// divide the size of a cache line for best performance, as the cacheline is prefetched when possible. + +static constexpr int ClusterSize = 3; + +struct Cluster { + TTEntry entry[ClusterSize]; + char padding[2]; // Pad to 32 bytes +}; + +static_assert(sizeof(Cluster) == 32, "Suboptimal Cluster size"); + + // Sets the size of the transposition table, // measured in megabytes. Transposition table consists // of clusters and each cluster consists of ClusterSize number of TTEntry. @@ -114,32 +188,6 @@ void TranspositionTable::clear(ThreadPool& threads) { } -// Looks up the current position in the transposition -// table. It returns true and a pointer to the TTEntry if the position is found. -// Otherwise, it returns false and a pointer to an empty or least valuable TTEntry -// to be replaced later. The replace value of an entry is calculated as its depth -// minus 8 times its relative age. TTEntry t1 is considered more valuable than -// TTEntry t2 if its replace value is greater than that of t2. 
-TTEntry* TranspositionTable::probe(const Key key, bool& found) const { - - TTEntry* const tte = first_entry(key); - const uint16_t key16 = uint16_t(key); // Use the low 16 bits as key inside the cluster - - for (int i = 0; i < ClusterSize; ++i) - if (tte[i].key16 == key16) - return found = bool(tte[i].depth8), &tte[i]; - - // Find an entry to be replaced according to the replacement strategy - TTEntry* replace = tte; - for (int i = 1; i < ClusterSize; ++i) - if (replace->depth8 - replace->relative_age(generation8) * 2 - > tte[i].depth8 - tte[i].relative_age(generation8) * 2) - replace = &tte[i]; - - return found = false, replace; -} - - // Returns an approximation of the hashtable // occupation during a search. The hash is x permill full, as per UCI protocol. // Only counts entries which match the current generation. @@ -154,4 +202,46 @@ int TranspositionTable::hashfull() const { return cnt / ClusterSize; } + +void TranspositionTable::new_search() { + // increment by delta to keep lower bits as is + generation8 += GENERATION_DELTA; +} + + +uint8_t TranspositionTable::generation() const { return generation8; } + + +// Looks up the current position in the transposition table and returns a tuple of +// {found, copy of the entry's data, writer to that entry}. When an entry with a matching +// key16 exists, found is bool(depth8) of that entry. Otherwise found is false and the data +// and writer refer to an empty or least valuable TTEntry of the cluster, to be replaced later. +// The replace value of an entry is calculated as its depth8 minus 2 times its relative_age(). +// TTEntry t1 is considered more valuable than TTEntry t2 if its replace value is greater than that of t2. +std::tuple<bool, TTData, TTWriter> TranspositionTable::probe(const Key key) const { + + TTEntry* const tte = first_entry(key); + const uint16_t key16 = uint16_t(key); // Use the low 16 bits as key inside the cluster + + for (int i = 0; i < ClusterSize; ++i) + if (tte[i].key16 == key16) + // This gap is the main place for read races. + // After `read()` completes that copy is final, but may be self-inconsistent. 
+ return {bool(tte[i].depth8), tte[i].read(), TTWriter(&tte[i])}; + + // Find an entry to be replaced according to the replacement strategy + TTEntry* replace = tte; + for (int i = 1; i < ClusterSize; ++i) + if (replace->depth8 - replace->relative_age(generation8) * 2 + > tte[i].depth8 - tte[i].relative_age(generation8) * 2) + replace = &tte[i]; + + return {false, replace->read(), TTWriter(replace)}; +} + + +TTEntry* TranspositionTable::first_entry(const Key key) const { + return &table[mul_hi64(key, clusterCount)].entry[0]; +} + } // namespace Stockfish diff --git a/src/tt.h b/src/tt.h index b2e8f582..1bece002 100644 --- a/src/tt.h +++ b/src/tt.h @@ -21,103 +21,76 @@ #include #include +#include #include "memory.h" -#include "misc.h" #include "types.h" namespace Stockfish { -// TTEntry struct is the 10 bytes transposition table entry, defined as below: -// -// key 16 bit -// depth 8 bit -// generation 5 bit -// pv node 1 bit -// bound type 2 bit -// move 16 bit -// value 16 bit -// eval value 16 bit -// -// These fields are in the same order as accessed by TT::probe(), since memory is fastest sequentially. -// Equally, the store order in save() matches this order. -struct TTEntry { +class ThreadPool; +struct TTEntry; +struct Cluster; - Move move() const { return Move(move16); } - Value value() const { return Value(value16); } - Value eval() const { return Value(eval16); } - Depth depth() const { return Depth(depth8 + DEPTH_ENTRY_OFFSET); } - bool is_pv() const { return bool(genBound8 & 0x4); } - Bound bound() const { return Bound(genBound8 & 0x3); } - void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); - // The returned age is a multiple of TranspositionTable::GENERATION_DELTA - uint8_t relative_age(const uint8_t generation8) const; +// There is only one global hash table for the engine and all its threads. 
For chess in particular, we even allow racy +// updates between threads to and from the TT, as taking the time to synchronize access would cost thinking time and +// thus elo. As a hash table, collisions are possible and may cause chess playing issues (bizarre blunders, faulty mate +// reports, etc). Fixing these also loses elo; however such risk decreases quickly with larger TT size. +// +// `probe` is the primary method: given a board position, we lookup its entry in the table, and return a tuple of: +// 1) whether the entry already has this position +// 2) a copy of the prior data (if any) (may be inconsistent due to read races) +// 3) a writer object to this entry +// The copied data and the writer are separated to maintain clear boundaries between local vs global objects. + + +// A copy of the data already in the entry (possibly collided). `probe` may be racy, resulting in inconsistent data. +struct TTData { + Move move; + Value value, eval; + Depth depth; + Bound bound; + bool is_pv; +}; + + +// This is used to make racy writes to the global TT. +struct TTWriter { + public: + void write(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); private: friend class TranspositionTable; - - uint16_t key16; - uint8_t depth8; - uint8_t genBound8; - Move move16; - int16_t value16; - int16_t eval16; + TTEntry* entry; + TTWriter(TTEntry* tte); }; -class ThreadPool; -// A TranspositionTable is an array of Cluster, of size clusterCount. Each -// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry -// contains information on exactly one position. The size of a Cluster should -// divide the size of a cache line for best performance, as the cacheline is -// prefetched when possible. 
class TranspositionTable { - static constexpr int ClusterSize = 3; - - struct Cluster { - TTEntry entry[ClusterSize]; - char padding[2]; // Pad to 32 bytes - }; - - static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size"); - - // Constants used to refresh the hash table periodically - - // We have 8 bits available where the lowest 3 bits are - // reserved for other things. - static constexpr unsigned GENERATION_BITS = 3; - // increment for generation field - static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); - // cycle length - static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; - // mask to pull out generation number - static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; - public: ~TranspositionTable() { aligned_large_pages_free(table); } - void new_search() { - // increment by delta to keep lower bits as is - generation8 += GENERATION_DELTA; - } - TTEntry* probe(const Key key, bool& found) const; - int hashfull() const; - void resize(size_t mbSize, ThreadPool& threads); - void clear(ThreadPool& threads); + void resize(size_t mbSize, ThreadPool& threads); // Set TT size + void clear(ThreadPool& threads); // Re-initialize memory, multithreaded + int hashfull() + const; // Approximate what fraction of entries (permille) have been written to during this root search - TTEntry* first_entry(const Key key) const { - return &table[mul_hi64(key, clusterCount)].entry[0]; - } - - uint8_t generation() const { return generation8; } + void + new_search(); // This must be called at the beginning of each root search to track entry aging + uint8_t generation() const; // The current age, used when writing new data to the TT + std::tuple<bool, TTData, TTWriter> + probe(const Key key) const; // The main method, whose retvals separate local vs global objects + TTEntry* first_entry(const Key key) + const; // This is the hash function; its only external use is memory prefetching. 
private: friend struct TTEntry; size_t clusterCount; - Cluster* table = nullptr; - uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 + Cluster* table = nullptr; + + uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 }; } // namespace Stockfish diff --git a/tests/instrumented.sh b/tests/instrumented.sh index 4c63fc57..e77ee0dd 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -39,13 +39,8 @@ case $1 in threads="2" cat << EOF > tsan.supp -race:Stockfish::TTEntry::move -race:Stockfish::TTEntry::depth -race:Stockfish::TTEntry::bound +race:Stockfish::TTEntry::read race:Stockfish::TTEntry::save -race:Stockfish::TTEntry::value -race:Stockfish::TTEntry::eval -race:Stockfish::TTEntry::is_pv race:Stockfish::TranspositionTable::probe race:Stockfish::TranspositionTable::hashfull