1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-29 16:23:09 +00:00

Cache small net w/ psqtOnly support

Caching the small net in the same way as the big net allows them to
share the same code path and completely removes
update_accumulator_refresh().

STC:
https://tests.stockfishchess.org/tests/view/662bfb5ed46f72253dcfed85
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 151712 W: 39252 L: 39158 D: 73302
Ptnml(0-2): 565, 17474, 39683, 17570, 564

closes https://github.com/official-stockfish/Stockfish/pull/5194

Bench: 1836777
This commit is contained in:
mstembera 2024-04-25 18:20:08 -07:00 committed by Disservin
parent bc45cbc820
commit 940a3a7383
6 changed files with 88 additions and 195 deletions

View file

@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks,
int nnueComplexity;
int v;
Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly)
Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
: networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,

View file

@ -263,8 +263,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
template<typename Arch, typename Transformer>
void Network<Arch, Transformer>::hint_common_access(const Position& pos,
AccumulatorCaches::Cache<FTDimensions>* cache,
bool psqtOnl) const {
featureTransformer->hint_common_access(pos, cache, psqtOnl);
bool psqtOnly) const {
featureTransformer->hint_common_access(pos, cache, psqtOnly);
}
template<typename Arch, typename Transformer>

View file

@ -62,7 +62,7 @@ class Network {
void hint_common_access(const Position& pos,
AccumulatorCaches::Cache<FTDimensions>* cache,
bool psqtOnl) const;
bool psqtOnly) const;
void verify(std::string evalfilePath) const;
NnueEvalTrace trace_evaluate(const Position& pos,

View file

@ -63,6 +63,7 @@ struct AccumulatorCaches {
PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
Bitboard byColorBB[COLOR_NB][COLOR_NB];
Bitboard byTypeBB[COLOR_NB][PIECE_TYPE_NB];
bool psqtOnly;
// To initialize a refresh entry, we set all its bitboards empty,
// so we put the biases in the accumulation, without any weights on top
@ -70,6 +71,7 @@ struct AccumulatorCaches {
std::memset(byColorBB, 0, sizeof(byColorBB));
std::memset(byTypeBB, 0, sizeof(byTypeBB));
psqtOnly = false;
std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
@ -97,11 +99,11 @@ struct AccumulatorCaches {
// Clears every accumulator cache held by this struct by forwarding to each
// cache's own clear(): `big` is cleared with networks.big and `small` with
// networks.small. `Networks` only needs to expose `big` and `small` members
// that the corresponding cache's clear() accepts.
template<typename Networks>
void clear(const Networks& networks) {
big.clear(networks.big);
small.clear(networks.small);
}
// When adding a new cache for a network, e.g. for the small net,
// the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh.
Cache<TransformedFeatureDimensionsBig> big;
Cache<TransformedFeatureDimensionsSmall> small;
};
} // namespace Stockfish::Eval::NNUE

View file

@ -656,75 +656,84 @@ class FeatureTransformer {
template<Color Perspective>
void update_accumulator_refresh_cache(const Position& pos,
AccumulatorCaches::Cache<HalfDimensions>* cache) const {
AccumulatorCaches::Cache<HalfDimensions>* cache,
bool psqtOnly) const {
assert(cache != nullptr);
Square ksq = pos.square<KING>(Perspective);
auto& entry = (*cache)[ksq];
auto& accumulator = pos.state()->*accPtr;
accumulator.computed[Perspective] = true;
accumulator.computedPSQT[Perspective] = true;
FeatureSet::IndexList removed, added;
for (Color c : {WHITE, BLACK})
{
for (PieceType pt = PAWN; pt <= KING; ++pt)
{
const Piece piece = make_piece(c, pt);
const Bitboard oldBB =
entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
const Bitboard newBB = pos.pieces(c, pt);
Bitboard toRemove = oldBB & ~newBB;
Bitboard toAdd = newBB & ~oldBB;
while (toRemove)
if (entry.psqtOnly && !psqtOnly)
{
entry.clear(biases);
FeatureSet::append_active_indices<Perspective>(pos, added);
}
else
{
for (Color c : {WHITE, BLACK})
{
for (PieceType pt = PAWN; pt <= KING; ++pt)
{
Square sq = pop_lsb(toRemove);
removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
while (toAdd)
{
Square sq = pop_lsb(toAdd);
added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
const Piece piece = make_piece(c, pt);
const Bitboard oldBB =
entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
const Bitboard newBB = pos.pieces(c, pt);
Bitboard toRemove = oldBB & ~newBB;
Bitboard toAdd = newBB & ~oldBB;
while (toRemove)
{
Square sq = pop_lsb(toRemove);
removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
while (toAdd)
{
Square sq = pop_lsb(toAdd);
added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
}
}
}
}
auto& accumulator = pos.state()->*accPtr;
accumulator.computed[Perspective] = !psqtOnly;
accumulator.computedPSQT[Perspective] = true;
#ifdef VECTOR
vec_t acc[NumRegs];
psqt_vec_t psqt[NumPsqtRegs];
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
{
auto entryTile =
reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
for (IndexType k = 0; k < NumRegs; ++k)
acc[k] = entryTile[k];
for (int i = 0; i < int(added.size()); ++i)
if (!psqtOnly)
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
{
IndexType index = added[i];
const IndexType offset = HalfDimensions * index + j * TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
auto entryTile =
reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
for (IndexType k = 0; k < NumRegs; ++k)
acc[k] = entryTile[k];
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
for (int i = 0; i < int(added.size()); ++i)
{
IndexType index = added[i];
const IndexType offset = HalfDimensions * index + j * TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
for (int i = 0; i < int(removed.size()); ++i)
{
IndexType index = removed[i];
const IndexType offset = HalfDimensions * index + j * TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
for (IndexType k = 0; k < NumRegs; k++)
vec_store(&entryTile[k], acc[k]);
}
for (int i = 0; i < int(removed.size()); ++i)
{
IndexType index = removed[i];
const IndexType offset = HalfDimensions * index + j * TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
for (IndexType k = 0; k < NumRegs; k++)
vec_store(&entryTile[k], acc[k]);
}
for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
{
@ -760,18 +769,24 @@ class FeatureTransformer {
for (const auto index : added)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[Perspective][j] += weights[offset + j];
if (!psqtOnly)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[Perspective][j] += weights[offset + j];
}
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
}
for (const auto index : removed)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[Perspective][j] -= weights[offset + j];
if (!psqtOnly)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
entry.accumulation[Perspective][j] -= weights[offset + j];
}
for (std::size_t k = 0; k < PSQTBuckets; ++k)
entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
@ -782,144 +797,20 @@ class FeatureTransformer {
// The accumulator of the refresh entry has been updated.
// Now copy its content to the actual accumulator we were refreshing
if (!psqtOnly)
std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
sizeof(BiasType) * HalfDimensions);
std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
sizeof(int32_t) * PSQTBuckets);
std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
sizeof(BiasType) * HalfDimensions);
for (Color c : {WHITE, BLACK})
entry.byColorBB[Perspective][c] = pos.pieces(c);
for (PieceType pt = PAWN; pt <= KING; ++pt)
entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
}
template<Color Perspective>
void
update_accumulator_refresh(const Position& pos,
[[maybe_unused]] AccumulatorCaches::Cache<HalfDimensions>* cache,
bool psqtOnly) const {
// When we are refreshing the accumulator of the big net,
// redirect to the version of refresh that uses the refresh table.
// Using the cache for the small net is not beneficial.
if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig)
{
update_accumulator_refresh_cache<Perspective>(pos, cache);
return;
}
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[NumRegs];
psqt_vec_t psqt[NumPsqtRegs];
#endif
// Refresh the accumulator
// Could be extracted to a separate function because it's done in 2 places,
// but it's unclear if compilers would correctly handle register allocation.
auto& accumulator = pos.state()->*accPtr;
accumulator.computed[Perspective] = !psqtOnly;
accumulator.computedPSQT[Perspective] = true;
FeatureSet::IndexList active;
FeatureSet::append_active_indices<Perspective>(pos, active);
#ifdef VECTOR
if (!psqtOnly)
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
{
auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
for (IndexType k = 0; k < NumRegs; ++k)
acc[k] = biasesTile[k];
int i = 0;
for (; i < int(active.size()) - 1; i += 2)
{
IndexType index0 = active[i];
IndexType index1 = active[i + 1];
const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
auto column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
auto column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
}
for (; i < int(active.size()); ++i)
{
IndexType index = active[i];
const IndexType offset = HalfDimensions * index + j * TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
auto accTile =
reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
for (unsigned k = 0; k < NumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
{
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
psqt[k] = vec_zero_psqt();
int i = 0;
for (; i < int(active.size()) - 1; i += 2)
{
IndexType index0 = active[i];
IndexType index1 = active[i + 1];
const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
psqt[k] =
vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
}
for (; i < int(active.size()); ++i)
{
IndexType index = active[i];
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
}
auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
&accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
vec_store_psqt(&accTilePsqt[k], psqt[k]);
}
#else
if (!psqtOnly)
std::memcpy(accumulator.accumulation[Perspective], biases,
HalfDimensions * sizeof(BiasType));
for (std::size_t k = 0; k < PSQTBuckets; ++k)
accumulator.psqtAccumulation[Perspective][k] = 0;
for (const auto index : active)
{
if (!psqtOnly)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
accumulator.accumulation[Perspective][j] += weights[offset + j];
}
for (std::size_t k = 0; k < PSQTBuckets; ++k)
accumulator.psqtAccumulation[Perspective][k] +=
psqtWeights[index * PSQTBuckets + k];
}
#endif
entry.psqtOnly = psqtOnly;
}
template<Color Perspective>
@ -948,7 +839,7 @@ class FeatureTransformer {
psqtOnly);
}
else
update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
}
template<Color Perspective>
@ -976,7 +867,7 @@ class FeatureTransformer {
psqtOnly);
}
else
update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
}
template<IndexType Size>

View file

@ -48,7 +48,7 @@ void hint_common_parent_position(const Position& pos,
int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
if (simpleEvalAbs > Eval::SmallNetThreshold)
networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold);
networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
else
networks.big.hint_common_access(pos, &caches.big, false);
}