Cache small net w/ psqtOnly support

Caching the small net in the same way as the big net allows them to share
the same code path and completely removes update_accumulator_refresh().

STC: https://tests.stockfishchess.org/tests/view/662bfb5ed46f72253dcfed85
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 151712 W: 39252 L: 39158 D: 73302
Ptnml(0-2): 565, 17474, 39683, 17570, 564

closes https://github.com/official-stockfish/Stockfish/pull/5194

Bench: 1836777
2025-07-15 13:29:14 +00:00 · 2024-04-25 18:20:08 -07:00 · 2024-04-25 18:20:08 -07:00 · 940a3a7383
commit 940a3a7383
parent bc45cbc820
6 changed files with 88 additions and 195 deletions
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
    int  nnueComplexity;
    int  v;

-    Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly)
+    Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
                          : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);

    const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@ -263,8 +263,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::hint_common_access(const Position&                         pos,
                                                    AccumulatorCaches::Cache<FTDimensions>* cache,
-                                                    bool psqtOnl) const {
-    featureTransformer->hint_common_access(pos, cache, psqtOnl);
+                                                    bool psqtOnly) const {
+    featureTransformer->hint_common_access(pos, cache, psqtOnly);
 }

 template<typename Arch, typename Transformer>
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@ -62,7 +62,7 @@ class Network {

    void hint_common_access(const Position&                         pos,
                            AccumulatorCaches::Cache<FTDimensions>* cache,
-                            bool                                    psqtOnl) const;
+                            bool                                    psqtOnly) const;

    void          verify(std::string evalfilePath) const;
    NnueEvalTrace trace_evaluate(const Position&                         pos,
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@ -63,6 +63,7 @@ struct AccumulatorCaches {
            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+            bool           psqtOnly;

            // To initialize a refresh entry, we set all its bitboards empty,
            // so we put the biases in the accumulation, without any weights on top
@ -70,6 +71,7 @@ struct AccumulatorCaches {

                std::memset(byColorBB, 0, sizeof(byColorBB));
                std::memset(byTypeBB, 0, sizeof(byTypeBB));
+                psqtOnly = false;

                std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
                std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
@ -97,11 +99,11 @@ struct AccumulatorCaches {
    template<typename Networks>
    void clear(const Networks& networks) {
        big.clear(networks.big);
+        small.clear(networks.small);
    }

-    // When adding a new cache for a network, i.e. the smallnet
-    // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh.
    Cache<TransformedFeatureDimensionsBig> big;
+    Cache<TransformedFeatureDimensionsSmall> small;
 };

 }  // namespace Stockfish::Eval::NNUE
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@ -656,75 +656,84 @@ class FeatureTransformer {

    template<Color Perspective>
    void update_accumulator_refresh_cache(const Position&                           pos,
-                                          AccumulatorCaches::Cache<HalfDimensions>* cache) const {
+                                          AccumulatorCaches::Cache<HalfDimensions>* cache,
+                                          bool psqtOnly) const {
        assert(cache != nullptr);

        Square ksq = pos.square<KING>(Perspective);
-
        auto& entry = (*cache)[ksq];
-
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = true;
-        accumulator.computedPSQT[Perspective] = true;
-
        FeatureSet::IndexList removed, added;
-        for (Color c : {WHITE, BLACK})
-        {
-            for (PieceType pt = PAWN; pt <= KING; ++pt)
-            {
-                const Piece    piece = make_piece(c, pt);
-                const Bitboard oldBB =
-                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
-                const Bitboard newBB    = pos.pieces(c, pt);
-                Bitboard       toRemove = oldBB & ~newBB;
-                Bitboard       toAdd    = newBB & ~oldBB;

-                while (toRemove)
+        if (entry.psqtOnly && !psqtOnly)
+        {
+            entry.clear(biases);
+            FeatureSet::append_active_indices<Perspective>(pos, added);
+        }
+        else
+        {
+            for (Color c : {WHITE, BLACK})
+            {
+                for (PieceType pt = PAWN; pt <= KING; ++pt)
                {
-                    Square sq = pop_lsb(toRemove);
-                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
-                }
-                while (toAdd)
-                {
-                    Square sq = pop_lsb(toAdd);
-                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    const Piece    piece = make_piece(c, pt);
+                    const Bitboard oldBB =
+                      entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                    const Bitboard newBB    = pos.pieces(c, pt);
+                    Bitboard       toRemove = oldBB & ~newBB;
+                    Bitboard       toAdd    = newBB & ~oldBB;
+
+                    while (toRemove)
+                    {
+                        Square sq = pop_lsb(toRemove);
+                        removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
+                    while (toAdd)
+                    {
+                        Square sq = pop_lsb(toAdd);
+                        added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
                }
            }
        }

+        auto& accumulator                     = pos.state()->*accPtr;
+        accumulator.computed[Perspective]     = !psqtOnly;
+        accumulator.computedPSQT[Perspective] = true;
+
 #ifdef VECTOR
        vec_t      acc[NumRegs];
        psqt_vec_t psqt[NumPsqtRegs];

-        for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-        {
-            auto entryTile =
-              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
-            for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = entryTile[k];
-
-            for (int i = 0; i < int(added.size()); ++i)
+        if (!psqtOnly)
+            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
            {
-                IndexType       index  = added[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                auto entryTile =
+                  reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
+                for (IndexType k = 0; k < NumRegs; ++k)
+                    acc[k] = entryTile[k];

-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
+                for (int i = 0; i < int(added.size()); ++i)
+                {
+                    IndexType       index  = added[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_add_16(acc[k], column[k]);
+                }
+                for (int i = 0; i < int(removed.size()); ++i)
+                {
+                    IndexType       index  = removed[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_sub_16(acc[k], column[k]);
+                }
+
+                for (IndexType k = 0; k < NumRegs; k++)
+                    vec_store(&entryTile[k], acc[k]);
            }
-            for (int i = 0; i < int(removed.size()); ++i)
-            {
-                IndexType       index  = removed[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_sub_16(acc[k], column[k]);
-            }
-
-            for (IndexType k = 0; k < NumRegs; k++)
-                vec_store(&entryTile[k], acc[k]);
-        }

        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
        {
@ -760,18 +769,24 @@ class FeatureTransformer {

        for (const auto index : added)
        {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] += weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] += weights[offset + j];
+            }

            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
        }
        for (const auto index : removed)
        {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] -= weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] -= weights[offset + j];
+            }

            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
@ -782,144 +797,20 @@ class FeatureTransformer {
        // The accumulator of the refresh entry has been updated.
        // Now copy its content to the actual accumulator we were refreshing

+        if (!psqtOnly)
+            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                        sizeof(BiasType) * HalfDimensions);
+
        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
                    sizeof(int32_t) * PSQTBuckets);

-        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
-                    sizeof(BiasType) * HalfDimensions);
-
        for (Color c : {WHITE, BLACK})
            entry.byColorBB[Perspective][c] = pos.pieces(c);

        for (PieceType pt = PAWN; pt <= KING; ++pt)
            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
-    }

-    template<Color Perspective>
-    void
-    update_accumulator_refresh(const Position&                                            pos,
-                               [[maybe_unused]] AccumulatorCaches::Cache<HalfDimensions>* cache,
-                               bool psqtOnly) const {
-
-        // When we are refreshing the accumulator of the big net,
-        // redirect to the version of refresh that uses the refresh table.
-        // Using the cache for the small net is not beneficial.
-        if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig)
-        {
-            update_accumulator_refresh_cache<Perspective>(pos, cache);
-            return;
-        }
-
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
-
-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = !psqtOnly;
-        accumulator.computedPSQT[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
-
-#ifdef VECTOR
-        if (!psqtOnly)
-            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-            {
-                auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
-                for (IndexType k = 0; k < NumRegs; ++k)
-                    acc[k] = biasesTile[k];
-
-                int i = 0;
-                for (; i < int(active.size()) - 1; i += 2)
-                {
-                    IndexType       index0  = active[i];
-                    IndexType       index1  = active[i + 1];
-                    const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                    const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                    auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                    auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
-                }
-                for (; i < int(active.size()); ++i)
-                {
-                    IndexType       index  = active[i];
-                    const IndexType offset = HalfDimensions * index + j * TileHeight;
-                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], column[k]);
-                }
-
-                auto accTile =
-                  reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-                for (unsigned k = 0; k < NumRegs; k++)
-                    vec_store(&accTile[k], acc[k]);
-            }
-
-        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
-        {
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
-
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
-            {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
-            }
-            for (; i < int(active.size()); ++i)
-            {
-                IndexType       index  = active[i];
-                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
-                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
-            }
-
-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
-        }
-
-#else
-        if (!psqtOnly)
-            std::memcpy(accumulator.accumulation[Perspective], biases,
-                        HalfDimensions * sizeof(BiasType));
-
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
-        {
-            if (!psqtOnly)
-            {
-                const IndexType offset = HalfDimensions * index;
-                for (IndexType j = 0; j < HalfDimensions; ++j)
-                    accumulator.accumulation[Perspective][j] += weights[offset + j];
-            }
-
-            for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                  psqtWeights[index * PSQTBuckets + k];
-        }
-#endif
+        entry.psqtOnly = psqtOnly;
    }

    template<Color Perspective>
@ -948,7 +839,7 @@ class FeatureTransformer {
                                                           psqtOnly);
        }
        else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
    }

    template<Color Perspective>
@ -976,7 +867,7 @@ class FeatureTransformer {
                                                           psqtOnly);
        }
        else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
    }

    template<IndexType Size>
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@ -48,7 +48,7 @@ void hint_common_parent_position(const Position&    pos,

    int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
    if (simpleEvalAbs > Eval::SmallNetThreshold)
-        networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold);
+        networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
    else
        networks.big.hint_common_access(pos, &caches.big, false);
 }