mirror of
https://github.com/sockspls/badfish
synced 2025-07-11 19:49:14 +00:00
Avoid unnecessary stores in the affine transform
This patch improves the codegen in the AffineTransform::forward function for architectures >=SSSE3. Current code works directly on memory and the compiler cannot see that the stores through outptr do not alias the loads through weights and input32. The solution implemented is to perform the affine transform with local variables as accumulators and only store the result to memory at the end. The number of accumulators required is OutputDimensions / OutputSimdWidth, which means that for the 1024->16 affine transform it requires 4 registers with SSSE3, 2 with AVX2, 1 with AVX512. It also cuts the number of stores required by NumRegs * 256 for each node evaluated. The local accumulators are expected to be assigned to registers, but even if this cannot be done in some case due to register pressure it will help the compiler to see that there is no aliasing between the loads and stores and may still result in better codegen. See https://godbolt.org/z/59aTKbbYc for codegen comparison. passed STC: LLR: 2.94 (-2.94,2.94) <-0.50,2.50> Total: 140328 W: 10635 L: 10358 D: 119335 Ptnml(0-2): 302, 8339, 52636, 8554, 333 closes https://github.com/official-stockfish/Stockfish/pull/3634 No functional change
This commit is contained in:
parent
e973eee919
commit
26edf9534a
1 changed file with 16 additions and 8 deletions
|
@ -251,9 +251,6 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
#endif
|
||||
|
||||
#if defined (USE_SSSE3)
|
||||
// Different layout, we process 4 inputs at a time, always.
|
||||
static_assert(InputDimensions % 4 == 0);
|
||||
|
||||
const auto output = reinterpret_cast<OutputType*>(buffer);
|
||||
const auto inputVector = reinterpret_cast<const vec_t*>(input);
|
||||
|
||||
|
@ -263,13 +260,18 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
// because then it is also an input dimension.
|
||||
if constexpr (OutputDimensions % OutputSimdWidth == 0)
|
||||
{
|
||||
static_assert(InputDimensions % 16 == 0);
|
||||
|
||||
constexpr IndexType NumChunks = InputDimensions / 4;
|
||||
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
|
||||
|
||||
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
|
||||
vec_t* outptr = reinterpret_cast<vec_t*>(output);
|
||||
std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
|
||||
const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
|
||||
vec_t outs[NumRegs];
|
||||
for (IndexType k = 0; k < NumRegs; ++k)
|
||||
outs[k] = biasvec[k];
|
||||
|
||||
for (int i = 0; i < (int)NumChunks - 3; i += 4)
|
||||
for (IndexType i = 0; i < NumChunks; i += 4)
|
||||
{
|
||||
const vec_t in0 = vec_set_32(input32[i + 0]);
|
||||
const vec_t in1 = vec_set_32(input32[i + 1]);
|
||||
|
@ -279,12 +281,18 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
|
||||
const auto col2 = reinterpret_cast<const vec_t*>(&weights[(i + 2) * OutputDimensions * 4]);
|
||||
const auto col3 = reinterpret_cast<const vec_t*>(&weights[(i + 3) * OutputDimensions * 4]);
|
||||
for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
|
||||
vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
|
||||
for (IndexType k = 0; k < NumRegs; ++k)
|
||||
vec_add_dpbusd_32x4(outs[k], in0, col0[k], in1, col1[k], in2, col2[k], in3, col3[k]);
|
||||
}
|
||||
|
||||
vec_t* outptr = reinterpret_cast<vec_t*>(output);
|
||||
for (IndexType k = 0; k < NumRegs; ++k)
|
||||
outptr[k] = outs[k];
|
||||
}
|
||||
else if constexpr (OutputDimensions == 1)
|
||||
{
|
||||
static_assert(InputDimensions % 4 == 0);
|
||||
|
||||
#if defined (USE_AVX512)
|
||||
if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue