Clean SIMD code a bit

Cleaner vector code structure in feature transformer. This patch just regroups the parts of the inner loop for each SIMD instruction set. Tested for non-regression: LLR: 2.96 (-2.94,2.94) <-2.50,0.50> Total: 115760 W: 9835 L: 9831 D: 96094 Ptnml(0-2): 326, 7776, 41715, 7694, 369 https://tests.stockfishchess.org/tests/view/60b96b39457376eb8bcaa26e It would be nice if a future patch could use some of the macros at the top of the file to unify the code between the distincts SIMD instruction sets (of course, unifying the Relu will be the challenge). closes https://github.com/official-stockfish/Stockfish/pull/3506 No functional change
2025-07-11 19:49:14 +00:00 · 2021-06-04 13:56:40 +02:00 · 2021-06-04 13:56:40 +02:00 · 8f081c86f7
commit 8f081c86f7
parent 4445965f97
1 changed files with 110 additions and 84 deletions
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@ -180,118 +180,144 @@ namespace Stockfish::Eval::NNUE {
      const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
      const auto psqt = (
-            psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
+            psqtAccumulation[perspectives[0]][bucket]
-          - psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
+          - psqtAccumulation[perspectives[1]][bucket]
        ) / 2;
  #if defined(USE_AVX512)
      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
      static_assert(HalfDimensions % (SimdWidth * 2) == 0);
      const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
      const __m512i Zero = _mm512_setzero_si512();
      for (IndexType p = 0; p < 2; ++p)
      {
          const IndexType offset = HalfDimensions * p;
          auto out = reinterpret_cast<__m512i*>(&output[offset]);
          for (IndexType j = 0; j < NumChunks; ++j)
          {
              __m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
                                              (accumulation[perspectives[p]])[j * 2 + 0]);
              __m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
                                              (accumulation[perspectives[p]])[j * 2 + 1]);
              _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
                                 _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
          }
      }
      return psqt;
  #elif defined(USE_AVX2)
      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
      constexpr int Control = 0b11011000;
      const __m256i Zero = _mm256_setzero_si256();
      for (IndexType p = 0; p < 2; ++p)
      {
          const IndexType offset = HalfDimensions * p;
          auto out = reinterpret_cast<__m256i*>(&output[offset]);
          for (IndexType j = 0; j < NumChunks; ++j)
          {
              __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
                                              (accumulation[perspectives[p]])[j * 2 + 0]);
              __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
                                              (accumulation[perspectives[p]])[j * 2 + 1]);
              _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
                                 _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
          }
      }
      return psqt;
  #elif defined(USE_SSE2)
      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
      #ifdef USE_SSE41
      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
      const __m128i Zero = _mm_setzero_si128();
      #else
      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
      const __m128i k0x80s = _mm_set1_epi8(-128);
      #endif
      for (IndexType p = 0; p < 2; ++p)
      {
          const IndexType offset = HalfDimensions * p;
          auto out = reinterpret_cast<__m128i*>(&output[offset]);
          for (IndexType j = 0; j < NumChunks; ++j)
          {
              __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
                                           (accumulation[perspectives[p]])[j * 2 + 0]);
              __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
                                           (accumulation[perspectives[p]])[j * 2 + 1]);
              const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
              #ifdef USE_SSE41
              _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
              #else
              _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
              #endif
          }
      }
      return psqt;
  #elif defined(USE_MMX)
      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
      const __m64 k0x80s = _mm_set1_pi8(-128);
-  #elif defined(USE_NEON)
+      for (IndexType p = 0; p < 2; ++p)
-      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
+      {
      const int8x8_t Zero = {0};
  #endif
      for (IndexType p = 0; p < 2; ++p) {
          const IndexType offset = HalfDimensions * p;
  #if defined(USE_AVX512)
        auto out = reinterpret_cast<__m512i*>(&output[offset]);
        for (IndexType j = 0; j < NumChunks; ++j) {
          __m512i sum0 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 0]);
          __m512i sum1 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 1]);
          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
        }
  #elif defined(USE_AVX2)
        auto out = reinterpret_cast<__m256i*>(&output[offset]);
        for (IndexType j = 0; j < NumChunks; ++j) {
          __m256i sum0 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 0]);
          __m256i sum1 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 1]);
          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
              _mm256_packs_epi16(sum0, sum1), Zero), Control));
        }
  #elif defined(USE_SSE2)
        auto out = reinterpret_cast<__m128i*>(&output[offset]);
        for (IndexType j = 0; j < NumChunks; ++j) {
          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]])[j * 2 + 0]);
          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]])[j * 2 + 1]);
      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
          _mm_store_si128(&out[j],
  #ifdef USE_SSE41
              _mm_max_epi8(packedbytes, Zero)
  #else
              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
  #endif
          );
        }
  #elif defined(USE_MMX)
          auto out = reinterpret_cast<__m64*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
+          for (IndexType j = 0; j < NumChunks; ++j)
-          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+          {
-              accumulation[perspectives[p]])[j * 2 + 0]);
+              __m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
-          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              __m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
              accumulation[perspectives[p]])[j * 2 + 1]);
              const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
              out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
          }
      }
      _mm_empty();
      return psqt;
  #elif defined(USE_NEON)
      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
      const int8x8_t Zero = {0};
      for (IndexType p = 0; p < 2; ++p)
      {
          const IndexType offset = HalfDimensions * p;
          const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
+          for (IndexType j = 0; j < NumChunks; ++j)
-          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+          {
-              accumulation[perspectives[p]])[j];
+              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
          }
      }
      return psqt;
  #else
        for (IndexType j = 0; j < HalfDimensions; ++j) {
          BiasType sum = accumulation[static_cast<int>(perspectives[p])][j];
          output[offset + j] = static_cast<OutputType>(
              std::max<int>(0, std::min<int>(127, sum)));
        }
  #endif
      for (IndexType p = 0; p < 2; ++p)
      {
          const IndexType offset = HalfDimensions * p;
          for (IndexType j = 0; j < HalfDimensions; ++j)
          {
              BiasType sum = accumulation[perspectives[p]][j];
              output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
          }
      }
  #if defined(USE_MMX)
      _mm_empty();
  #endif
      return psqt;
-    }
+
  #endif
   } // end of function transform()
   private:
    void update_accumulator(const Position& pos, const Color perspective) const {