
Generalize the feature transform to use vec_t macros

This commit generalizes the feature transform to use vec_t macros
that are defined per architecture, instead of using a separate code path for each one.

It should make some old architectures (MMX, including improvements by Fanael) faster
and make further such improvements easier in the future.
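
As a rough illustration of the pattern (an editor's sketch, not the actual
Stockfish header: the vec_add_16 name mirrors the diff below, while the
scalar fallback and the add_rows/numChunks names are invented for the
example), each architecture defines the same names once, and generic code
is then written a single time against them:

// Minimal sketch of per-architecture vec_t macros with one generic loop.
#include <cstdint>

#if defined(USE_AVX2)
    #include <immintrin.h>
    typedef __m256i vec_t;
    #define vec_add_16(a, b) _mm256_add_epi16(a, b)
#elif defined(USE_SSE2)
    #include <emmintrin.h>
    typedef __m128i vec_t;
    #define vec_add_16(a, b) _mm_add_epi16(a, b)
#else
    // Scalar fallback so the sketch compiles without SIMD flags
    // (the real header instead disables the vector path via #undef VECTOR).
    typedef int16_t vec_t;
    #define vec_add_16(a, b) static_cast<int16_t>((a) + (b))
#endif

// One accumulator update shared by every architecture above.
void add_rows(vec_t* acc, const vec_t* weights, int numChunks) {
    for (int j = 0; j < numChunks; ++j)
        acc[j] = vec_add_16(acc[j], weights[j]);
}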

Includes some corrections to CI for mingw.

closes https://github.com/official-stockfish/Stockfish/pull/3955
closes https://github.com/official-stockfish/Stockfish/pull/3928

No functional change
Ben Chaney 2022-03-01 17:49:02 -05:00, committed by Joost VandeVondele
parent 4ac7d726ec
commit 270a0e737f
3 changed files with 78 additions and 100 deletions

.github/workflows/stockfish.yml

@@ -82,20 +82,20 @@ jobs:
           name: "Windows 2022 Mingw-w64 GCC x86_64",
           os: windows-2022,
           compiler: g++,
-          comp: gcc,
+          comp: mingw,
           run_64bit_tests: true,
           msys_sys: 'mingw64',
-          msys_env: 'x86_64',
+          msys_env: 'x86_64-gcc',
           shell: 'msys2 {0}'
         }
       - {
           name: "Windows 2022 Mingw-w64 GCC i686",
           os: windows-2022,
           compiler: g++,
-          comp: gcc,
+          comp: mingw,
           run_32bit_tests: true,
           msys_sys: 'mingw32',
-          msys_env: 'i686',
+          msys_env: 'i686-gcc',
           shell: 'msys2 {0}'
         }
       - {
@@ -105,7 +105,7 @@ jobs:
           comp: clang,
           run_64bit_tests: true,
           msys_sys: 'clang64',
-          msys_env: 'clang-x86_64',
+          msys_env: 'clang-x86_64-clang',
           shell: 'msys2 {0}'
         }
@@ -129,7 +129,7 @@ jobs:
         uses: msys2/setup-msys2@v2
         with:
           msystem: ${{matrix.config.msys_sys}}
-          install: mingw-w64-${{matrix.config.msys_env}}-${{matrix.config.comp}} make git expect
+          install: mingw-w64-${{matrix.config.msys_env}} make git expect
       - name: Download the used network from the fishtest framework
         run: |

AUTHORS

@@ -31,6 +31,7 @@ Arjun Temurnikar
 Artem Solopiy (EntityFX)
 Auguste Pop
 Balint Pfliegel
+Ben Chaney (Chaneybenjamini)
 Ben Koshy (BKSpurgeon)
 Bill Henry (VoyagerOne)
 Bojun Guo (noobpwnftw, Nooby)

src/nnue/nnue_feature_transformer.h

@@ -47,12 +47,22 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm512_store_si512(a,b)
   #define vec_add_16(a,b) _mm512_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
+  #define vec_zero() _mm512_setzero_epi32()
+  #define vec_set_16(a) _mm512_set1_epi16(a)
+  #define vec_max_16(a,b) _mm512_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm512_min_epi16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
+      return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
+  }
   #define vec_load_psqt(a) _mm256_load_si256(a)
   #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
   #define NumRegistersSIMD 32
+  #define MaxChunkSize 64
  #elif USE_AVX2
   typedef __m256i vec_t;
@@ -61,12 +71,22 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm256_store_si256(a,b)
   #define vec_add_16(a,b) _mm256_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
+  #define vec_zero() _mm256_setzero_si256()
+  #define vec_set_16(a) _mm256_set1_epi16(a)
+  #define vec_max_16(a,b) _mm256_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm256_min_epi16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
+      return _mm256_permute4x64_epi64(compacted, 0b11011000);
+  }
   #define vec_load_psqt(a) _mm256_load_si256(a)
   #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
   #define NumRegistersSIMD 16
+  #define MaxChunkSize 32
  #elif USE_SSE2
   typedef __m128i vec_t;
@@ -75,12 +95,19 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
+  #define vec_zero() _mm_setzero_si128()
+  #define vec_set_16(a) _mm_set1_epi16(a)
+  #define vec_max_16(a,b) _mm_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm_min_epi16(a,b)
+  #define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si128()
   #define NumRegistersSIMD (Is64Bit ? 16 : 8)
+  #define MaxChunkSize 16
  #elif USE_MMX
   typedef __m64 vec_t;
@@ -89,12 +116,26 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_pi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
+  #define vec_zero() _mm_setzero_si64()
+  #define vec_set_16(a) _mm_set1_pi16(a)
+  inline vec_t vec_max_16(vec_t a,vec_t b){
+      vec_t comparison = _mm_cmpgt_pi16(a,b);
+      return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
+  }
+  inline vec_t vec_min_16(vec_t a,vec_t b){
+      vec_t comparison = _mm_cmpgt_pi16(a,b);
+      return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
+  }
+  #define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si64()
+  #define vec_cleanup() _mm_empty()
   #define NumRegistersSIMD 8
+  #define MaxChunkSize 8
  #elif USE_NEON
   typedef int16x8_t vec_t;
@@ -103,12 +144,24 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) vaddq_s16(a,b)
   #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_mul_16(a,b) vmulq_s16(a,b)
+  #define vec_zero() vec_t{0}
+  #define vec_set_16(a) vdupq_n_s16(a)
+  #define vec_max_16(a,b) vmaxq_s16(a,b)
+  #define vec_min_16(a,b) vminq_s16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      const int8x8_t shifta = vshrn_n_s16(a, 7);
+      const int8x8_t shiftb = vshrn_n_s16(b, 7);
+      const int8x16_t compacted = vcombine_s8(shifta,shiftb);
+      return *reinterpret_cast<const vec_t*> (&compacted);
+  }
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) vaddq_s32(a,b)
   #define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
   #define vec_zero_psqt() psqt_vec_t{0}
   #define NumRegistersSIMD 16
+  #define MaxChunkSize 16
  #else
   #undef VECTOR
@@ -235,110 +288,30 @@ namespace Stockfish::Eval::NNUE {
       {
           const IndexType offset = (HalfDimensions / 2) * p;

-#if defined(USE_AVX512)
+#if defined(VECTOR)
-          constexpr IndexType OutputChunkSize = 512 / 8;
+          constexpr IndexType OutputChunkSize = MaxChunkSize;
           static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
           constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m512i Zero = _mm512_setzero_si512();
-          const __m512i One = _mm512_set1_epi16(127);
-          const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+          vec_t Zero = vec_zero();
+          vec_t One = vec_set_16(127);
-          const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
-          const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m512i* out = reinterpret_cast<      __m512i*>(output + offset);
+          const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
+          const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                vec_t* out = reinterpret_cast<      vec_t*>(output + offset);
           for (IndexType j = 0; j < NumOutputChunks; j += 1)
           {
-              const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
+              const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
+              const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
+              const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
+              const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);
-              const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
-              const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
+              const vec_t pa = vec_mul_16(sum0a, sum1a);
+              const vec_t pb = vec_mul_16(sum0b, sum1b);
-              out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
+              out[j] = vec_msb_pack_16(pa, pb);
-          }
-#elif defined(USE_AVX2)
-          constexpr IndexType OutputChunkSize = 256 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m256i Zero = _mm256_setzero_si256();
-          const __m256i One = _mm256_set1_epi16(127);
-          constexpr int Control = 0b11011000;
-          const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
-          const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m256i* out = reinterpret_cast<      __m256i*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
-              const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
-              const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
-              out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
-          }
-#elif defined(USE_SSE2)
-          constexpr IndexType OutputChunkSize = 128 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m128i Zero = _mm_setzero_si128();
-          const __m128i One = _mm_set1_epi16(127);
-          const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
-          const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m128i* out = reinterpret_cast<      __m128i*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
-              const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
-              const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
-              out[j] = _mm_packs_epi16(pa, pb);
-          }
-#elif defined(USE_NEON)
-          constexpr IndexType OutputChunkSize = 128 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const int16x8_t Zero = vdupq_n_s16(0);
-          const int16x8_t One = vdupq_n_s16(127);
-          const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
-          const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                int8x16_t* out = reinterpret_cast<      int8x16_t*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
-              const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
-              const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
-              const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
-              const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
-              const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
-              out[j] = vcombine_s8(pa, pb);
           }

 #else
@@ -354,6 +327,10 @@ namespace Stockfish::Eval::NNUE {
   #endif
       }

+  #if defined(vec_cleanup)
+      vec_cleanup();
+  #endif
+
       return psqt;
   } // end of function transform()
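
For reference, the generalized loop above computes, for each output value, a
clamped product of the two accumulator halves. A scalar sketch of the same
arithmetic follows (an editor's illustration; transform_reference, acc, and
halfDim are invented names, with halfDim standing in for HalfDimensions / 2):

#include <algorithm>
#include <cstdint>

// Scalar equivalent of the generic vec_t loop for one perspective.
void transform_reference(const int16_t* acc, int8_t* out, int halfDim) {
    for (int i = 0; i < halfDim; ++i) {
        // Clipped ReLU: clamp both halves of the accumulator to [0, 127].
        int sum0 = std::clamp<int>(acc[i], 0, 127);
        int sum1 = std::clamp<int>(acc[i + halfDim], 0, 127);
        // vec_mul_16 forms the 16-bit product and vec_msb_pack_16 keeps its
        // high bits, so the result is (sum0 * sum1) >> 7, which fits an int8.
        out[i] = static_cast<int8_t>((sum0 * sum1) >> 7);
    }
}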