
Generalize the feature transform to use vec_t macros

This commit generalizes the feature transform to use vec_t macros
that are defined per architecture, instead of using a separate code path for each one.

It should make some old architectures (MMX, including improvements by Fanael) faster
and make further such improvements easier in the future.
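
As a rough illustration of the pattern (an editor's sketch, not the actual
Stockfish header: the vec_add_16 name mirrors the diff below, while the
scalar fallback and the add_rows/numChunks names are invented for the
example), each architecture defines the same names once, and generic code
is then written a single time against them:

// Minimal sketch of per-architecture vec_t macros with one generic loop.
#include <cstdint>

#if defined(USE_AVX2)
    #include <immintrin.h>
    typedef __m256i vec_t;
    #define vec_add_16(a, b) _mm256_add_epi16(a, b)
#elif defined(USE_SSE2)
    #include <emmintrin.h>
    typedef __m128i vec_t;
    #define vec_add_16(a, b) _mm_add_epi16(a, b)
#else
    // Scalar fallback so the sketch compiles without SIMD flags
    // (the real header instead disables the vector path via #undef VECTOR).
    typedef int16_t vec_t;
    #define vec_add_16(a, b) static_cast<int16_t>((a) + (b))
#endif

// One accumulator update shared by every architecture above.
void add_rows(vec_t* acc, const vec_t* weights, int numChunks) {
    for (int j = 0; j < numChunks; ++j)
        acc[j] = vec_add_16(acc[j], weights[j]);
}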

Includes some corrections to CI for mingw.

closes https://github.com/official-stockfish/Stockfish/pull/3955
closes https://github.com/official-stockfish/Stockfish/pull/3928

No functional change
Ben Chaney 2022-03-01 17:49:02 -05:00, committed by Joost VandeVondele
parent 4ac7d726ec
commit 270a0e737f
3 changed files with 78 additions and 100 deletions

.github/workflows/stockfish.yml

@@ -82,20 +82,20 @@ jobs:
           name: "Windows 2022 Mingw-w64 GCC x86_64",
           os: windows-2022,
           compiler: g++,
-          comp: gcc,
+          comp: mingw,
           run_64bit_tests: true,
           msys_sys: 'mingw64',
-          msys_env: 'x86_64',
+          msys_env: 'x86_64-gcc',
           shell: 'msys2 {0}'
         }
       - {
           name: "Windows 2022 Mingw-w64 GCC i686",
           os: windows-2022,
           compiler: g++,
-          comp: gcc,
+          comp: mingw,
           run_32bit_tests: true,
           msys_sys: 'mingw32',
-          msys_env: 'i686',
+          msys_env: 'i686-gcc',
           shell: 'msys2 {0}'
         }
       - {
@@ -105,7 +105,7 @@ jobs:
           comp: clang,
           run_64bit_tests: true,
           msys_sys: 'clang64',
-          msys_env: 'clang-x86_64',
+          msys_env: 'clang-x86_64-clang',
           shell: 'msys2 {0}'
         }
@@ -129,7 +129,7 @@ jobs:
         uses: msys2/setup-msys2@v2
         with:
           msystem: ${{matrix.config.msys_sys}}
-          install: mingw-w64-${{matrix.config.msys_env}}-${{matrix.config.comp}} make git expect
+          install: mingw-w64-${{matrix.config.msys_env}} make git expect
       - name: Download the used network from the fishtest framework
         run: |

AUTHORS

@@ -31,6 +31,7 @@ Arjun Temurnikar
 Artem Solopiy (EntityFX)
 Auguste Pop
 Balint Pfliegel
+Ben Chaney (Chaneybenjamini)
 Ben Koshy (BKSpurgeon)
 Bill Henry (VoyagerOne)
 Bojun Guo (noobpwnftw, Nooby)

src/nnue/nnue_feature_transformer.h

@@ -47,12 +47,22 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm512_store_si512(a,b)
   #define vec_add_16(a,b) _mm512_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
+  #define vec_zero() _mm512_setzero_epi32()
+  #define vec_set_16(a) _mm512_set1_epi16(a)
+  #define vec_max_16(a,b) _mm512_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm512_min_epi16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
+      return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
+  }
   #define vec_load_psqt(a) _mm256_load_si256(a)
   #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
   #define NumRegistersSIMD 32
+  #define MaxChunkSize 64
  #elif USE_AVX2
   typedef __m256i vec_t;
@@ -61,12 +71,22 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm256_store_si256(a,b)
   #define vec_add_16(a,b) _mm256_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
+  #define vec_zero() _mm256_setzero_si256()
+  #define vec_set_16(a) _mm256_set1_epi16(a)
+  #define vec_max_16(a,b) _mm256_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm256_min_epi16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
+      return _mm256_permute4x64_epi64(compacted, 0b11011000);
+  }
   #define vec_load_psqt(a) _mm256_load_si256(a)
   #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
   #define NumRegistersSIMD 16
+  #define MaxChunkSize 32
  #elif USE_SSE2
   typedef __m128i vec_t;
@@ -75,12 +95,19 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
+  #define vec_zero() _mm_setzero_si128()
+  #define vec_set_16(a) _mm_set1_epi16(a)
+  #define vec_max_16(a,b) _mm_max_epi16(a,b)
+  #define vec_min_16(a,b) _mm_min_epi16(a,b)
+  #define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si128()
   #define NumRegistersSIMD (Is64Bit ? 16 : 8)
+  #define MaxChunkSize 16
  #elif USE_MMX
   typedef __m64 vec_t;
@@ -89,12 +116,26 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_pi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
+  #define vec_zero() _mm_setzero_si64()
+  #define vec_set_16(a) _mm_set1_pi16(a)
+  inline vec_t vec_max_16(vec_t a,vec_t b){
+      vec_t comparison = _mm_cmpgt_pi16(a,b);
+      return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
+  }
+  inline vec_t vec_min_16(vec_t a,vec_t b){
+      vec_t comparison = _mm_cmpgt_pi16(a,b);
+      return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
+  }
+  #define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si64()
+  #define vec_cleanup() _mm_empty()
   #define NumRegistersSIMD 8
+  #define MaxChunkSize 8
  #elif USE_NEON
   typedef int16x8_t vec_t;
@@ -103,12 +144,24 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) vaddq_s16(a,b)
   #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_mul_16(a,b) vmulq_s16(a,b)
+  #define vec_zero() vec_t{0}
+  #define vec_set_16(a) vdupq_n_s16(a)
+  #define vec_max_16(a,b) vmaxq_s16(a,b)
+  #define vec_min_16(a,b) vminq_s16(a,b)
+  inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
+      const int8x8_t shifta = vshrn_n_s16(a, 7);
+      const int8x8_t shiftb = vshrn_n_s16(b, 7);
+      const int8x16_t compacted = vcombine_s8(shifta,shiftb);
+      return *reinterpret_cast<const vec_t*> (&compacted);
+  }
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
   #define vec_add_psqt_32(a,b) vaddq_s32(a,b)
   #define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
   #define vec_zero_psqt() psqt_vec_t{0}
   #define NumRegistersSIMD 16
+  #define MaxChunkSize 16
  #else
   #undef VECTOR
@@ -235,110 +288,30 @@ namespace Stockfish::Eval::NNUE {
       {
           const IndexType offset = (HalfDimensions / 2) * p;

-#if defined(USE_AVX512)
+#if defined(VECTOR)
-          constexpr IndexType OutputChunkSize = 512 / 8;
+          constexpr IndexType OutputChunkSize = MaxChunkSize;
           static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
           constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m512i Zero = _mm512_setzero_si512();
-          const __m512i One = _mm512_set1_epi16(127);
-          const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+          vec_t Zero = vec_zero();
+          vec_t One = vec_set_16(127);
-          const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
-          const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m512i* out = reinterpret_cast<      __m512i*>(output + offset);
+          const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
+          const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                vec_t* out = reinterpret_cast<      vec_t*>(output + offset);
           for (IndexType j = 0; j < NumOutputChunks; j += 1)
           {
-              const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
+              const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
+              const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
+              const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
+              const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);
-              const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
-              const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
+              const vec_t pa = vec_mul_16(sum0a, sum1a);
+              const vec_t pb = vec_mul_16(sum0b, sum1b);
-              out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
+              out[j] = vec_msb_pack_16(pa, pb);
-          }
-#elif defined(USE_AVX2)
-          constexpr IndexType OutputChunkSize = 256 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m256i Zero = _mm256_setzero_si256();
-          const __m256i One = _mm256_set1_epi16(127);
-          constexpr int Control = 0b11011000;
-          const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
-          const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m256i* out = reinterpret_cast<      __m256i*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
-              const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
-              const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
-              out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
-          }
-#elif defined(USE_SSE2)
-          constexpr IndexType OutputChunkSize = 128 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const __m128i Zero = _mm_setzero_si128();
-          const __m128i One = _mm_set1_epi16(127);
-          const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
-          const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                __m128i* out = reinterpret_cast<      __m128i*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
-              const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
-              const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
-              const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
-              const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
-              const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
-              out[j] = _mm_packs_epi16(pa, pb);
-          }
-#elif defined(USE_NEON)
-          constexpr IndexType OutputChunkSize = 128 / 8;
-          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
-          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
-          const int16x8_t Zero = vdupq_n_s16(0);
-          const int16x8_t One = vdupq_n_s16(127);
-          const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
-          const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
-                int8x16_t* out = reinterpret_cast<      int8x16_t*>(output + offset);
-          for (IndexType j = 0; j < NumOutputChunks; j += 1)
-          {
-              const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
-              const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
-              const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
-              const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
-              const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
-              const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
-              out[j] = vcombine_s8(pa, pb);
           }

 #else
@@ -354,6 +327,10 @@ namespace Stockfish::Eval::NNUE {
   #endif
       }

+  #if defined(vec_cleanup)
+      vec_cleanup();
+  #endif
+
       return psqt;
   } // end of function transform()
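
For reference, the generalized loop above computes, for each output value, a
clamped product of the two accumulator halves. A scalar sketch of the same
arithmetic follows (an editor's illustration; transform_reference, acc, and
halfDim are invented names, with halfDim standing in for HalfDimensions / 2):

#include <algorithm>
#include <cstdint>

// Scalar equivalent of the generic vec_t loop for one perspective.
void transform_reference(const int16_t* acc, int8_t* out, int halfDim) {
    for (int i = 0; i < halfDim; ++i) {
        // Clipped ReLU: clamp both halves of the accumulator to [0, 127].
        int sum0 = std::clamp<int>(acc[i], 0, 127);
        int sum1 = std::clamp<int>(acc[i + halfDim], 0, 127);
        // vec_mul_16 forms the 16-bit product and vec_msb_pack_16 keeps its
        // high bits, so the result is (sum0 * sum1) >> 7, which fits an int8.
        out[i] = static_cast<int8_t>((sum0 * sum1) >> 7);
    }
}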