mirror of
https://github.com/sockspls/badfish
synced 2025-06-28 00:19:50 +00:00
Generalize the feature transform to use vec_t macros
This commit generalizes the feature transform to use vec_t macros that are architecture defined instead of using a separate code path for each one. It should make some old architectures (MMX, including improvements by Fanael) faster and make further such improvements easier in the future. Includes some corrections to CI for mingw. closes https://github.com/official-stockfish/Stockfish/pull/3955 closes https://github.com/official-stockfish/Stockfish/pull/3928 No functional change
This commit is contained in:
parent
4ac7d726ec
commit
270a0e737f
3 changed files with 78 additions and 100 deletions
12
.github/workflows/stockfish.yml
vendored
12
.github/workflows/stockfish.yml
vendored
|
@ -82,20 +82,20 @@ jobs:
|
||||||
name: "Windows 2022 Mingw-w64 GCC x86_64",
|
name: "Windows 2022 Mingw-w64 GCC x86_64",
|
||||||
os: windows-2022,
|
os: windows-2022,
|
||||||
compiler: g++,
|
compiler: g++,
|
||||||
comp: gcc,
|
comp: mingw,
|
||||||
run_64bit_tests: true,
|
run_64bit_tests: true,
|
||||||
msys_sys: 'mingw64',
|
msys_sys: 'mingw64',
|
||||||
msys_env: 'x86_64',
|
msys_env: 'x86_64-gcc',
|
||||||
shell: 'msys2 {0}'
|
shell: 'msys2 {0}'
|
||||||
}
|
}
|
||||||
- {
|
- {
|
||||||
name: "Windows 2022 Mingw-w64 GCC i686",
|
name: "Windows 2022 Mingw-w64 GCC i686",
|
||||||
os: windows-2022,
|
os: windows-2022,
|
||||||
compiler: g++,
|
compiler: g++,
|
||||||
comp: gcc,
|
comp: mingw,
|
||||||
run_32bit_tests: true,
|
run_32bit_tests: true,
|
||||||
msys_sys: 'mingw32',
|
msys_sys: 'mingw32',
|
||||||
msys_env: 'i686',
|
msys_env: 'i686-gcc',
|
||||||
shell: 'msys2 {0}'
|
shell: 'msys2 {0}'
|
||||||
}
|
}
|
||||||
- {
|
- {
|
||||||
|
@ -105,7 +105,7 @@ jobs:
|
||||||
comp: clang,
|
comp: clang,
|
||||||
run_64bit_tests: true,
|
run_64bit_tests: true,
|
||||||
msys_sys: 'clang64',
|
msys_sys: 'clang64',
|
||||||
msys_env: 'clang-x86_64',
|
msys_env: 'clang-x86_64-clang',
|
||||||
shell: 'msys2 {0}'
|
shell: 'msys2 {0}'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -129,7 +129,7 @@ jobs:
|
||||||
uses: msys2/setup-msys2@v2
|
uses: msys2/setup-msys2@v2
|
||||||
with:
|
with:
|
||||||
msystem: ${{matrix.config.msys_sys}}
|
msystem: ${{matrix.config.msys_sys}}
|
||||||
install: mingw-w64-${{matrix.config.msys_env}}-${{matrix.config.comp}} make git expect
|
install: mingw-w64-${{matrix.config.msys_env}} make git expect
|
||||||
|
|
||||||
- name: Download the used network from the fishtest framework
|
- name: Download the used network from the fishtest framework
|
||||||
run: |
|
run: |
|
||||||
|
|
1
AUTHORS
1
AUTHORS
|
@ -31,6 +31,7 @@ Arjun Temurnikar
|
||||||
Artem Solopiy (EntityFX)
|
Artem Solopiy (EntityFX)
|
||||||
Auguste Pop
|
Auguste Pop
|
||||||
Balint Pfliegel
|
Balint Pfliegel
|
||||||
|
Ben Chaney (Chaneybenjamini)
|
||||||
Ben Koshy (BKSpurgeon)
|
Ben Koshy (BKSpurgeon)
|
||||||
Bill Henry (VoyagerOne)
|
Bill Henry (VoyagerOne)
|
||||||
Bojun Guo (noobpwnftw, Nooby)
|
Bojun Guo (noobpwnftw, Nooby)
|
||||||
|
|
|
@ -47,12 +47,22 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#define vec_store(a,b) _mm512_store_si512(a,b)
|
#define vec_store(a,b) _mm512_store_si512(a,b)
|
||||||
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
|
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
|
||||||
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
|
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
|
||||||
|
#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
|
||||||
|
#define vec_zero() _mm512_setzero_epi32()
|
||||||
|
#define vec_set_16(a) _mm512_set1_epi16(a)
|
||||||
|
#define vec_max_16(a,b) _mm512_max_epi16(a,b)
|
||||||
|
#define vec_min_16(a,b) _mm512_min_epi16(a,b)
|
||||||
|
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
|
||||||
|
vec_t compacted = _mm512_packs_epi16(_mm512_srli_epi16(a,7),_mm512_srli_epi16(b,7));
|
||||||
|
return _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), compacted);
|
||||||
|
}
|
||||||
#define vec_load_psqt(a) _mm256_load_si256(a)
|
#define vec_load_psqt(a) _mm256_load_si256(a)
|
||||||
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
|
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
|
||||||
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
|
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
|
||||||
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
|
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
|
||||||
#define vec_zero_psqt() _mm256_setzero_si256()
|
#define vec_zero_psqt() _mm256_setzero_si256()
|
||||||
#define NumRegistersSIMD 32
|
#define NumRegistersSIMD 32
|
||||||
|
#define MaxChunkSize 64
|
||||||
|
|
||||||
#elif USE_AVX2
|
#elif USE_AVX2
|
||||||
typedef __m256i vec_t;
|
typedef __m256i vec_t;
|
||||||
|
@ -61,12 +71,22 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#define vec_store(a,b) _mm256_store_si256(a,b)
|
#define vec_store(a,b) _mm256_store_si256(a,b)
|
||||||
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
|
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
|
||||||
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
|
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
|
||||||
|
#define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
|
||||||
|
#define vec_zero() _mm256_setzero_si256()
|
||||||
|
#define vec_set_16(a) _mm256_set1_epi16(a)
|
||||||
|
#define vec_max_16(a,b) _mm256_max_epi16(a,b)
|
||||||
|
#define vec_min_16(a,b) _mm256_min_epi16(a,b)
|
||||||
|
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
|
||||||
|
vec_t compacted = _mm256_packs_epi16(_mm256_srli_epi16(a,7), _mm256_srli_epi16(b,7));
|
||||||
|
return _mm256_permute4x64_epi64(compacted, 0b11011000);
|
||||||
|
}
|
||||||
#define vec_load_psqt(a) _mm256_load_si256(a)
|
#define vec_load_psqt(a) _mm256_load_si256(a)
|
||||||
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
|
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
|
||||||
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
|
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
|
||||||
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
|
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
|
||||||
#define vec_zero_psqt() _mm256_setzero_si256()
|
#define vec_zero_psqt() _mm256_setzero_si256()
|
||||||
#define NumRegistersSIMD 16
|
#define NumRegistersSIMD 16
|
||||||
|
#define MaxChunkSize 32
|
||||||
|
|
||||||
#elif USE_SSE2
|
#elif USE_SSE2
|
||||||
typedef __m128i vec_t;
|
typedef __m128i vec_t;
|
||||||
|
@ -75,12 +95,19 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#define vec_store(a,b) *(a)=(b)
|
#define vec_store(a,b) *(a)=(b)
|
||||||
#define vec_add_16(a,b) _mm_add_epi16(a,b)
|
#define vec_add_16(a,b) _mm_add_epi16(a,b)
|
||||||
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
|
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
|
||||||
|
#define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
|
||||||
|
#define vec_zero() _mm_setzero_si128()
|
||||||
|
#define vec_set_16(a) _mm_set1_epi16(a)
|
||||||
|
#define vec_max_16(a,b) _mm_max_epi16(a,b)
|
||||||
|
#define vec_min_16(a,b) _mm_min_epi16(a,b)
|
||||||
|
#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
|
||||||
#define vec_load_psqt(a) (*(a))
|
#define vec_load_psqt(a) (*(a))
|
||||||
#define vec_store_psqt(a,b) *(a)=(b)
|
#define vec_store_psqt(a,b) *(a)=(b)
|
||||||
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
|
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
|
||||||
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
|
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
|
||||||
#define vec_zero_psqt() _mm_setzero_si128()
|
#define vec_zero_psqt() _mm_setzero_si128()
|
||||||
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
|
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
|
||||||
|
#define MaxChunkSize 16
|
||||||
|
|
||||||
#elif USE_MMX
|
#elif USE_MMX
|
||||||
typedef __m64 vec_t;
|
typedef __m64 vec_t;
|
||||||
|
@ -89,12 +116,26 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#define vec_store(a,b) *(a)=(b)
|
#define vec_store(a,b) *(a)=(b)
|
||||||
#define vec_add_16(a,b) _mm_add_pi16(a,b)
|
#define vec_add_16(a,b) _mm_add_pi16(a,b)
|
||||||
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
|
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
|
||||||
|
#define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
|
||||||
|
#define vec_zero() _mm_setzero_si64()
|
||||||
|
#define vec_set_16(a) _mm_set1_pi16(a)
|
||||||
|
inline vec_t vec_max_16(vec_t a,vec_t b){
|
||||||
|
vec_t comparison = _mm_cmpgt_pi16(a,b);
|
||||||
|
return _mm_or_si64(_mm_and_si64(comparison, a), _mm_andnot_si64(comparison, b));
|
||||||
|
}
|
||||||
|
inline vec_t vec_min_16(vec_t a,vec_t b){
|
||||||
|
vec_t comparison = _mm_cmpgt_pi16(a,b);
|
||||||
|
return _mm_or_si64(_mm_and_si64(comparison, b), _mm_andnot_si64(comparison, a));
|
||||||
|
}
|
||||||
|
#define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
|
||||||
#define vec_load_psqt(a) (*(a))
|
#define vec_load_psqt(a) (*(a))
|
||||||
#define vec_store_psqt(a,b) *(a)=(b)
|
#define vec_store_psqt(a,b) *(a)=(b)
|
||||||
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
|
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
|
||||||
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
|
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
|
||||||
#define vec_zero_psqt() _mm_setzero_si64()
|
#define vec_zero_psqt() _mm_setzero_si64()
|
||||||
|
#define vec_cleanup() _mm_empty()
|
||||||
#define NumRegistersSIMD 8
|
#define NumRegistersSIMD 8
|
||||||
|
#define MaxChunkSize 8
|
||||||
|
|
||||||
#elif USE_NEON
|
#elif USE_NEON
|
||||||
typedef int16x8_t vec_t;
|
typedef int16x8_t vec_t;
|
||||||
|
@ -103,12 +144,24 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#define vec_store(a,b) *(a)=(b)
|
#define vec_store(a,b) *(a)=(b)
|
||||||
#define vec_add_16(a,b) vaddq_s16(a,b)
|
#define vec_add_16(a,b) vaddq_s16(a,b)
|
||||||
#define vec_sub_16(a,b) vsubq_s16(a,b)
|
#define vec_sub_16(a,b) vsubq_s16(a,b)
|
||||||
|
#define vec_mul_16(a,b) vmulq_s16(a,b)
|
||||||
|
#define vec_zero() vec_t{0}
|
||||||
|
#define vec_set_16(a) vdupq_n_s16(a)
|
||||||
|
#define vec_max_16(a,b) vmaxq_s16(a,b)
|
||||||
|
#define vec_min_16(a,b) vminq_s16(a,b)
|
||||||
|
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
|
||||||
|
const int8x8_t shifta = vshrn_n_s16(a, 7);
|
||||||
|
const int8x8_t shiftb = vshrn_n_s16(b, 7);
|
||||||
|
const int8x16_t compacted = vcombine_s8(shifta,shiftb);
|
||||||
|
return *reinterpret_cast<const vec_t*> (&compacted);
|
||||||
|
}
|
||||||
#define vec_load_psqt(a) (*(a))
|
#define vec_load_psqt(a) (*(a))
|
||||||
#define vec_store_psqt(a,b) *(a)=(b)
|
#define vec_store_psqt(a,b) *(a)=(b)
|
||||||
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
|
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
|
||||||
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
|
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
|
||||||
#define vec_zero_psqt() psqt_vec_t{0}
|
#define vec_zero_psqt() psqt_vec_t{0}
|
||||||
#define NumRegistersSIMD 16
|
#define NumRegistersSIMD 16
|
||||||
|
#define MaxChunkSize 16
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#undef VECTOR
|
#undef VECTOR
|
||||||
|
@ -235,110 +288,30 @@ namespace Stockfish::Eval::NNUE {
|
||||||
{
|
{
|
||||||
const IndexType offset = (HalfDimensions / 2) * p;
|
const IndexType offset = (HalfDimensions / 2) * p;
|
||||||
|
|
||||||
#if defined(USE_AVX512)
|
#if defined(VECTOR)
|
||||||
|
|
||||||
constexpr IndexType OutputChunkSize = 512 / 8;
|
constexpr IndexType OutputChunkSize = MaxChunkSize;
|
||||||
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
|
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
|
||||||
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
|
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
|
||||||
|
|
||||||
const __m512i Zero = _mm512_setzero_si512();
|
vec_t Zero = vec_zero();
|
||||||
const __m512i One = _mm512_set1_epi16(127);
|
vec_t One = vec_set_16(127);
|
||||||
const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
|
|
||||||
|
|
||||||
const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
|
const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
|
||||||
const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
|
const vec_t* in1 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
|
||||||
__m512i* out = reinterpret_cast< __m512i*>(output + offset);
|
vec_t* out = reinterpret_cast< vec_t*>(output + offset);
|
||||||
|
|
||||||
for (IndexType j = 0; j < NumOutputChunks; j += 1)
|
for (IndexType j = 0; j < NumOutputChunks; j += 1)
|
||||||
{
|
{
|
||||||
const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
|
const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero);
|
||||||
const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
|
const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero);
|
||||||
const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
|
const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero);
|
||||||
const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
|
const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero);
|
||||||
|
|
||||||
const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
|
const vec_t pa = vec_mul_16(sum0a, sum1a);
|
||||||
const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
|
const vec_t pb = vec_mul_16(sum0b, sum1b);
|
||||||
|
|
||||||
out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
|
out[j] = vec_msb_pack_16(pa, pb);
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_AVX2)
|
|
||||||
|
|
||||||
constexpr IndexType OutputChunkSize = 256 / 8;
|
|
||||||
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
|
|
||||||
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
|
|
||||||
|
|
||||||
const __m256i Zero = _mm256_setzero_si256();
|
|
||||||
const __m256i One = _mm256_set1_epi16(127);
|
|
||||||
constexpr int Control = 0b11011000;
|
|
||||||
|
|
||||||
const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
|
|
||||||
const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
|
|
||||||
__m256i* out = reinterpret_cast< __m256i*>(output + offset);
|
|
||||||
|
|
||||||
for (IndexType j = 0; j < NumOutputChunks; j += 1)
|
|
||||||
{
|
|
||||||
const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
|
|
||||||
const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
|
|
||||||
const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
|
|
||||||
const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
|
|
||||||
|
|
||||||
const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
|
|
||||||
const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
|
|
||||||
|
|
||||||
out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_SSE2)
|
|
||||||
|
|
||||||
constexpr IndexType OutputChunkSize = 128 / 8;
|
|
||||||
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
|
|
||||||
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
|
|
||||||
|
|
||||||
const __m128i Zero = _mm_setzero_si128();
|
|
||||||
const __m128i One = _mm_set1_epi16(127);
|
|
||||||
|
|
||||||
const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
|
|
||||||
const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
|
|
||||||
__m128i* out = reinterpret_cast< __m128i*>(output + offset);
|
|
||||||
|
|
||||||
for (IndexType j = 0; j < NumOutputChunks; j += 1)
|
|
||||||
{
|
|
||||||
const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
|
|
||||||
const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
|
|
||||||
const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
|
|
||||||
const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
|
|
||||||
|
|
||||||
const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
|
|
||||||
const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
|
|
||||||
|
|
||||||
out[j] = _mm_packs_epi16(pa, pb);
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_NEON)
|
|
||||||
|
|
||||||
constexpr IndexType OutputChunkSize = 128 / 8;
|
|
||||||
static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
|
|
||||||
constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
|
|
||||||
|
|
||||||
const int16x8_t Zero = vdupq_n_s16(0);
|
|
||||||
const int16x8_t One = vdupq_n_s16(127);
|
|
||||||
|
|
||||||
const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
|
|
||||||
const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
|
|
||||||
int8x16_t* out = reinterpret_cast< int8x16_t*>(output + offset);
|
|
||||||
|
|
||||||
for (IndexType j = 0; j < NumOutputChunks; j += 1)
|
|
||||||
{
|
|
||||||
const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
|
|
||||||
const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
|
|
||||||
const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
|
|
||||||
const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
|
|
||||||
|
|
||||||
const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
|
|
||||||
const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
|
|
||||||
|
|
||||||
out[j] = vcombine_s8(pa, pb);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
@ -354,6 +327,10 @@ namespace Stockfish::Eval::NNUE {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(vec_cleanup)
|
||||||
|
vec_cleanup();
|
||||||
|
#endif
|
||||||
|
|
||||||
return psqt;
|
return psqt;
|
||||||
|
|
||||||
} // end of function transform()
|
} // end of function transform()
|
||||||
|
|
Loading…
Add table
Reference in a new issue