mirror of
https://github.com/sockspls/badfish
synced 2025-04-30 16:53:09 +00:00
Use packed 32-bit MMX operations for updating the PSQT accumulator
This slightly improves the speed of NNUE on the old hardware that this code path is intended for, such as a 1.13 GHz Pentium III. 10 repeats of "./stockfish bench 16 1 13 default depth NNUE": Before: 54 642 504 897 cycles (± 0.12%), 62 301 937 829 instructions (± 0.03%). After: 54 320 821 928 cycles (± 0.13%), 62 084 742 699 instructions (± 0.02%). Speed of "go depth 20" from startpos: Before: 53103 nps. After: 53856 nps. Closes https://github.com/official-stockfish/Stockfish/pull/3476 — No functional change.
This commit is contained in:
parent
0faf81d1f6
commit
038487f954
1 changed file with 5 additions and 5 deletions
|
@@ -84,18 +84,18 @@ namespace Stockfish::Eval::NNUE {
|
|||
|
||||
#elif USE_MMX
|
||||
typedef __m64 vec_t;
|
||||
typedef std::int32_t psqt_vec_t;
|
||||
typedef __m64 psqt_vec_t;
|
||||
#define vec_load(a) (*(a))
|
||||
#define vec_store(a,b) *(a)=(b)
|
||||
#define vec_add_16(a,b) _mm_add_pi16(a,b)
|
||||
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
|
||||
#define vec_load_psqt(a) (*(a))
|
||||
#define vec_store_psqt(a,b) *(a)=(b)
|
||||
#define vec_add_psqt_32(a,b) a+b
|
||||
#define vec_sub_psqt_32(a,b) a-b
|
||||
#define vec_zero_psqt() 0
|
||||
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
|
||||
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
|
||||
#define vec_zero_psqt() _mm_setzero_si64()
|
||||
static constexpr IndexType NumRegs = 8;
|
||||
static constexpr IndexType NumPsqtRegs = 8;
|
||||
static constexpr IndexType NumPsqtRegs = 4;
|
||||
|
||||
#elif USE_NEON
|
||||
typedef int16x8_t vec_t;
|
||||
|
|
Loading…
Add table
Reference in a new issue