mirror of
https://github.com/sockspls/badfish
synced 2025-04-30 16:53:09 +00:00
Add support for ARM dot product instructions
The sdot instruction computes (and accumulates) a signed dot product, which is quite handy for Stockfish's NNUE code. The instruction is optional for Armv8.2 and Armv8.3, and mandatory for Armv8.4 and above.

The commit adds a new 'armv8-dotprod' architecture with dot product support enabled. It also enables dot product support for the existing 'apple-silicon' architecture, which is at least Armv8.5.

The following local speed test was performed on an Apple M1 with ARCH=apple-silicon. I had to remove CPU pinning from the benchmark script. However, the results were still consistent: checking both binaries against themselves reported a speedup of +0.0000 and +0.0005, respectively.

```
Result of 100 runs
==================
base (...ish.037ef3e1) = 1917997 +/- 7152
test (...fish.dotprod) = 2159682 +/- 9066
diff = +241684 +/- 2923

speedup = +0.1260
P(speedup > 0) = 1.0000

CPU: 10 x arm
Hyperthreading: off
```

Fixes #4193

closes https://github.com/official-stockfish/Stockfish/pull/4400

No functional change
This commit is contained in:
parent
037ef3e18d
commit
b4ad3a3c4b
3 changed files with 78 additions and 24 deletions
65
src/Makefile
65
src/Makefile
|
@ -69,32 +69,33 @@ VPATH = syzygy:nnue:nnue/features
|
|||
### Section 2. High-level Configuration
|
||||
### ==========================================================================
|
||||
#
|
||||
# flag --- Comp switch --- Description
|
||||
# flag --- Comp switch --- Description
|
||||
# ----------------------------------------------------------------------------
|
||||
#
|
||||
# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode
|
||||
# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode
|
||||
# sanitize = none/<sanitizer> ... (-fsanitize )
|
||||
# --- ( undefined ) --- enable undefined behavior checks
|
||||
# --- ( thread ) --- enable threading error checks
|
||||
# --- ( address ) --- enable memory access checks
|
||||
# --- ...etc... --- see compiler documentation for supported sanitizers
|
||||
# optimize = yes/no --- (-O3/-fast etc.) --- Enable/Disable optimizations
|
||||
# arch = (name) --- (-arch) --- Target architecture
|
||||
# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system
|
||||
# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction
|
||||
# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction
|
||||
# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
|
||||
# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions
|
||||
# mmx = yes/no --- -mmmx --- Use Intel MMX instructions
|
||||
# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2
|
||||
# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3
|
||||
# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1
|
||||
# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
|
||||
# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX
|
||||
# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
|
||||
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
|
||||
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
|
||||
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
|
||||
# --- ( undefined ) --- enable undefined behavior checks
|
||||
# --- ( thread ) --- enable threading error checks
|
||||
# --- ( address ) --- enable memory access checks
|
||||
# --- ...etc... --- see compiler documentation for supported sanitizers
|
||||
# optimize = yes/no --- (-O3/-fast etc.) --- Enable/Disable optimizations
|
||||
# arch = (name) --- (-arch) --- Target architecture
|
||||
# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system
|
||||
# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction
|
||||
# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction
|
||||
# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
|
||||
# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions
|
||||
# mmx = yes/no --- -mmmx --- Use Intel MMX instructions
|
||||
# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2
|
||||
# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3
|
||||
# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1
|
||||
# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
|
||||
# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX
|
||||
# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
|
||||
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
|
||||
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
|
||||
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
|
||||
# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
|
||||
#
|
||||
# Note that Makefile is space sensitive, so when adding new architectures
|
||||
# or modifying existing flags, you have to make sure there are no extra spaces
|
||||
|
@ -116,7 +117,7 @@ ifeq ($(ARCH), $(filter $(ARCH), \
|
|||
x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
|
||||
x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
|
||||
x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
|
||||
armv7 armv7-neon armv8 apple-silicon general-64 general-32 riscv64))
|
||||
armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64))
|
||||
SUPPORTED_ARCH=true
|
||||
else
|
||||
SUPPORTED_ARCH=false
|
||||
|
@ -140,6 +141,7 @@ avx512 = no
|
|||
vnni256 = no
|
||||
vnni512 = no
|
||||
neon = no
|
||||
dotprod = no
|
||||
arm_version = 0
|
||||
STRIP = strip
|
||||
|
||||
|
@ -308,11 +310,21 @@ ifeq ($(ARCH),armv8)
|
|||
arm_version = 8
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH),armv8-dotprod)
|
||||
arch = armv8
|
||||
prefetch = yes
|
||||
popcnt = yes
|
||||
neon = yes
|
||||
dotprod = yes
|
||||
arm_version = 8
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH),apple-silicon)
|
||||
arch = arm64
|
||||
prefetch = yes
|
||||
popcnt = yes
|
||||
neon = yes
|
||||
dotprod = yes
|
||||
arm_version = 8
|
||||
endif
|
||||
|
||||
|
@ -675,6 +687,10 @@ ifeq ($(neon),yes)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(dotprod),yes)
|
||||
CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD
|
||||
endif
|
||||
|
||||
### 3.7 pext
|
||||
ifeq ($(pext),yes)
|
||||
CXXFLAGS += -DUSE_PEXT
|
||||
|
@ -776,6 +792,7 @@ help:
|
|||
@echo "armv7 > ARMv7 32-bit"
|
||||
@echo "armv7-neon > ARMv7 32-bit with popcnt and neon"
|
||||
@echo "armv8 > ARMv8 64-bit with popcnt and neon"
|
||||
@echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support"
|
||||
@echo "e2k > Elbrus 2000"
|
||||
@echo "apple-silicon > Apple silicon ARM64"
|
||||
@echo "general-64 > unspecified 64-bit"
|
||||
|
|
|
@ -72,6 +72,10 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
const __m64 Zeros = _mm_setzero_si64();
|
||||
const auto inputVector = reinterpret_cast<const __m64*>(input);
|
||||
|
||||
# elif defined(USE_NEON_DOTPROD)
|
||||
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
|
||||
const auto inputVector = reinterpret_cast<const int8x16_t*>(input);
|
||||
|
||||
# elif defined(USE_NEON)
|
||||
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
|
||||
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
|
||||
|
@ -123,6 +127,14 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
|
||||
output[i] = _mm_cvtsi64_si32(sum);
|
||||
|
||||
# elif defined(USE_NEON_DOTPROD)
|
||||
int32x4_t sum = {biases[i]};
|
||||
const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
|
||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
||||
sum = vdotq_s32(sum, inputVector[j], row[j]);
|
||||
}
|
||||
output[i] = vaddvq_s32(sum);
|
||||
|
||||
# elif defined(USE_NEON)
|
||||
int32x4_t sum = {biases[i]};
|
||||
const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
|
||||
|
@ -187,6 +199,9 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
#elif defined (USE_SSSE3)
|
||||
static constexpr IndexType InputSimdWidth = 16;
|
||||
static constexpr IndexType MaxNumOutputRegs = 8;
|
||||
#elif defined (USE_NEON_DOTPROD)
|
||||
static constexpr IndexType InputSimdWidth = 16;
|
||||
static constexpr IndexType MaxNumOutputRegs = 8;
|
||||
#elif defined (USE_NEON)
|
||||
static constexpr IndexType InputSimdWidth = 8;
|
||||
static constexpr IndexType MaxNumOutputRegs = 8;
|
||||
|
@ -292,6 +307,15 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
|
||||
#define vec_hadd Simd::m128_hadd
|
||||
#define vec_haddx4 Simd::m128_haddx4
|
||||
#elif defined (USE_NEON_DOTPROD)
|
||||
using acc_vec_t = int32x4_t;
|
||||
using bias_vec_t = int32x4_t;
|
||||
using weight_vec_t = int8x16_t;
|
||||
using in_vec_t = int8x16_t;
|
||||
#define vec_zero {0}
|
||||
#define vec_add_dpbusd_32x2 Simd::dotprod_m128_add_dpbusd_epi32x2
|
||||
#define vec_hadd Simd::neon_m128_hadd
|
||||
#define vec_haddx4 Simd::neon_m128_haddx4
|
||||
#elif defined (USE_NEON)
|
||||
using acc_vec_t = int32x4_t;
|
||||
using bias_vec_t = int32x4_t;
|
||||
|
|
|
@ -346,6 +346,19 @@ namespace Stockfish::Simd {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined (USE_NEON_DOTPROD)
|
||||
|
||||
[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
|
||||
int32x4_t& acc,
|
||||
int8x16_t a0, int8x16_t b0,
|
||||
int8x16_t a1, int8x16_t b1) {
|
||||
|
||||
acc = vdotq_s32(acc, a0, b0);
|
||||
acc = vdotq_s32(acc, a1, b1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (USE_NEON)
|
||||
|
||||
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
|
||||
|
|
Loading…
Add table
Reference in a new issue