Provide vectorized NNUE code for SSE2 and MMX targets
This patch allows old x86 CPUs, from the AMD K8 (which the x86-64
baseline targets) all the way down to the Pentium MMX, to benefit from
NNUE with a performance hit, relative to the hand-written eval,
comparable to that on more modern processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):

master:     38951
this patch: 80586

NPS of the bench with NNUE enabled using the baseline x86-64 arch, which
is how Linux distros are likely to package Stockfish, on a modern CPU
(using the SSE2 code):

master:      882584
this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.
parent f948cd008d
commit 21df37d7fd

7 changed files with 150 additions and 6 deletions
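For context on the core technique: SSE2 lacks the SSSE3 _mm_maddubs_epi16 that the existing vectorized path relies on, so the patch sign-extends the int8 weights to int16 on the fly (interleaving each byte with its comparison-against-zero sign mask) and then multiply-accumulates with the plain SSE2 _mm_madd_epi16. A minimal standalone sketch of that trick, illustrative only and not part of the diff (function name and loop bounds are made up):

    // Computes sum(weights[k] * inputs[k]); n must be a multiple of 16.
    // Inputs are clipped-ReLU outputs, i.e. non-negative (0..127), so they
    // only need zero-extension; the signed weights need sign-extension.
    #include <emmintrin.h>
    #include <cstdint>

    int32_t dot_sse2(const int8_t* weights, const uint8_t* inputs, int n) {
        const __m128i kZeros = _mm_setzero_si128();
        __m128i sum_lo = kZeros, sum_hi = kZeros;
        for (int j = 0; j < n; j += 16) {
            __m128i row_j   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(weights + j));
            __m128i input_j = _mm_loadu_si128(reinterpret_cast<const __m128i*>(inputs + j));
            // 0xFF where a weight byte is negative, 0x00 otherwise
            __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
            // interleave each byte with its sign mask => sign-extended int16
            __m128i row_lo = _mm_unpacklo_epi8(row_j, row_signs);
            __m128i row_hi = _mm_unpackhi_epi8(row_j, row_signs);
            // inputs are non-negative, so interleaving with zeros suffices
            __m128i in_lo = _mm_unpacklo_epi8(input_j, kZeros);
            __m128i in_hi = _mm_unpackhi_epi8(input_j, kZeros);
            sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(row_lo, in_lo));
            sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(row_hi, in_hi));
        }
        // horizontal reduction of the four 32-bit lanes
        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
        return _mm_cvtsi128_si32(sum);
    }

The same idea carries over to MMX using __m64 half-width registers, as the affine_transform.h hunks below show.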
AUTHORS

@@ -53,6 +53,7 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
src/Makefile

@@ -86,6 +86,7 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+mmx = no
 sse = no
 ssse3 = no
 sse41 = no

@@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
	arch = i386
	bits = 32
	prefetch = yes
+	mmx = yes
	sse = yes
 endif

@@ -250,7 +252,7 @@ ifeq ($(COMP),gcc)
	ifneq ($(KERNEL),Darwin)
		LDFLAGS += -Wl,--no-as-needed
	endif

	gccversion = $(shell $(CXX) --version)
	gccisclang = $(findstring clang,$(gccversion))
 endif

@@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
	endif
 endif

+ifeq ($(mmx),yes)
+	CXXFLAGS += -DUSE_MMX
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mmmx
+	endif
+endif
+
 ifeq ($(neon),yes)
	CXXFLAGS += -DUSE_NEON
 endif

@@ -516,7 +525,7 @@ help:
	@echo "x86-64-ssse3 > x86 64-bit with ssse3 support"
	@echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support"
	@echo "x86-64 > x86 64-bit generic"
-	@echo "x86-32 > x86 32-bit (also enables SSE)"
+	@echo "x86-32 > x86 32-bit (also enables MMX and SSE)"
	@echo "x86-32-old > x86 32-bit fall back for old hardware"
	@echo "ppc-64 > PPC 64-bit"
	@echo "ppc-32 > PPC 32-bit"
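With this wiring, a 32-bit build (e.g. make build ARCH=x86-32) compiles with -DUSE_MMX and -mmmx, and the source selects its code path purely from these macros. A hypothetical compile-time check of which path a binary got, mirroring the compiler_info() change below (the macro ordering matters: SSSE3 builds also define USE_SSE2, so the more capable target is tested first):

    // Hypothetical check, not part of the diff; the real reporting lives in
    // compiler_info() in src/misc.cpp (next hunk).
    #include <cstdio>

    int main() {
    #if defined(USE_SSSE3)
        std::puts("SSSE3 NNUE path");
    #elif defined(USE_SSE2)
        std::puts("SSE2 NNUE path");
    #elif defined(USE_MMX)
        std::puts("MMX NNUE path");
    #else
        std::puts("scalar NNUE path");
    #endif
        return 0;
    }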
src/misc.cpp

@@ -228,6 +228,9 @@ const std::string compiler_info() {
 #endif
   compiler += (HasPext ? " BMI2" : "");
   compiler += (HasPopCnt ? " POPCNT" : "");
+#if defined(USE_MMX)
+  compiler += " MMX";
+#endif
 #if !defined(NDEBUG)
   compiler += " DEBUG";
 #endif
src/nnue/layers/affine_transform.h

@@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
       const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);

-#elif defined(USE_SSSE3)
+#elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+#else
       const __m128i kOnes = _mm_set1_epi16(1);
+#endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);

+#elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
 #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);

@@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
         output[i] = _mm_cvtsi128_si32(sum) + biases_[i];

+#elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+#elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
 #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);

@@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
 #endif

       }
+#if defined(USE_MMX)
+      _mm_empty();
+#endif
       return output;
     }
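A note on the _mm_empty() call that closes the MMX path above (and recurs in the files below): the eight MMX registers alias the x87 floating-point register stack, so EMMS must execute after any MMX block and before any subsequent x87 floating-point code. A self-contained sketch of the same discipline, with hypothetical names not taken from the diff:

    // Sums pairs of int16 values with MMX; n must be a multiple of 4 and
    // v must be 8-byte aligned. _mm_madd_pi16 against a vector of ones
    // produces pairwise 32-bit sums, which are then accumulated and folded.
    #include <mmintrin.h>
    #include <cstdint>

    int32_t sum_i16_mmx(const int16_t* v, int n) {
        __m64 acc = _mm_setzero_si64();
        const __m64 kOnes = _mm_set1_pi16(1);
        for (int i = 0; i < n; i += 4) {
            __m64 x = *reinterpret_cast<const __m64*>(v + i);
            acc = _mm_add_pi32(acc, _mm_madd_pi16(x, kOnes));
        }
        acc = _mm_add_pi32(acc, _mm_unpackhi_pi32(acc, acc));  // fold high half
        int32_t result = _mm_cvtsi64_si32(acc);
        _mm_empty();  // leave MMX state (EMMS) before any float math happens
        return result;
    }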
src/nnue/layers/clipped_relu.h

@@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;

-#elif defined(USE_SSSE3)
+#elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;

 #ifdef USE_SSE41

@@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;

+#elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
 #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
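The k0x80s add/sub pair above is the pre-SSE4.1 way to clamp a value to [0, 127] without _mm_max_epi8: packing with signed saturation already caps values at 127, and a saturating add of -128 followed by a saturating subtract of -128 flushes every negative value to zero while leaving 0..127 untouched. The same trick in SSE2 terms, as a standalone sketch (hypothetical function, unaligned loads for simplicity, and the kWeightScaleBits shift omitted to isolate the clamp):

    // clippedReLU(x) = clamp(x, 0, 127) on int16 inputs; n a multiple of 16.
    #include <emmintrin.h>
    #include <cstdint>

    void clip_to_int8(const int16_t* in, int8_t* out, int n) {
        const __m128i k0x80s = _mm_set1_epi8(-128);
        for (int i = 0; i < n; i += 16) {
            __m128i w0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
            __m128i w1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i + 8));
            __m128i packed = _mm_packs_epi16(w0, w1);  // saturate to [-128, 127]
            packed = _mm_adds_epi8(packed, k0x80s);    // v - 128, floor at -128
            packed = _mm_subs_epi8(packed, k0x80s);    // + 128 back: negatives -> 0
            _mm_storeu_si128(reinterpret_cast<__m128i*>(out + i), packed);
        }
    }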
src/nnue/nnue_common.h

@@ -33,6 +33,9 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>

+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif

@@ -79,6 +82,9 @@ namespace Eval::NNUE {
 #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;

+#elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
 #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
 #endif
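kSimdWidth is the register width in bytes (8 for one __m64, 16 for one __m128i), and it drives both how layer dimensions are padded and how many chunks each loop runs. A hypothetical illustration of that arithmetic (the rounding helper and the dimension are made-up stand-ins, not values from the diff):

    #include <cstddef>

    constexpr std::size_t CeilToMultiple(std::size_t n, std::size_t base) {
        return (n + base - 1) / base * base;
    }

    constexpr std::size_t kSimdWidth       = 8;    // MMX: one __m64 per chunk
    constexpr std::size_t kInputDimensions = 510;  // example value
    constexpr std::size_t kPadded    = CeilToMultiple(kInputDimensions, kSimdWidth);  // 512
    constexpr std::size_t kNumChunks = kPadded / kSimdWidth;                          // 64 loads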
src/nnue/nnue_feature_transformer.h

@@ -88,7 +88,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();

-#elif defined(USE_SSSE3)
+#elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;

 #ifdef USE_SSE41

@@ -97,6 +97,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
 #endif

+#elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
 #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};

@@ -117,7 +121,7 @@ namespace Eval::NNUE {
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }

-#elif defined(USE_SSSE3)
+#elif defined(USE_SSE2)
       auto out = reinterpret_cast<__m128i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(

@@ -137,6 +141,17 @@ namespace Eval::NNUE {
         );
       }

+#elif defined(USE_MMX)
+      auto out = reinterpret_cast<__m64*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+        out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+
 #elif defined(USE_NEON)
       const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {

@@ -154,6 +169,9 @@ namespace Eval::NNUE {
 #endif

     }
+#if defined(USE_MMX)
+    _mm_empty();
+#endif
   }

  private:

@@ -193,6 +211,15 @@ namespace Eval::NNUE {
         for (IndexType j = 0; j < kNumChunks; ++j)
           accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);

+#elif defined(USE_MMX)
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+        auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+        }
+
 #elif defined(USE_NEON)
         auto accumulation = reinterpret_cast<int16x8_t*>(
             &accumulator.accumulation[perspective][i][0]);

@@ -208,6 +235,9 @@ namespace Eval::NNUE {

       }
     }
+#if defined(USE_MMX)
+    _mm_empty();
+#endif

     accumulator.computed_accumulation = true;
     accumulator.computed_score = false;

@@ -234,6 +264,11 @@ namespace Eval::NNUE {
       auto accumulation = reinterpret_cast<__m128i*>(
           &accumulator.accumulation[perspective][i][0]);

+#elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+      auto accumulation = reinterpret_cast<__m64*>(
+          &accumulator.accumulation[perspective][i][0]);
+
 #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       auto accumulation = reinterpret_cast<int16x8_t*>(

@@ -263,6 +298,12 @@ namespace Eval::NNUE {
         accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
       }

+#elif defined(USE_MMX)
+      auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+      }
+
 #elif defined(USE_NEON)
       auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {

@@ -294,6 +335,12 @@ namespace Eval::NNUE {
         accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
       }

+#elif defined(USE_MMX)
+      auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+      }
+
 #elif defined(USE_NEON)
       auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {

@@ -310,6 +357,9 @@ namespace Eval::NNUE {
         }
       }
     }
+#if defined(USE_MMX)
+    _mm_empty();
+#endif

     accumulator.computed_accumulation = true;
     accumulator.computed_score = false;
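The _mm_add_pi16/_mm_sub_pi16 loops above implement NNUE's incremental accumulator update: when an input feature turns on, its column of the feature-transformer weight matrix is added to the accumulator, and when it turns off the column is subtracted, so each move only pays for the features it changed. In scalar form the update is just the following sketch (simplified types; 256 was the half-dimension of the net in use at the time, shown here as an example value):

    #include <cstdint>

    constexpr int kHalfDimensions = 256;  // transformer outputs per perspective

    void add_feature(int16_t* accumulation, const int16_t* weights, int index) {
        const int16_t* column = weights + index * kHalfDimensions;
        for (int j = 0; j < kHalfDimensions; ++j)
            accumulation[j] += column[j];  // _mm_add_pi16 does 4 of these at once
    }

    void remove_feature(int16_t* accumulation, const int16_t* weights, int index) {
        const int16_t* column = weights + index * kHalfDimensions;
        for (int j = 0; j < kHalfDimensions; ++j)
            accumulation[j] -= column[j];  // _mm_sub_pi16 does 4 of these at once
    }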