1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-05-01 09:13:08 +00:00

Provide vectorized NNUE code for SSE2 and MMX targets

This patch allows old x86 CPUs, from the AMD K8 (which the x86-64 baseline
targets) all the way down to the Pentium MMX, to benefit from NNUE, with a
performance hit relative to hand-written eval that is comparable to the hit
on more modern processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):
  master: 38951
  this patch: 80586

NPS of the bench with NNUE enabled using the baseline x86-64 arch, which is
how Linux distros are likely to package Stockfish, on a modern CPU
(using the SSE2 code):
  master: 882584
  this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.
This commit is contained in:
Fanael Linithien 2020-08-09 16:20:45 +02:00 committed by Joost VandeVondele
parent f948cd008d
commit 21df37d7fd
7 changed files with 150 additions and 6 deletions

View file

@ -53,6 +53,7 @@ Ernesto Gatti
Linmiao Xu (linrock) Linmiao Xu (linrock)
Fabian Beuke (madnight) Fabian Beuke (madnight)
Fabian Fichter (ianfab) Fabian Fichter (ianfab)
Fanael Linithien (Fanael)
fanon fanon
Fauzi Akram Dabat (FauziAkram) Fauzi Akram Dabat (FauziAkram)
Felix Wittmann Felix Wittmann

View file

@ -86,6 +86,7 @@ sanitize = no
bits = 64 bits = 64
prefetch = no prefetch = no
popcnt = no popcnt = no
mmx = no
sse = no sse = no
ssse3 = no ssse3 = no
sse41 = no sse41 = no
@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
arch = i386 arch = i386
bits = 32 bits = 32
prefetch = yes prefetch = yes
mmx = yes
sse = yes sse = yes
endif endif
@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
endif endif
endif endif
ifeq ($(mmx),yes)
CXXFLAGS += -DUSE_MMX
ifeq ($(comp),$(filter $(comp),gcc clang mingw))
CXXFLAGS += -mmmx
endif
endif
ifeq ($(neon),yes) ifeq ($(neon),yes)
CXXFLAGS += -DUSE_NEON CXXFLAGS += -DUSE_NEON
endif endif
@ -516,7 +525,7 @@ help:
@echo "x86-64-ssse3 > x86 64-bit with ssse3 support" @echo "x86-64-ssse3 > x86 64-bit with ssse3 support"
@echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support" @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 and popcnt support"
@echo "x86-64 > x86 64-bit generic" @echo "x86-64 > x86 64-bit generic"
@echo "x86-32 > x86 32-bit (also enables SSE)" @echo "x86-32 > x86 32-bit (also enables MMX and SSE)"
@echo "x86-32-old > x86 32-bit fall back for old hardware" @echo "x86-32-old > x86 32-bit fall back for old hardware"
@echo "ppc-64 > PPC 64-bit" @echo "ppc-64 > PPC 64-bit"
@echo "ppc-32 > PPC 32-bit" @echo "ppc-32 > PPC 32-bit"

View file

@ -228,6 +228,9 @@ const std::string compiler_info() {
#endif #endif
compiler += (HasPext ? " BMI2" : ""); compiler += (HasPext ? " BMI2" : "");
compiler += (HasPopCnt ? " POPCNT" : ""); compiler += (HasPopCnt ? " POPCNT" : "");
#if defined(USE_MMX)
compiler += " MMX";
#endif
#if !defined(NDEBUG) #if !defined(NDEBUG)
compiler += " DEBUG"; compiler += " DEBUG";
#endif #endif

View file

@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
const __m256i kOnes = _mm256_set1_epi16(1); const __m256i kOnes = _mm256_set1_epi16(1);
const auto input_vector = reinterpret_cast<const __m256i*>(input); const auto input_vector = reinterpret_cast<const __m256i*>(input);
#elif defined(USE_SSSE3) #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
#ifndef USE_SSSE3
const __m128i kZeros = _mm_setzero_si128();
#else
const __m128i kOnes = _mm_set1_epi16(1); const __m128i kOnes = _mm_set1_epi16(1);
#endif
const auto input_vector = reinterpret_cast<const __m128i*>(input); const auto input_vector = reinterpret_cast<const __m128i*>(input);
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const __m64 kZeros = _mm_setzero_si64();
const auto input_vector = reinterpret_cast<const __m64*>(input);
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const auto input_vector = reinterpret_cast<const int8x8_t*>(input); const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
output[i] = _mm_cvtsi128_si32(sum) + biases_[i]; output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
#elif defined(USE_SSE2)
__m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
__m128i sum_hi = kZeros;
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i row_j = _mm_load_si128(&row[j]);
__m128i input_j = _mm_load_si128(&input_vector[j]);
__m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
__m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
__m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
__m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
__m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
__m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
__m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
sum_lo = _mm_add_epi32(sum_lo, product_lo);
sum_hi = _mm_add_epi32(sum_hi, product_hi);
}
__m128i sum = _mm_add_epi32(sum_lo, sum_hi);
__m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sum_high_64);
__m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
sum = _mm_add_epi32(sum, sum_second_32);
output[i] = _mm_cvtsi128_si32(sum);
#elif defined(USE_MMX)
__m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
__m64 sum_hi = kZeros;
const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m64 row_j = row[j];
__m64 input_j = input_vector[j];
__m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
__m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
__m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
__m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
__m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
__m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
__m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
sum_lo = _mm_add_pi32(sum_lo, product_lo);
sum_hi = _mm_add_pi32(sum_hi, product_hi);
}
__m64 sum = _mm_add_pi32(sum_lo, sum_hi);
sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
output[i] = _mm_cvtsi64_si32(sum);
#elif defined(USE_NEON) #elif defined(USE_NEON)
int32x4_t sum = {biases_[i]}; int32x4_t sum = {biases_[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]); const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
#endif #endif
} }
#if defined(USE_MMX)
_mm_empty();
#endif
return output; return output;
} }

View file

@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
} }
constexpr IndexType kStart = kNumChunks * kSimdWidth; constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(USE_SSSE3) #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
#ifdef USE_SSE41 #ifdef USE_SSE41
@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
} }
constexpr IndexType kStart = kNumChunks * kSimdWidth; constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128);
const auto in = reinterpret_cast<const __m64*>(input);
const auto out = reinterpret_cast<__m64*>(output);
for (IndexType i = 0; i < kNumChunks; ++i) {
const __m64 words0 = _mm_srai_pi16(
_mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
kWeightScaleBits);
const __m64 words1 = _mm_srai_pi16(
_mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
kWeightScaleBits);
const __m64 packedbytes = _mm_packs_pi16(words0, words1);
out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
}
_mm_empty();
constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0}; const int8x8_t kZero = {0};

View file

@ -33,6 +33,9 @@
#elif defined(USE_SSE2) #elif defined(USE_SSE2)
#include <emmintrin.h> #include <emmintrin.h>
#elif defined(USE_MMX)
#include <mmintrin.h>
#elif defined(USE_NEON) #elif defined(USE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
@ -79,6 +82,9 @@ namespace Eval::NNUE {
#elif defined(USE_SSE2) #elif defined(USE_SSE2)
constexpr std::size_t kSimdWidth = 16; constexpr std::size_t kSimdWidth = 16;
#elif defined(USE_MMX)
constexpr std::size_t kSimdWidth = 8;
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr std::size_t kSimdWidth = 16; constexpr std::size_t kSimdWidth = 16;
#endif #endif

View file

@ -88,7 +88,7 @@ namespace Eval::NNUE {
constexpr int kControl = 0b11011000; constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256(); const __m256i kZero = _mm256_setzero_si256();
#elif defined(USE_SSSE3) #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
#ifdef USE_SSE41 #ifdef USE_SSE41
@ -97,6 +97,10 @@ namespace Eval::NNUE {
const __m128i k0x80s = _mm_set1_epi8(-128); const __m128i k0x80s = _mm_set1_epi8(-128);
#endif #endif
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128);
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0}; const int8x8_t kZero = {0};
@ -117,7 +121,7 @@ namespace Eval::NNUE {
_mm256_packs_epi16(sum0, sum1), kZero), kControl)); _mm256_packs_epi16(sum0, sum1), kZero), kControl));
} }
#elif defined(USE_SSSE3) #elif defined(USE_SSE2)
auto out = reinterpret_cast<__m128i*>(&output[offset]); auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>( __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@ -137,6 +141,17 @@ namespace Eval::NNUE {
); );
} }
#elif defined(USE_MMX)
auto out = reinterpret_cast<__m64*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m64 sum0 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 0]);
__m64 sum1 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 1]);
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
}
#elif defined(USE_NEON) #elif defined(USE_NEON)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]); const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
@ -154,6 +169,9 @@ namespace Eval::NNUE {
#endif #endif
} }
#if defined(USE_MMX)
_mm_empty();
#endif
} }
private: private:
@ -193,6 +211,15 @@ namespace Eval::NNUE {
for (IndexType j = 0; j < kNumChunks; ++j) for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
#elif defined(USE_MMX)
auto accumulation = reinterpret_cast<__m64*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
}
#elif defined(USE_NEON) #elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>( auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]); &accumulator.accumulation[perspective][i][0]);
@ -208,6 +235,9 @@ namespace Eval::NNUE {
} }
} }
#if defined(USE_MMX)
_mm_empty();
#endif
accumulator.computed_accumulation = true; accumulator.computed_accumulation = true;
accumulator.computed_score = false; accumulator.computed_score = false;
@ -234,6 +264,11 @@ namespace Eval::NNUE {
auto accumulation = reinterpret_cast<__m128i*>( auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]); &accumulator.accumulation[perspective][i][0]);
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<__m64*>(
&accumulator.accumulation[perspective][i][0]);
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<int16x8_t*>( auto accumulation = reinterpret_cast<int16x8_t*>(
@ -263,6 +298,12 @@ namespace Eval::NNUE {
accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]); accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
} }
#elif defined(USE_MMX)
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
}
#elif defined(USE_NEON) #elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]); auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
@ -294,6 +335,12 @@ namespace Eval::NNUE {
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
} }
#elif defined(USE_MMX)
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
}
#elif defined(USE_NEON) #elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]); auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
@ -310,6 +357,9 @@ namespace Eval::NNUE {
} }
} }
} }
#if defined(USE_MMX)
_mm_empty();
#endif
accumulator.computed_accumulation = true; accumulator.computed_accumulation = true;
accumulator.computed_score = false; accumulator.computed_score = false;