
Support VNNI on 256bit vectors

Due to downclocking on current chips (tested up to Cascade Lake) that
support avx512 and vnni512, it is better to use avx2 or vnni256 in
multithreaded (in particular hyperthreaded) engine use. In
single-threaded use, the picture is different.

gcc compilation for vnni256 requires a toolchain with gcc >= 9.

closes https://github.com/official-stockfish/Stockfish/pull/3038

No functional change
Author: mstembera, 2020-08-20 16:59:27 -07:00 (committed by Joost VandeVondele)
Parent: e453f09f06
Commit: 701b2427bd
3 changed files with 42 additions and 11 deletions

.travis.yml

@@ -77,8 +77,10 @@ script:
 # compile only for some more advanced architectures (might not run in travis)
 - make clean && make -j2 ARCH=x86-64-avx2 build
 - make clean && make -j2 ARCH=x86-64-bmi2 build
-# needs gcc 10 to compile
-- if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi
+- make clean && make -j2 ARCH=x86-64-avx512 build
+- make clean && make -j2 ARCH=x86-64-vnni512 build
+# requires gcc 9 or higher
+- if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-vnni256 build; fi
 #
 # Check perft and reproducible search

src/Makefile

@@ -75,7 +75,8 @@ endif
 # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1
 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
 # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
-# vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
+# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
+# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures

@@ -102,7 +103,8 @@ ssse3 = no
 sse41 = no
 avx2 = no
 avx512 = no
-vnni = no
+vnni256 = no
+vnni512 = no
 neon = no
 ARCH = x86-64-modern
 STRIP = strip

@@ -192,7 +194,18 @@ ifeq ($(findstring -avx512,$(ARCH)),-avx512)
 avx512 = yes
 endif

-ifeq ($(findstring -vnni,$(ARCH)),-vnni)
+ifeq ($(findstring -vnni256,$(ARCH)),-vnni256)
+popcnt = yes
+sse = yes
+sse2 = yes
+ssse3 = yes
+sse41 = yes
+avx2 = yes
+pext = yes
+vnni256 = yes
+endif
+
+ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
 popcnt = yes
 sse = yes
 sse2 = yes

@@ -201,7 +214,7 @@ ifeq ($(findstring -vnni,$(ARCH)),-vnni)
 avx2 = yes
 pext = yes
 avx512 = yes
-vnni = yes
+vnni512 = yes
 endif

 ifeq ($(sse),yes)

@@ -500,7 +513,14 @@ ifeq ($(avx512),yes)
 endif
 endif

-ifeq ($(vnni),yes)
+ifeq ($(vnni256),yes)
+CXXFLAGS += -DUSE_VNNI
+ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
+endif
+endif
+
+ifeq ($(vnni512),yes)
 CXXFLAGS += -DUSE_VNNI
 ifeq ($(comp),$(filter $(comp),gcc clang mingw))
 CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl

@@ -623,7 +643,8 @@ help:
 @echo ""
 @echo "Supported archs:"
 @echo ""
-@echo "x86-64-vnni > x86 64-bit with vnni support"
+@echo "x86-64-vnni512 > x86 64-bit with vnni support 512bit wide"
+@echo "x86-64-vnni256 > x86 64-bit with vnni support 256bit wide"
 @echo "x86-64-avx512 > x86 64-bit with avx512 support"
 @echo "x86-64-bmi2 > x86 64-bit with bmi2 support"
 @echo "x86-64-avx2 > x86 64-bit with avx2 support"

@@ -767,7 +788,8 @@ config-sanity:
 @echo "sse41: '$(sse41)'"
 @echo "avx2: '$(avx2)'"
 @echo "avx512: '$(avx512)'"
-@echo "vnni: '$(vnni)'"
+@echo "vnni256: '$(vnni256)'"
+@echo "vnni512: '$(vnni512)'"
 @echo "neon: '$(neon)'"
 @echo ""
 @echo "Flags:"

@@ -794,7 +816,8 @@ config-sanity:
 @test "$(sse41)" = "yes" || test "$(sse41)" = "no"
 @test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 @test "$(avx512)" = "yes" || test "$(avx512)" = "no"
-@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
+@test "$(vnni256)" = "yes" || test "$(vnni256)" = "no"
+@test "$(vnni512)" = "yes" || test "$(vnni512)" = "no"
 @test "$(neon)" = "yes" || test "$(neon)" = "no"
 @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
 || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang"
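
The functional difference between the vnni256 and vnni512 flag sets above is -mprefer-vector-width=256: AVX-512 VNNI instructions remain available on 256-bit ymm registers (via AVX-512VL), but the compiler is asked to keep its generated vector code at 256 bits, sidestepping the 512-bit downclocking described in the commit message. As a minimal sketch (not part of the patch; the file name and build line are hypothetical), the following only checks, via compiler-predefined macros, that such a flag combination enables what the USE_VNNI code path needs:

    // check_vnni256.cpp -- illustrative only, not part of the patch.
    // Build e.g.: g++-9 -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 check_vnni256.cpp
    #include <cstdio>

    int main() {
    #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
        // Both macros set: 256-bit VPDPBUSD (the VNNI dot-product instruction) is usable.
        std::printf("256-bit VNNI available\n");
    #else
        std::printf("VNNI not enabled by these compile flags\n");
    #endif
        return 0;
    }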

src/nnue/layers/affine_transform.h

@@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers {

 #elif defined(USE_AVX2)
 constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-const __m256i kOnes = _mm256_set1_epi16(1);
 const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#if !defined(USE_VNNI)
+const __m256i kOnes = _mm256_set1_epi16(1);
+#endif

 #elif defined(USE_SSE2)
 constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;

@@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers {
 __m256i sum = _mm256_setzero_si256();
 const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
 for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#else
 __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
 product = _mm256_madd_epi16(product, kOnes);
 sum = _mm256_add_epi32(sum, product);
+#endif
 }
 __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
 sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
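
For context, here is a minimal standalone sketch (not Stockfish code; input values are made up) of the two AVX2 paths the hunk above selects between: the single VPDPBUSD accumulate used when USE_VNNI is defined, and the older maddubs/madd/add sequence. With small inputs that cannot saturate the intermediate 16-bit products, both paths produce identical per-lane 32-bit sums; build with something like -O2 -mavx2 -mavx512vnni -mavx512vl.

    // vnni_vs_maddubs.cpp -- illustrative only, not part of the patch.
    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        alignas(32) uint8_t in[32];   // unsigned activations, as in the NNUE layer
        alignas(32) int8_t  w[32];    // signed weights
        for (int i = 0; i < 32; ++i) { in[i] = uint8_t(i); w[i] = int8_t(i % 5 - 2); }

        const __m256i a = _mm256_load_si256(reinterpret_cast<const __m256i*>(in));
        const __m256i b = _mm256_load_si256(reinterpret_cast<const __m256i*>(w));

        // Pre-VNNI path: u8*i8 pairs -> saturating i16 sums, then adjacent i16 pairs -> i32.
        const __m256i kOnes   = _mm256_set1_epi16(1);
        const __m256i product = _mm256_maddubs_epi16(a, b);
        const __m256i sum_old = _mm256_madd_epi16(product, kOnes);

        // VNNI path: one VPDPBUSD accumulates four u8*i8 products into each i32 lane.
        const __m256i sum_new = _mm256_dpbusd_epi32(_mm256_setzero_si256(), a, b);

        alignas(32) int32_t o[8], n[8];
        _mm256_store_si256(reinterpret_cast<__m256i*>(o), sum_old);
        _mm256_store_si256(reinterpret_cast<__m256i*>(n), sum_new);
        for (int i = 0; i < 8; ++i)
            std::printf("lane %d: maddubs+madd=%d  dpbusd=%d\n", i, (int)o[i], (int)n[i]);
        return 0;
    }

The dpbusd form also avoids the intermediate 16-bit saturation of maddubs, which is why the patch keeps kOnes only for the non-VNNI fallback.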