From 961a4dad5ce83a7795a5e60f4f34dd56212621db Mon Sep 17 00:00:00 2001
From: mstembera
Date: Sat, 18 Jul 2020 19:21:46 -0700
Subject: [PATCH] Add AVX512 support.

bench: 3909820
---
 src/Makefile                            | 28 ++++++++++++++-
 src/eval/nnue/layers/affine_transform.h | 47 ++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 585d93a4..254f9bac 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -87,6 +87,7 @@ endif
 # sse42 = yes/no      --- -msse4.2      --- Use Intel Streaming SIMD Extensions 4.2
 # avx2 = yes/no       --- -mavx2        --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT    --- Use pext x86_64 asm-instruction
+# avx512 = yes/no     --- -mavx512vbmi  --- Use Intel Advanced Vector Extensions 512
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -105,6 +106,7 @@ sse41 = no
 sse42 = no
 avx2 = no
 pext = no
+avx512 = no
 
 ### 2.2 Architecture specific
 ifeq ($(ARCH),general-32)
@@ -183,6 +185,20 @@ ifeq ($(ARCH),x86-64-bmi2)
 	pext = yes
 endif
 
+ifeq ($(ARCH),x86-64-avx512)
+	arch = x86_64
+	bits = 64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -407,7 +423,14 @@ endif
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-		CXXFLAGS += -mavx2
+		CXXFLAGS += -mavx2
+	endif
+endif
+
+ifeq ($(avx512),yes)
+	CXXFLAGS += -DUSE_AVX512
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -mavx512vbmi
 	endif
 endif
 
@@ -493,6 +516,7 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
+	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
 	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
@@ -599,6 +623,7 @@ config-sanity:
 	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
+	@echo "avx512: '$(avx512)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
@@ -622,6 +647,7 @@ config-sanity:
 	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
+	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
 $(EXE): $(OBJS)
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index cb56b07d..2db7f731 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -82,7 +82,11 @@ class AffineTransform {
     const auto input = previous_layer_.Propagate(
         transformed_features, buffer + kSelfBufferSize);
     const auto output = reinterpret_cast<OutputType*>(buffer);
-#if defined(USE_AVX2)
+#if defined(USE_AVX512)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+    const __m512i kOnes = _mm512_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#elif defined(USE_AVX2)
     constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
     const __m256i kOnes = _mm256_set1_epi16(1);
     const auto input_vector = reinterpret_cast<const __m256i*>(input);
@@ -96,8 +100,43 @@
 #endif
     for (IndexType i = 0; i < kOutputDimensions; ++i) {
       const IndexType offset = i * kPaddedInputDimensions;
-#if defined(USE_AVX2)
-      __m256i sum = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, biases_[i]);
+#if defined(USE_AVX512)
+      __m512i sum = _mm512_setzero_si512();
+      const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+        __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#endif
+        product = _mm512_madd_epi16(product, kOnes);
+        sum = _mm512_add_epi32(sum, product);
+      }
+      output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+      // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+      // As a result, kPaddedInputDimensions may not be an even multiple of 64 (512 bits),
+      // in which case we have to do one more 256-bit chunk.
+      if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+      {
+          const auto iv_256  = reinterpret_cast<const __m256i*>(input);
+          const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          int j = kNumChunks * 2;
+#if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#else
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#endif
+          sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
+
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          const __m128i lo = _mm256_extracti128_si256(sum256, 0);
+          const __m128i hi = _mm256_extracti128_si256(sum256, 1);
+          output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+      }
+#elif defined(USE_AVX2)
+      __m256i sum = _mm256_setzero_si256();
       const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i product = _mm256_maddubs_epi16(
@@ -117,7 +156,7 @@
       sum = _mm256_hadd_epi32(sum, sum);
       const __m128i lo = _mm256_extracti128_si256(sum, 0);
       const __m128i hi = _mm256_extracti128_si256(sum, 1);
-      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
 #elif defined(USE_SSSE3)
       __m128i sum = _mm_cvtsi32_si128(biases_[i]);
       const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
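
The heart of the patch is the 512-bit dot-product kernel added in affine_transform.h. Below is a minimal standalone sketch of that kernel for a single output neuron, kept separate from the patch itself. The function name DotProductAvx512, the fixed kInputDimensions of 512 (a multiple of 64 bytes, so the patch's 256-bit tail chunk is not needed), and the 64-byte-aligned demo buffers are illustrative assumptions, not names from the source. It should build with a recent GCC or Clang, e.g. g++ -O2 -mavx512bw.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Illustrative size only (assumption, not from the patch): a multiple of
// 64 bytes, so every chunk fills a whole 512-bit register.
constexpr int kInputDimensions = 512;

int32_t DotProductAvx512(const uint8_t* input,   // 64-byte aligned, unsigned 8-bit activations
                         const int8_t* weights,  // 64-byte aligned, signed 8-bit weights
                         int32_t bias) {
  constexpr int kNumChunks = kInputDimensions / 64;  // 64 bytes per __m512i
  const __m512i kOnes = _mm512_set1_epi16(1);
  __m512i sum = _mm512_setzero_si512();
  const auto* iv  = reinterpret_cast<const __m512i*>(input);
  const auto* row = reinterpret_cast<const __m512i*>(weights);
  for (int j = 0; j < kNumChunks; ++j) {
    // u8*s8 products; adjacent pairs are summed into 16-bit lanes
    // (with signed saturation).
    __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&iv[j]),
                                           _mm512_load_si512(&row[j]));
    // Widen to 32-bit lanes: each pair of 16-bit values is multiplied by 1 and summed.
    product = _mm512_madd_epi16(product, kOnes);
    sum = _mm512_add_epi32(sum, product);
  }
  // Horizontal sum of all sixteen 32-bit lanes, then add the bias,
  // mirroring output[i] = _mm512_reduce_add_epi32(sum) + biases_[i].
  return _mm512_reduce_add_epi32(sum) + bias;
}

int main() {
  alignas(64) uint8_t input[kInputDimensions];
  alignas(64) int8_t weights[kInputDimensions];
  for (int i = 0; i < kInputDimensions; ++i) {
    input[i]   = static_cast<uint8_t>(i & 0x3F);
    weights[i] = static_cast<int8_t>(i % 5 - 2);
  }
  std::printf("dot = %d\n", DotProductAvx512(input, weights, 100));
  return 0;
}

Two design points from the patch that the sketch reflects: _mm512_reduce_add_epi32 (a compiler-provided helper sequence in recent GCC/Clang headers, not a single instruction) replaces the two-hadd-plus-extract reduction the AVX2 path still performs, and the patch's extra 256-bit chunk exists only because kMaxSimdWidth stays at 32 so that existing network files keep loading, meaning kPaddedInputDimensions can be a multiple of 32 without being a multiple of 64.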