diff --git a/AUTHORS b/AUTHORS index 6620f4db..2e080e61 100644 --- a/AUTHORS +++ b/AUTHORS @@ -43,6 +43,7 @@ Dariusz Orzechowski David Zar Daylen Yang (daylen) DiscanX +Dominik Schlösser (domschl) double-beep Eduardo Cáceres (eduherminio) Eelco de Groot (KingDefender) diff --git a/src/Makefile b/src/Makefile index e54a0032..58b05c89 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,6 +75,7 @@ endif # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 # pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 +# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # # Note that Makefile is space sensitive, so when adding new architectures # or modifying existing flags, you have to make sure there are no extra spaces @@ -95,6 +96,7 @@ sse42 = no avx2 = no pext = no avx512 = no +neon = no ### 2.2 Architecture specific ifeq ($(ARCH),general-32) @@ -229,6 +231,13 @@ ifeq ($(ARCH),armv8) popcnt = yes endif +ifeq ($(ARCH),apple-silicon) + arch = arm64 + prefetch = yes + popcnt = yes + neon = yes +endif + ifeq ($(ARCH),ppc-32) arch = ppc bits = 32 @@ -413,7 +422,7 @@ endif ### 3.6 popcnt ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv8-a)) + ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64)) CXXFLAGS += -DUSE_POPCNT else ifeq ($(comp),icc) CXXFLAGS += -msse3 -DUSE_POPCNT @@ -464,6 +473,10 @@ ifeq ($(sse3),yes) endif endif +ifeq ($(neon),yes) + CXXFLAGS += -DUSE_NEON +endif + ifeq ($(arch),x86_64) CXXFLAGS += -DUSE_SSE2 endif @@ -542,6 +555,7 @@ help: @echo "ppc-32 > PPC 32-bit" @echo "armv7 > ARMv7 32-bit" @echo "armv8 > ARMv8 64-bit" + @echo "apple-silicon > Apple silicon ARM64" @echo "general-64 > unspecified 64-bit" @echo "general-32 > unspecified 32-bit" @echo "" @@ -644,6 +658,7 @@ config-sanity: @echo "avx2: '$(avx2)'" @echo "pext: '$(pext)'" @echo "avx512: '$(avx512)'" + @echo "neon: '$(neon)'" @echo "" @echo "Flags:" @echo "CXX: $(CXX)" 
@@ -657,7 +672,7 @@ config-sanity: @test "$(optimize)" = "yes" || test "$(optimize)" = "no" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" + test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64" @test "$(bits)" = "32" || test "$(bits)" = "64" @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" @@ -669,6 +684,7 @@ config-sanity: @test "$(avx2)" = "yes" || test "$(avx2)" = "no" @test "$(pext)" = "yes" || test "$(pext)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" + @test "$(neon)" = "yes" || test "$(neon)" = "no" @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" $(EXE): $(OBJS) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 6abf15ee..b585bc87 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -92,7 +92,7 @@ namespace Eval::NNUE::Layers { const __m128i kOnes = _mm_set1_epi16(1); const auto input_vector = reinterpret_cast(input); - #elif defined(IS_ARM) + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const auto input_vector = reinterpret_cast(input); #endif @@ -177,7 +177,7 @@ namespace Eval::NNUE::Layers { sum = _mm_hadd_epi32(sum, sum); output[i] = _mm_cvtsi128_si32(sum); - #elif defined(IS_ARM) + #elif defined(USE_NEON) int32x4_t sum = {biases_[i]}; const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index fda2a7e9..7ade598f 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -153,7 +153,7 @@ namespace Eval::NNUE::Layers { } constexpr IndexType kStart = kNumChunks * kSimdWidth; - 
#elif defined(IS_ARM) + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; const auto in = reinterpret_cast(input); diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index dc2d286b..972ef3e5 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -32,6 +32,9 @@ #elif defined(USE_SSE2) #include <emmintrin.h> + +#elif defined(USE_NEON) +#include <arm_neon.h> #endif namespace Eval::NNUE { @@ -53,7 +56,7 @@ namespace Eval::NNUE { #elif defined(USE_SSE2) constexpr std::size_t kSimdWidth = 16; - #elif defined(IS_ARM) + #elif defined(USE_NEON) constexpr std::size_t kSimdWidth = 16; #endif diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 6b0a01b4..1cfebbe4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -97,7 +97,7 @@ namespace Eval::NNUE { const __m128i k0x80s = _mm_set1_epi8(-128); #endif - #elif defined(IS_ARM) + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; #endif @@ -163,7 +163,7 @@ namespace Eval::NNUE { ); } - #elif defined(IS_ARM) + #elif defined(USE_NEON) const auto out = reinterpret_cast(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { int16x8_t sum = reinterpret_cast( @@ -218,7 +218,7 @@ namespace Eval::NNUE { accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); } - #elif defined(IS_ARM) + #elif defined(USE_NEON) auto accumulation = reinterpret_cast( &accumulator.accumulation[perspective][i][0]); auto column = reinterpret_cast(&weights_[offset]); @@ -261,7 +261,7 @@ namespace Eval::NNUE { auto accumulation = reinterpret_cast<__m128i*>( &accumulator.accumulation[perspective][i][0]); - #elif defined(IS_ARM) + #elif defined(USE_NEON) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); auto accumulation = reinterpret_cast( &accumulator.accumulation[perspective][i][0]); @@ -290,7 +290,7 @@ namespace 
Eval::NNUE { accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]); } - #elif defined(IS_ARM) + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { accumulation[j] = vsubq_s16(accumulation[j], column[j]); @@ -321,7 +321,7 @@ namespace Eval::NNUE { accumulation[j] = _mm_add_epi16(accumulation[j], column[j]); } - #elif defined(IS_ARM) + #elif defined(USE_NEON) auto column = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { accumulation[j] = vaddq_s16(accumulation[j], column[j]);