apple silicon platform with NEON

USE_NEON instead of IS_ARM
New platform apple-silicon with default USE_NEON
nnue_common.h includes arm_neon.h for USE_NEON
2025-06-28 00:19:50 +00:00 · 2020-08-05 10:55:00 +02:00 · 2020-08-05 10:55:00 +02:00 · c402fe7d26
commit c402fe7d26
parent 2b8bb8e226
6 changed files with 32 additions and 12 deletions
--- a/1
+++ b/1
@@ -43,6 +43,7 @@ Dariusz Orzechowski
 David Zar
 Daylen Yang (daylen)
 DiscanX
+Dominik Schlösser (domschl)
 double-beep
 Eduardo Cáceres (eduherminio)
 Eelco de Groot (KingDefender)
--- a/src/Makefile
+++ b/src/Makefile
@@ -75,6 +75,7 @@ endif
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -95,6 +96,7 @@ sse42 = no
 avx2 = no
 pext = no
 avx512 = no
+neon = no

 ### 2.2 Architecture specific
 ifeq ($(ARCH),general-32)
@@ -229,6 +231,13 @@ ifeq ($(ARCH),armv8)
 	popcnt = yes
 endif

+ifeq ($(ARCH),apple-silicon)
+	arch = arm64
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+endif
+
 ifeq ($(ARCH),ppc-32)
 	arch = ppc
 	bits = 32
@@ -413,7 +422,7 @@ endif

 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -464,6 +473,10 @@ ifeq ($(sse3),yes)
 	endif
 endif

+ifeq ($(neon),yes)
+	CXXFLAGS += -DUSE_NEON
+endif
+
 ifeq ($(arch),x86_64)
 	CXXFLAGS += -DUSE_SSE2
 endif
@@ -542,6 +555,7 @@ help:
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
 	@echo "armv8                   > ARMv8 64-bit"
+	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
 	@echo ""
@@ -644,6 +658,7 @@ config-sanity:
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
+	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
@@ -657,7 +672,7 @@ config-sanity:
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
-	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a"
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
@@ -669,6 +684,7 @@ config-sanity:
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"

 $(EXE): $(OBJS)
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -92,7 +92,7 @@ namespace Eval::NNUE::Layers {
      const __m128i kOnes = _mm_set1_epi16(1);
      const auto input_vector = reinterpret_cast<const __m128i*>(input);

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
  #endif
@@ -177,7 +177,7 @@ namespace Eval::NNUE::Layers {
        sum = _mm_hadd_epi32(sum, sum);
        output[i] = _mm_cvtsi128_si32(sum);

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
        int32x4_t sum = {biases_[i]};
        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -153,7 +153,7 @@ namespace Eval::NNUE::Layers {
      }
      constexpr IndexType kStart = kNumChunks * kSimdWidth;

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
      const int8x8_t kZero = {0};
      const auto in = reinterpret_cast<const int32x4_t*>(input);
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -32,6 +32,9 @@

 #elif defined(USE_SSE2)
 #include <emmintrin.h>
+
+#elif defined(USE_NEON)
+#include <arm_neon.h>
 #endif

 namespace Eval::NNUE {
@@ -53,7 +56,7 @@ namespace Eval::NNUE {
  #elif defined(USE_SSE2)
  constexpr std::size_t kSimdWidth = 16;

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
  constexpr std::size_t kSimdWidth = 16;
  #endif

--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -97,7 +97,7 @@ namespace Eval::NNUE {
      const __m128i k0x80s = _mm_set1_epi8(-128);
  #endif

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
      const int8x8_t kZero = {0};
  #endif
@@ -163,7 +163,7 @@ namespace Eval::NNUE {
          );
        }

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
@@ -218,7 +218,7 @@ namespace Eval::NNUE {
            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
          }

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
          auto accumulation = reinterpret_cast<int16x8_t*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
@@ -261,7 +261,7 @@ namespace Eval::NNUE {
        auto accumulation = reinterpret_cast<__m128i*>(
            &accumulator.accumulation[perspective][i][0]);

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
        auto accumulation = reinterpret_cast<int16x8_t*>(
            &accumulator.accumulation[perspective][i][0]);
@@ -290,7 +290,7 @@ namespace Eval::NNUE {
              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
            }

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j) {
              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
@@ -321,7 +321,7 @@ namespace Eval::NNUE {
              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
            }

-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j) {
              accumulation[j] = vaddq_s16(accumulation[j], column[j]);