diff --git a/AUTHORS b/AUTHORS
index 6620f4db..2e080e61 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -43,6 +43,7 @@ Dariusz Orzechowski
 David Zar
 Daylen Yang (daylen)
 DiscanX
+Dominik Schlösser (domschl)
 double-beep
 Eduardo Cáceres (eduherminio)
 Eelco de Groot (KingDefender)
diff --git a/src/Makefile b/src/Makefile
index e54a0032..58b05c89 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -75,6 +75,7 @@ endif
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# neon = yes/no       --- -DUSE_NEON       --- Use ARM NEON SIMD instructions
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -95,6 +96,7 @@ sse42 = no
 avx2 = no
 pext = no
 avx512 = no
+neon = no
 
 ### 2.2 Architecture specific
 ifeq ($(ARCH),general-32)
@@ -229,6 +231,13 @@ ifeq ($(ARCH),armv8)
 	popcnt = yes
 endif
 
+ifeq ($(ARCH),apple-silicon) # Apple silicon target: arm64 with prefetch, popcnt and neon switched on
+	arch = arm64
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+endif
+
 ifeq ($(ARCH),ppc-32)
 	arch = ppc
 	bits = 32
@@ -413,7 +422,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -464,6 +473,10 @@ ifeq ($(sse3),yes)
 	endif
 endif
 
+ifeq ($(neon),yes) # defines USE_NEON, the macro the NNUE headers test to select their NEON code paths
+	CXXFLAGS += -DUSE_NEON
+endif
+
 ifeq ($(arch),x86_64)
 	CXXFLAGS += -DUSE_SSE2
 endif
@@ -542,6 +555,7 @@ help:
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
 	@echo "armv8                   > ARMv8 64-bit"
+	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
 	@echo ""
@@ -644,6 +658,7 @@ config-sanity:
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
+	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
@@ -657,7 +672,7 @@ config-sanity:
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
-	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a"
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
@@ -669,6 +684,7 @@ config-sanity:
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
 $(EXE): $(OBJS)
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 6abf15ee..b585bc87 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -92,7 +92,7 @@ namespace Eval::NNUE::Layers {
       const __m128i kOnes = _mm_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
   #endif
@@ -177,7 +177,7 @@ namespace Eval::NNUE::Layers {
         sum = _mm_hadd_epi32(sum, sum);
         output[i] = _mm_cvtsi128_si32(sum);
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index fda2a7e9..7ade598f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -153,7 +153,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
       const auto in = reinterpret_cast<const int32x4_t*>(input);
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index dc2d286b..972ef3e5 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -32,6 +32,9 @@
 
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
+
+#elif defined(USE_NEON)
+#include <arm_neon.h>
 #endif
 
 namespace Eval::NNUE {
@@ -53,7 +56,7 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 6b0a01b4..1cfebbe4 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -97,7 +97,7 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
   #endif
@@ -163,7 +163,7 @@ namespace Eval::NNUE {
           );
         }
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
@@ -218,7 +218,7 @@ namespace Eval::NNUE {
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
           }
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
@@ -261,7 +261,7 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
             &accumulator.accumulation[perspective][i][0]);
@@ -290,7 +290,7 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
             }
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
               accumulation[j] = vsubq_s16(accumulation[j], column[j]);
@@ -321,7 +321,7 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
             }
 
-  #elif defined(IS_ARM)
+  #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
               accumulation[j] = vaddq_s16(accumulation[j], column[j]);