1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-29 16:23:09 +00:00

apple silicon platform with NEON

USE_NEON instead of IS_ARM
New platform apple-silicon with default USE_NEON
nnue_common.h includes arm_neon.h for USE_NEON
This commit is contained in:
Dominik Schlösser 2020-08-05 10:55:00 +02:00 committed by Joost VandeVondele
parent 2b8bb8e226
commit c402fe7d26
6 changed files with 32 additions and 12 deletions

View file

@ -43,6 +43,7 @@ Dariusz Orzechowski
David Zar
Daylen Yang (daylen)
DiscanX
Dominik Schlösser (domschl)
double-beep
Eduardo Cáceres (eduherminio)
Eelco de Groot (KingDefender)

View file

@ -75,6 +75,7 @@ endif
# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
@ -95,6 +96,7 @@ sse42 = no
avx2 = no
pext = no
avx512 = no
neon = no
### 2.2 Architecture specific
ifeq ($(ARCH),general-32)
@ -229,6 +231,13 @@ ifeq ($(ARCH),armv8)
popcnt = yes
endif
ifeq ($(ARCH),apple-silicon)
arch = arm64
prefetch = yes
popcnt = yes
neon = yes
endif
ifeq ($(ARCH),ppc-32)
arch = ppc
bits = 32
@ -413,7 +422,7 @@ endif
### 3.6 popcnt
ifeq ($(popcnt),yes)
ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
CXXFLAGS += -DUSE_POPCNT
else ifeq ($(comp),icc)
CXXFLAGS += -msse3 -DUSE_POPCNT
@ -464,6 +473,10 @@ ifeq ($(sse3),yes)
endif
endif
ifeq ($(neon),yes)
CXXFLAGS += -DUSE_NEON
endif
ifeq ($(arch),x86_64)
CXXFLAGS += -DUSE_SSE2
endif
@ -542,6 +555,7 @@ help:
@echo "ppc-32 > PPC 32-bit"
@echo "armv7 > ARMv7 32-bit"
@echo "armv8 > ARMv8 64-bit"
@echo "apple-silicon > Apple silicon ARM64"
@echo "general-64 > unspecified 64-bit"
@echo "general-32 > unspecified 32-bit"
@echo ""
@ -644,6 +658,7 @@ config-sanity:
@echo "avx2: '$(avx2)'"
@echo "pext: '$(pext)'"
@echo "avx512: '$(avx512)'"
@echo "neon: '$(neon)'"
@echo ""
@echo "Flags:"
@echo "CXX: $(CXX)"
@ -657,7 +672,7 @@ config-sanity:
@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a"
test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
@test "$(bits)" = "32" || test "$(bits)" = "64"
@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
@ -669,6 +684,7 @@ config-sanity:
@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
@test "$(pext)" = "yes" || test "$(pext)" = "no"
@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
@test "$(neon)" = "yes" || test "$(neon)" = "no"
@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
$(EXE): $(OBJS)

View file

@ -92,7 +92,7 @@ namespace Eval::NNUE::Layers {
const __m128i kOnes = _mm_set1_epi16(1);
const auto input_vector = reinterpret_cast<const __m128i*>(input);
#elif defined(IS_ARM)
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
#endif
@ -177,7 +177,7 @@ namespace Eval::NNUE::Layers {
sum = _mm_hadd_epi32(sum, sum);
output[i] = _mm_cvtsi128_si32(sum);
#elif defined(IS_ARM)
#elif defined(USE_NEON)
int32x4_t sum = {biases_[i]};
const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {

View file

@ -153,7 +153,7 @@ namespace Eval::NNUE::Layers {
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(IS_ARM)
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
const auto in = reinterpret_cast<const int32x4_t*>(input);

View file

@ -32,6 +32,9 @@
#elif defined(USE_SSE2)
#include <emmintrin.h>
#elif defined(USE_NEON)
#include <arm_neon.h>
#endif
namespace Eval::NNUE {
@ -53,7 +56,7 @@ namespace Eval::NNUE {
#elif defined(USE_SSE2)
constexpr std::size_t kSimdWidth = 16;
#elif defined(IS_ARM)
#elif defined(USE_NEON)
constexpr std::size_t kSimdWidth = 16;
#endif

View file

@ -97,7 +97,7 @@ namespace Eval::NNUE {
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
#elif defined(IS_ARM)
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
#endif
@ -163,7 +163,7 @@ namespace Eval::NNUE {
);
}
#elif defined(IS_ARM)
#elif defined(USE_NEON)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
int16x8_t sum = reinterpret_cast<const int16x8_t*>(
@ -218,7 +218,7 @@ namespace Eval::NNUE {
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
}
#elif defined(IS_ARM)
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
@ -261,7 +261,7 @@ namespace Eval::NNUE {
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
#elif defined(IS_ARM)
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
@ -290,7 +290,7 @@ namespace Eval::NNUE {
accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
}
#elif defined(IS_ARM)
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = vsubq_s16(accumulation[j], column[j]);
@ -321,7 +321,7 @@ namespace Eval::NNUE {
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
}
#elif defined(IS_ARM)
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
accumulation[j] = vaddq_s16(accumulation[j], column[j]);