mirror of
https://github.com/sockspls/badfish
synced 2025-04-29 16:23:09 +00:00
apple silicon platform with NEON
Use USE_NEON instead of IS_ARM. New platform apple-silicon with default USE_NEON. nnue_common.h includes arm_neon.h for USE_NEON.
This commit is contained in:
parent
2b8bb8e226
commit
c402fe7d26
6 changed files with 32 additions and 12 deletions
1
AUTHORS
1
AUTHORS
|
@ -43,6 +43,7 @@ Dariusz Orzechowski
|
|||
David Zar
|
||||
Daylen Yang (daylen)
|
||||
DiscanX
|
||||
Dominik Schlösser (domschl)
|
||||
double-beep
|
||||
Eduardo Cáceres (eduherminio)
|
||||
Eelco de Groot (KingDefender)
|
||||
|
|
20
src/Makefile
20
src/Makefile
|
@ -75,6 +75,7 @@ endif
|
|||
# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
|
||||
# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
|
||||
# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
|
||||
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
|
||||
#
|
||||
# Note that Makefile is space sensitive, so when adding new architectures
|
||||
# or modifying existing flags, you have to make sure there are no extra spaces
|
||||
|
@ -95,6 +96,7 @@ sse42 = no
|
|||
avx2 = no
|
||||
pext = no
|
||||
avx512 = no
|
||||
neon = no
|
||||
|
||||
### 2.2 Architecture specific
|
||||
ifeq ($(ARCH),general-32)
|
||||
|
@ -229,6 +231,13 @@ ifeq ($(ARCH),armv8)
|
|||
popcnt = yes
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH),apple-silicon)
|
||||
arch = arm64
|
||||
prefetch = yes
|
||||
popcnt = yes
|
||||
neon = yes
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH),ppc-32)
|
||||
arch = ppc
|
||||
bits = 32
|
||||
|
@ -413,7 +422,7 @@ endif
|
|||
|
||||
### 3.6 popcnt
|
||||
ifeq ($(popcnt),yes)
|
||||
ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
|
||||
ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
|
||||
CXXFLAGS += -DUSE_POPCNT
|
||||
else ifeq ($(comp),icc)
|
||||
CXXFLAGS += -msse3 -DUSE_POPCNT
|
||||
|
@ -464,6 +473,10 @@ ifeq ($(sse3),yes)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(neon),yes)
|
||||
CXXFLAGS += -DUSE_NEON
|
||||
endif
|
||||
|
||||
ifeq ($(arch),x86_64)
|
||||
CXXFLAGS += -DUSE_SSE2
|
||||
endif
|
||||
|
@ -542,6 +555,7 @@ help:
|
|||
@echo "ppc-32 > PPC 32-bit"
|
||||
@echo "armv7 > ARMv7 32-bit"
|
||||
@echo "armv8 > ARMv8 64-bit"
|
||||
@echo "apple-silicon > Apple silicon ARM64"
|
||||
@echo "general-64 > unspecified 64-bit"
|
||||
@echo "general-32 > unspecified 32-bit"
|
||||
@echo ""
|
||||
|
@ -644,6 +658,7 @@ config-sanity:
|
|||
@echo "avx2: '$(avx2)'"
|
||||
@echo "pext: '$(pext)'"
|
||||
@echo "avx512: '$(avx512)'"
|
||||
@echo "neon: '$(neon)'"
|
||||
@echo ""
|
||||
@echo "Flags:"
|
||||
@echo "CXX: $(CXX)"
|
||||
|
@ -657,7 +672,7 @@ config-sanity:
|
|||
@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
|
||||
@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
|
||||
test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
|
||||
test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a"
|
||||
test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
|
||||
@test "$(bits)" = "32" || test "$(bits)" = "64"
|
||||
@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
|
||||
@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
|
||||
|
@ -669,6 +684,7 @@ config-sanity:
|
|||
@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
|
||||
@test "$(pext)" = "yes" || test "$(pext)" = "no"
|
||||
@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
|
||||
@test "$(neon)" = "yes" || test "$(neon)" = "no"
|
||||
@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
|
||||
|
||||
$(EXE): $(OBJS)
|
||||
|
|
|
@ -92,7 +92,7 @@ namespace Eval::NNUE::Layers {
|
|||
const __m128i kOnes = _mm_set1_epi16(1);
|
||||
const auto input_vector = reinterpret_cast<const __m128i*>(input);
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||
const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
|
||||
#endif
|
||||
|
@ -177,7 +177,7 @@ namespace Eval::NNUE::Layers {
|
|||
sum = _mm_hadd_epi32(sum, sum);
|
||||
output[i] = _mm_cvtsi128_si32(sum);
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
int32x4_t sum = {biases_[i]};
|
||||
const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
|
|
|
@ -153,7 +153,7 @@ namespace Eval::NNUE::Layers {
|
|||
}
|
||||
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
|
||||
const int8x8_t kZero = {0};
|
||||
const auto in = reinterpret_cast<const int32x4_t*>(input);
|
||||
|
|
|
@ -32,6 +32,9 @@
|
|||
|
||||
#elif defined(USE_SSE2)
|
||||
#include <emmintrin.h>
|
||||
|
||||
#elif defined(USE_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
namespace Eval::NNUE {
|
||||
|
@ -53,7 +56,7 @@ namespace Eval::NNUE {
|
|||
#elif defined(USE_SSE2)
|
||||
constexpr std::size_t kSimdWidth = 16;
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
constexpr std::size_t kSimdWidth = 16;
|
||||
#endif
|
||||
|
||||
|
|
|
@ -97,7 +97,7 @@ namespace Eval::NNUE {
|
|||
const __m128i k0x80s = _mm_set1_epi8(-128);
|
||||
#endif
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
|
||||
const int8x8_t kZero = {0};
|
||||
#endif
|
||||
|
@ -163,7 +163,7 @@ namespace Eval::NNUE {
|
|||
);
|
||||
}
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
int16x8_t sum = reinterpret_cast<const int16x8_t*>(
|
||||
|
@ -218,7 +218,7 @@ namespace Eval::NNUE {
|
|||
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
|
||||
}
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
auto accumulation = reinterpret_cast<int16x8_t*>(
|
||||
&accumulator.accumulation[perspective][i][0]);
|
||||
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
|
||||
|
@ -261,7 +261,7 @@ namespace Eval::NNUE {
|
|||
auto accumulation = reinterpret_cast<__m128i*>(
|
||||
&accumulator.accumulation[perspective][i][0]);
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
|
||||
auto accumulation = reinterpret_cast<int16x8_t*>(
|
||||
&accumulator.accumulation[perspective][i][0]);
|
||||
|
@ -290,7 +290,7 @@ namespace Eval::NNUE {
|
|||
accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
|
||||
}
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
accumulation[j] = vsubq_s16(accumulation[j], column[j]);
|
||||
|
@ -321,7 +321,7 @@ namespace Eval::NNUE {
|
|||
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
|
||||
}
|
||||
|
||||
#elif defined(IS_ARM)
|
||||
#elif defined(USE_NEON)
|
||||
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
|
||||
|
|
Loading…
Add table
Reference in a new issue