1
0
Fork 0
mirror of https://github.com/sockspls/badfish synced 2025-04-30 08:43:09 +00:00

Add support for SSSE3-only compiles

For Core 2 Duo.

To compile:
make ARCH=x86-64 ssse3=yes nnue

No observable difference in speed to SSE4.1 on my machine.
This commit is contained in:
No name 2020-07-09 23:18:13 +03:00 committed by nodchip
parent b9a32fe331
commit 081761d084
5 changed files with 38 additions and 9 deletions

View file

@ -387,12 +387,20 @@ ifeq ($(avx2),yes)
endif
ifeq ($(sse41),yes)
ssse3 = yes
CXXFLAGS += -DUSE_SSE41
ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
CXXFLAGS += -msse4.1
endif
endif
ifeq ($(ssse3),yes)
CXXFLAGS += -DUSE_SSSE3
ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
CXXFLAGS += -mssse3
endif
endif
ifeq ($(arch),x86_64)
CXXFLAGS += -DUSE_SSE2
endif

View file

@ -86,7 +86,7 @@ class AffineTransform {
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const __m256i kOnes = _mm256_set1_epi16(1);
const auto input_vector = reinterpret_cast<const __m256i*>(input);
#elif defined(USE_SSE41)
#elif defined(USE_SSSE3)
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
const __m128i kOnes = _mm_set1_epi16(1);
const auto input_vector = reinterpret_cast<const __m128i*>(input);
@ -118,7 +118,7 @@ class AffineTransform {
const __m128i lo = _mm256_extracti128_si256(sum, 0);
const __m128i hi = _mm256_extracti128_si256(sum, 1);
output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
#elif defined(USE_SSE41)
#elif defined(USE_SSSE3)
__m128i sum = _mm_cvtsi32_si128(biases_[i]);
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {

View file

@ -110,9 +110,12 @@ class ClippedReLU {
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(USE_SSE41)
#elif defined(USE_SSSE3)
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
const __m128i kZero = _mm_setzero_si128();
#ifndef USE_SSE41
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
const auto in = reinterpret_cast<const __m128i*>(input);
const auto out = reinterpret_cast<__m128i*>(output);
for (IndexType i = 0; i < kNumChunks; ++i) {
@ -122,8 +125,14 @@ class ClippedReLU {
const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
_mm_load_si128(&in[i * 4 + 2]),
_mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
_mm_store_si128(&out[i], _mm_max_epi8(
_mm_packs_epi16(words0, words1), kZero));
const __m128i packedbytes = _mm_packs_epi16(words0, words1);
_mm_store_si128(&out[i],
#ifdef USE_SSE41
_mm_max_epi8(packedbytes, kZero)
#else
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif
);
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
#elif defined(IS_ARM)

View file

@ -9,6 +9,8 @@
#include <immintrin.h>
#elif defined(USE_SSE41)
#include <smmintrin.h>
#elif defined(USE_SSSE3)
#include <tmmintrin.h>
#elif defined(USE_SSE2)
#include <emmintrin.h>
#endif

View file

@ -87,9 +87,12 @@ class FeatureTransformer {
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256();
#elif defined(USE_SSE41)
#elif defined(USE_SSSE3)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
const __m128i kZero = _mm_setzero_si128();
#ifndef USE_SSE41
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
#elif defined(IS_ARM)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
@ -133,7 +136,7 @@ class FeatureTransformer {
(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}
#elif defined(USE_SSE41)
#elif defined(USE_SSSE3)
auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@ -146,8 +149,15 @@ class FeatureTransformer {
sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
_mm_store_si128(&out[j], _mm_max_epi8(
_mm_packs_epi16(sum0, sum1), kZero));
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
_mm_store_si128(&out[j],
#ifdef USE_SSE41
_mm_max_epi8(packedbytes, kZero)
#else
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif
);
}
#elif defined(IS_ARM)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);