mirror of
https://github.com/sockspls/badfish
synced 2025-04-30 16:53:09 +00:00
Add support for SSSE3-only compiles
For Core 2 Duo. To compile: `make ARCH=x86-64 ssse3=yes nnue`. No observable difference in speed compared to SSE4.1 on my machine.
This commit is contained in:
parent
b9a32fe331
commit
081761d084
5 changed files with 38 additions and 9 deletions
|
@ -387,12 +387,20 @@ ifeq ($(avx2),yes)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(sse41),yes)
|
ifeq ($(sse41),yes)
|
||||||
|
ssse3 = yes
|
||||||
CXXFLAGS += -DUSE_SSE41
|
CXXFLAGS += -DUSE_SSE41
|
||||||
ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
|
ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
|
||||||
CXXFLAGS += -msse4.1
|
CXXFLAGS += -msse4.1
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ssse3),yes)
|
||||||
|
CXXFLAGS += -DUSE_SSSE3
|
||||||
|
ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
|
||||||
|
CXXFLAGS += -mssse3
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(arch),x86_64)
|
ifeq ($(arch),x86_64)
|
||||||
CXXFLAGS += -DUSE_SSE2
|
CXXFLAGS += -DUSE_SSE2
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -86,7 +86,7 @@ class AffineTransform {
|
||||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||||
const __m256i kOnes = _mm256_set1_epi16(1);
|
const __m256i kOnes = _mm256_set1_epi16(1);
|
||||||
const auto input_vector = reinterpret_cast<const __m256i*>(input);
|
const auto input_vector = reinterpret_cast<const __m256i*>(input);
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSSE3)
|
||||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||||
const __m128i kOnes = _mm_set1_epi16(1);
|
const __m128i kOnes = _mm_set1_epi16(1);
|
||||||
const auto input_vector = reinterpret_cast<const __m128i*>(input);
|
const auto input_vector = reinterpret_cast<const __m128i*>(input);
|
||||||
|
@ -118,7 +118,7 @@ class AffineTransform {
|
||||||
const __m128i lo = _mm256_extracti128_si256(sum, 0);
|
const __m128i lo = _mm256_extracti128_si256(sum, 0);
|
||||||
const __m128i hi = _mm256_extracti128_si256(sum, 1);
|
const __m128i hi = _mm256_extracti128_si256(sum, 1);
|
||||||
output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
|
output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSSE3)
|
||||||
__m128i sum = _mm_cvtsi32_si128(biases_[i]);
|
__m128i sum = _mm_cvtsi32_si128(biases_[i]);
|
||||||
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
|
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
|
||||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||||
|
|
|
@ -110,9 +110,12 @@ class ClippedReLU {
|
||||||
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
|
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
|
||||||
}
|
}
|
||||||
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSSE3)
|
||||||
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
|
constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
|
||||||
const __m128i kZero = _mm_setzero_si128();
|
const __m128i kZero = _mm_setzero_si128();
|
||||||
|
#ifndef USE_SSE41
|
||||||
|
const __m128i k0x80s = _mm_set1_epi8(-128);
|
||||||
|
#endif
|
||||||
const auto in = reinterpret_cast<const __m128i*>(input);
|
const auto in = reinterpret_cast<const __m128i*>(input);
|
||||||
const auto out = reinterpret_cast<__m128i*>(output);
|
const auto out = reinterpret_cast<__m128i*>(output);
|
||||||
for (IndexType i = 0; i < kNumChunks; ++i) {
|
for (IndexType i = 0; i < kNumChunks; ++i) {
|
||||||
|
@ -122,8 +125,14 @@ class ClippedReLU {
|
||||||
const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
|
const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
|
||||||
_mm_load_si128(&in[i * 4 + 2]),
|
_mm_load_si128(&in[i * 4 + 2]),
|
||||||
_mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
|
_mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
|
||||||
_mm_store_si128(&out[i], _mm_max_epi8(
|
const __m128i packedbytes = _mm_packs_epi16(words0, words1);
|
||||||
_mm_packs_epi16(words0, words1), kZero));
|
_mm_store_si128(&out[i],
|
||||||
|
#ifdef USE_SSE41
|
||||||
|
_mm_max_epi8(packedbytes, kZero)
|
||||||
|
#else
|
||||||
|
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
|
||||||
|
#endif
|
||||||
|
);
|
||||||
}
|
}
|
||||||
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
||||||
#elif defined(IS_ARM)
|
#elif defined(IS_ARM)
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSE41)
|
||||||
#include <smmintrin.h>
|
#include <smmintrin.h>
|
||||||
|
#elif defined(USE_SSSE3)
|
||||||
|
#include <tmmintrin.h>
|
||||||
#elif defined(USE_SSE2)
|
#elif defined(USE_SSE2)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -87,9 +87,12 @@ class FeatureTransformer {
|
||||||
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
|
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
|
||||||
constexpr int kControl = 0b11011000;
|
constexpr int kControl = 0b11011000;
|
||||||
const __m256i kZero = _mm256_setzero_si256();
|
const __m256i kZero = _mm256_setzero_si256();
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSSE3)
|
||||||
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
|
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
|
||||||
const __m128i kZero = _mm_setzero_si128();
|
const __m128i kZero = _mm_setzero_si128();
|
||||||
|
#ifndef USE_SSE41
|
||||||
|
const __m128i k0x80s = _mm_set1_epi8(-128);
|
||||||
|
#endif
|
||||||
#elif defined(IS_ARM)
|
#elif defined(IS_ARM)
|
||||||
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
|
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
|
||||||
const int8x8_t kZero = {0};
|
const int8x8_t kZero = {0};
|
||||||
|
@ -133,7 +136,7 @@ class FeatureTransformer {
|
||||||
(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
|
(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
|
||||||
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
|
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
|
||||||
}
|
}
|
||||||
#elif defined(USE_SSE41)
|
#elif defined(USE_SSSE3)
|
||||||
auto out = reinterpret_cast<__m128i*>(&output[offset]);
|
auto out = reinterpret_cast<__m128i*>(&output[offset]);
|
||||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||||
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
|
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
|
||||||
|
@ -146,8 +149,15 @@ class FeatureTransformer {
|
||||||
sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
|
sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
|
||||||
accumulation[perspectives[p]][i])[j * 2 + 1]);
|
accumulation[perspectives[p]][i])[j * 2 + 1]);
|
||||||
}
|
}
|
||||||
_mm_store_si128(&out[j], _mm_max_epi8(
|
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
|
||||||
_mm_packs_epi16(sum0, sum1), kZero));
|
|
||||||
|
_mm_store_si128(&out[j],
|
||||||
|
#ifdef USE_SSE41
|
||||||
|
_mm_max_epi8(packedbytes, kZero)
|
||||||
|
#else
|
||||||
|
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
|
||||||
|
#endif
|
||||||
|
);
|
||||||
}
|
}
|
||||||
#elif defined(IS_ARM)
|
#elif defined(IS_ARM)
|
||||||
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
|
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
|
||||||
|
|
Loading…
Add table
Reference in a new issue