diff --git a/src/Makefile b/src/Makefile index eaa4c867..aa5cc1ba 100644 --- a/src/Makefile +++ b/src/Makefile @@ -387,12 +387,20 @@ ifeq ($(avx2),yes) endif ifeq ($(sse41),yes) + ssse3 = yes CXXFLAGS += -DUSE_SSE41 ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2)) CXXFLAGS += -msse4.1 endif endif +ifeq ($(ssse3),yes) + CXXFLAGS += -DUSE_SSSE3 + ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2)) + CXXFLAGS += -mssse3 + endif +endif + ifeq ($(arch),x86_64) CXXFLAGS += -DUSE_SSE2 endif diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h index c06af1a0..cb56b07d 100644 --- a/src/eval/nnue/layers/affine_transform.h +++ b/src/eval/nnue/layers/affine_transform.h @@ -86,7 +86,7 @@ class AffineTransform { constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const __m256i kOnes = _mm256_set1_epi16(1); const auto input_vector = reinterpret_cast(input); -#elif defined(USE_SSE41) +#elif defined(USE_SSSE3) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; const __m128i kOnes = _mm_set1_epi16(1); const auto input_vector = reinterpret_cast(input); @@ -118,7 +118,7 @@ class AffineTransform { const __m128i lo = _mm256_extracti128_si256(sum, 0); const __m128i hi = _mm256_extracti128_si256(sum, 1); output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi); -#elif defined(USE_SSE41) +#elif defined(USE_SSSE3) __m128i sum = _mm_cvtsi32_si128(biases_[i]); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h index 7c5c1f75..fe4bedaa 100644 --- a/src/eval/nnue/layers/clipped_relu.h +++ b/src/eval/nnue/layers/clipped_relu.h @@ -110,9 +110,12 @@ class ClippedReLU { _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; -#elif defined(USE_SSE41) +#elif defined(USE_SSSE3) constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth; const __m128i kZero = _mm_setzero_si128(); +#ifndef USE_SSE41 + const __m128i k0x80s = _mm_set1_epi8(-128); +#endif const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m128i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { @@ -122,8 +125,14 @@ class ClippedReLU { const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32( _mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits); - _mm_store_si128(&out[i], _mm_max_epi8( - _mm_packs_epi16(words0, words1), kZero)); + const __m128i packedbytes = _mm_packs_epi16(words0, words1); + _mm_store_si128(&out[i], +#ifdef USE_SSE41 + _mm_max_epi8(packedbytes, kZero) +#else + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) +#endif + ); } constexpr IndexType kStart = kNumChunks * kSimdWidth; #elif defined(IS_ARM) diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h index bb52bdfe..cffb0098 100644 --- a/src/eval/nnue/nnue_common.h +++ b/src/eval/nnue/nnue_common.h @@ -9,6 +9,8 @@ #include #elif defined(USE_SSE41) #include +#elif defined(USE_SSSE3) +#include #elif defined(USE_SSE2) #include #endif diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h index 27bbb562..bb1a50bc 100644 --- a/src/eval/nnue/nnue_feature_transformer.h +++ b/src/eval/nnue/nnue_feature_transformer.h @@ -87,9 +87,12 @@ class FeatureTransformer { constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); -#elif defined(USE_SSE41) +#elif defined(USE_SSSE3) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; const __m128i kZero = _mm_setzero_si128(); +#ifndef USE_SSE41 + const __m128i k0x80s = _mm_set1_epi8(-128); +#endif #elif defined(IS_ARM) constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); const int8x8_t kZero = {0}; @@ -133,7 +136,7 @@ class FeatureTransformer { (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } -#elif defined(USE_SSE41) +#elif defined(USE_SSSE3) auto out = reinterpret_cast<__m128i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m128i sum0 = _mm_load_si128(&reinterpret_cast( @@ -146,8 +149,15 @@ class FeatureTransformer { sum1 = _mm_add_epi16(sum1, reinterpret_cast( accumulation[perspectives[p]][i])[j * 2 + 1]); } - _mm_store_si128(&out[j], _mm_max_epi8( - _mm_packs_epi16(sum0, sum1), kZero)); + const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); + + _mm_store_si128(&out[j], +#ifdef USE_SSE41 + _mm_max_epi8(packedbytes, kZero) +#else + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) +#endif + ); } #elif defined(IS_ARM) const auto out = reinterpret_cast(&output[offset]);