mirror of
https://github.com/sockspls/badfish
synced 2025-07-11 19:49:14 +00:00
Clean SIMD code a bit
Cleaner vector code structure in feature transformer. This patch just regroups the parts of the inner loop for each SIMD instruction set. Tested for non-regression: LLR: 2.96 (-2.94,2.94) <-2.50,0.50> Total: 115760 W: 9835 L: 9831 D: 96094 Ptnml(0-2): 326, 7776, 41715, 7694, 369 https://tests.stockfishchess.org/tests/view/60b96b39457376eb8bcaa26e It would be nice if a future patch could use some of the macros at the top of the file to unify the code between the distinct SIMD instruction sets (of course, unifying the ReLU will be the challenge). closes https://github.com/official-stockfish/Stockfish/pull/3506 No functional change
This commit is contained in:
parent
4445965f97
commit
8f081c86f7
1 changed file with 110 additions and 84 deletions
|
@ -180,118 +180,144 @@ namespace Stockfish::Eval::NNUE {
|
||||||
const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
|
const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
|
||||||
|
|
||||||
const auto psqt = (
|
const auto psqt = (
|
||||||
psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
|
psqtAccumulation[perspectives[0]][bucket]
|
||||||
- psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
|
- psqtAccumulation[perspectives[1]][bucket]
|
||||||
) / 2;
|
) / 2;
|
||||||
|
|
||||||
|
|
||||||
#if defined(USE_AVX512)
|
#if defined(USE_AVX512)
|
||||||
|
|
||||||
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
|
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
|
||||||
static_assert(HalfDimensions % (SimdWidth * 2) == 0);
|
static_assert(HalfDimensions % (SimdWidth * 2) == 0);
|
||||||
const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
|
const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
|
||||||
const __m512i Zero = _mm512_setzero_si512();
|
const __m512i Zero = _mm512_setzero_si512();
|
||||||
|
|
||||||
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * p;
|
||||||
|
auto out = reinterpret_cast<__m512i*>(&output[offset]);
|
||||||
|
for (IndexType j = 0; j < NumChunks; ++j)
|
||||||
|
{
|
||||||
|
__m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 0]);
|
||||||
|
__m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 1]);
|
||||||
|
|
||||||
|
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
|
||||||
|
_mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return psqt;
|
||||||
|
|
||||||
#elif defined(USE_AVX2)
|
#elif defined(USE_AVX2)
|
||||||
|
|
||||||
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
||||||
constexpr int Control = 0b11011000;
|
constexpr int Control = 0b11011000;
|
||||||
const __m256i Zero = _mm256_setzero_si256();
|
const __m256i Zero = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * p;
|
||||||
|
auto out = reinterpret_cast<__m256i*>(&output[offset]);
|
||||||
|
for (IndexType j = 0; j < NumChunks; ++j)
|
||||||
|
{
|
||||||
|
__m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 0]);
|
||||||
|
__m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 1]);
|
||||||
|
|
||||||
|
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
|
||||||
|
_mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return psqt;
|
||||||
|
|
||||||
#elif defined(USE_SSE2)
|
#elif defined(USE_SSE2)
|
||||||
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
|
||||||
|
|
||||||
#ifdef USE_SSE41
|
#ifdef USE_SSE41
|
||||||
|
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
||||||
const __m128i Zero = _mm_setzero_si128();
|
const __m128i Zero = _mm_setzero_si128();
|
||||||
#else
|
#else
|
||||||
|
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
||||||
const __m128i k0x80s = _mm_set1_epi8(-128);
|
const __m128i k0x80s = _mm_set1_epi8(-128);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * p;
|
||||||
|
auto out = reinterpret_cast<__m128i*>(&output[offset]);
|
||||||
|
for (IndexType j = 0; j < NumChunks; ++j)
|
||||||
|
{
|
||||||
|
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 0]);
|
||||||
|
__m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
|
||||||
|
(accumulation[perspectives[p]])[j * 2 + 1]);
|
||||||
|
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
|
||||||
|
|
||||||
|
#ifdef USE_SSE41
|
||||||
|
_mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
|
||||||
|
#else
|
||||||
|
_mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return psqt;
|
||||||
|
|
||||||
#elif defined(USE_MMX)
|
#elif defined(USE_MMX)
|
||||||
|
|
||||||
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
|
||||||
const __m64 k0x80s = _mm_set1_pi8(-128);
|
const __m64 k0x80s = _mm_set1_pi8(-128);
|
||||||
|
|
||||||
#elif defined(USE_NEON)
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
|
{
|
||||||
const int8x8_t Zero = {0};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (IndexType p = 0; p < 2; ++p) {
|
|
||||||
const IndexType offset = HalfDimensions * p;
|
const IndexType offset = HalfDimensions * p;
|
||||||
|
|
||||||
#if defined(USE_AVX512)
|
|
||||||
auto out = reinterpret_cast<__m512i*>(&output[offset]);
|
|
||||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
|
||||||
__m512i sum0 = _mm512_load_si512(
|
|
||||||
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 0]);
|
|
||||||
__m512i sum1 = _mm512_load_si512(
|
|
||||||
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 1]);
|
|
||||||
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
|
|
||||||
_mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_AVX2)
|
|
||||||
auto out = reinterpret_cast<__m256i*>(&output[offset]);
|
|
||||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
|
||||||
__m256i sum0 = _mm256_load_si256(
|
|
||||||
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 0]);
|
|
||||||
__m256i sum1 = _mm256_load_si256(
|
|
||||||
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 1]);
|
|
||||||
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
|
|
||||||
_mm256_packs_epi16(sum0, sum1), Zero), Control));
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_SSE2)
|
|
||||||
auto out = reinterpret_cast<__m128i*>(&output[offset]);
|
|
||||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
|
||||||
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
|
|
||||||
accumulation[perspectives[p]])[j * 2 + 0]);
|
|
||||||
__m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
|
|
||||||
accumulation[perspectives[p]])[j * 2 + 1]);
|
|
||||||
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
|
|
||||||
|
|
||||||
_mm_store_si128(&out[j],
|
|
||||||
|
|
||||||
#ifdef USE_SSE41
|
|
||||||
_mm_max_epi8(packedbytes, Zero)
|
|
||||||
#else
|
|
||||||
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(USE_MMX)
|
|
||||||
auto out = reinterpret_cast<__m64*>(&output[offset]);
|
auto out = reinterpret_cast<__m64*>(&output[offset]);
|
||||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
for (IndexType j = 0; j < NumChunks; ++j)
|
||||||
__m64 sum0 = *(&reinterpret_cast<const __m64*>(
|
{
|
||||||
accumulation[perspectives[p]])[j * 2 + 0]);
|
__m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
|
||||||
__m64 sum1 = *(&reinterpret_cast<const __m64*>(
|
__m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
|
||||||
accumulation[perspectives[p]])[j * 2 + 1]);
|
|
||||||
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
|
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
|
||||||
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
|
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
_mm_empty();
|
||||||
|
return psqt;
|
||||||
|
|
||||||
#elif defined(USE_NEON)
|
#elif defined(USE_NEON)
|
||||||
|
|
||||||
|
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
|
||||||
|
const int8x8_t Zero = {0};
|
||||||
|
|
||||||
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * p;
|
||||||
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
|
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
|
||||||
for (IndexType j = 0; j < NumChunks; ++j) {
|
for (IndexType j = 0; j < NumChunks; ++j)
|
||||||
int16x8_t sum = reinterpret_cast<const int16x8_t*>(
|
{
|
||||||
accumulation[perspectives[p]])[j];
|
int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
|
||||||
out[j] = vmax_s8(vqmovn_s16(sum), Zero);
|
out[j] = vmax_s8(vqmovn_s16(sum), Zero);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
return psqt;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for (IndexType j = 0; j < HalfDimensions; ++j) {
|
|
||||||
BiasType sum = accumulation[static_cast<int>(perspectives[p])][j];
|
|
||||||
output[offset + j] = static_cast<OutputType>(
|
|
||||||
std::max<int>(0, std::min<int>(127, sum)));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
for (IndexType p = 0; p < 2; ++p)
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * p;
|
||||||
|
for (IndexType j = 0; j < HalfDimensions; ++j)
|
||||||
|
{
|
||||||
|
BiasType sum = accumulation[perspectives[p]][j];
|
||||||
|
output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#if defined(USE_MMX)
|
|
||||||
_mm_empty();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return psqt;
|
return psqt;
|
||||||
}
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // end of function transform()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void update_accumulator(const Position& pos, const Color perspective) const {
|
void update_accumulator(const Position& pos, const Color perspective) const {
|
||||||
|
|
Loading…
Add table
Reference in a new issue