Use tiling to speed up accumulator refreshes and updates

Perform the update and refresh operations tile by tile in a local
array of vectors. By choosing the array size carefully, we ensure
that the compiler keeps the whole array in vector registers.
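
Reduced to a self-contained sketch, the pattern looks like this
(illustrative only: it assumes AVX2, 32-byte-aligned buffers and a
single tile, and refresh_tile and its parameters are invented names;
the committed code instead abstracts the instruction set behind the
vec_t / vec_load / vec_store / vec_add_16 macros in the diff below):

#include <immintrin.h>
#include <cstdint>

constexpr int kNumRegs = 16;   // matches the 16 ymm registers of x86-64 AVX2
using vec_t = __m256i;         // 16 int16 lanes per register

// Accumulate one tile of kNumRegs * 16 int16 outputs entirely in registers:
// start from the biases, add one weight column per active feature, and touch
// the output memory only once at the end. Assumes 32-byte-aligned pointers;
// the real code loops this over kHalfDimensions / kTileHeight tiles.
void refresh_tile(std::int16_t* out, const std::int16_t* biases,
                  const std::int16_t* weights, const int* features,
                  int numFeatures, int halfDims) {
    vec_t acc[kNumRegs];       // small enough to stay in ymm0-ymm15
    auto b = reinterpret_cast<const vec_t*>(biases);
    for (int k = 0; k < kNumRegs; ++k)
        acc[k] = b[k];
    for (int f = 0; f < numFeatures; ++f) {
        auto column = reinterpret_cast<const vec_t*>(&weights[features[f] * halfDims]);
        for (int k = 0; k < kNumRegs; ++k)
            acc[k] = _mm256_add_epi16(acc[k], column[k]);   // register-resident adds
    }
    auto o = reinterpret_cast<vec_t*>(out);
    for (int k = 0; k < kNumRegs; ++k)
        o[k] = acc[k];         // single store per register at the end
}

With kNumRegs sized to the register file, an optimizing build keeps acc[]
register-resident, so each active feature costs kNumRegs vector adds and no
loads or stores of the accumulator itself.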

Idea and original implementation by @sf-x.

STC: https://tests.stockfishchess.org/tests/view/5f623eec912c15f19854b855
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 4872 W: 623 L: 477 D: 3772
Ptnml(0-2): 14, 350, 1585, 450, 37

LTC: https://tests.stockfishchess.org/tests/view/5f62434e912c15f19854b860
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 25808 W: 1565 L: 1401 D: 22842
Ptnml(0-2): 23, 1186, 10332, 1330, 33

closes https://github.com/official-stockfish/Stockfish/pull/3130

No functional change
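
For concreteness, assuming the HalfKP network current at the time of this
commit (kHalfDimensions = 256): a tile holds kNumRegs * sizeof(vec_t) / 2
int16 values, i.e. 8 * 64 / 2 = 256 for AVX512 and 16 * 32 / 2 = 256 for
AVX2 (one tile spans a whole accumulator half), 16 * 16 / 2 = 128 for SSE2
on 64-bit and for NEON (two tiles), and 8 * 8 / 2 = 32 for MMX (eight tiles).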
Author: syzygy1 (2020-09-16 17:39:11 +02:00), committed by Joost VandeVondele
Parent: 64a63464d7
Commit: 8b8a510fd6

@@ -29,6 +29,56 @@
 namespace Eval::NNUE {
 
+  // If vector instructions are enabled, we update and refresh the
+  // accumulator tile by tile such that each tile fits in the CPU's
+  // vector registers.
+  #define TILING
+
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_loadA_si512(a)
+  #define vec_store(a,b) _mm512_storeA_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif USE_AVX2
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_loadA_si256(a)
+  #define vec_store(a,b) _mm256_storeA_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif USE_SSE2
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif USE_MMX
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif USE_NEON
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef TILING
+  #endif
+
   // Input feature converter
   class FeatureTransformer {
@@ -36,6 +86,11 @@ namespace Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
+  #ifdef TILING
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+  #endif
+
    public:
     // Output type
     using OutputType = TransformedFeatureType;
@@ -189,57 +244,41 @@ namespace Eval::NNUE {
         RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
                                          active_indices);
         for (Color perspective : { WHITE, BLACK }) {
+  #ifdef TILING
+          for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+            auto biasesTile = reinterpret_cast<const vec_t*>(
+                &biases_[j * kTileHeight]);
+            auto accTile = reinterpret_cast<vec_t*>(
+                &accumulator.accumulation[perspective][i][j * kTileHeight]);
+            vec_t acc[kNumRegs];
+
+            for (unsigned k = 0; k < kNumRegs; ++k)
+              acc[k] = biasesTile[k];
+
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (unsigned k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+
+            for (unsigned k = 0; k < kNumRegs; k++)
+              vec_store(&accTile[k], acc[k]);
+          }
+  #else
         std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                    kHalfDimensions * sizeof(BiasType));
+            kHalfDimensions * sizeof(BiasType));
+
         for (const auto index : active_indices[perspective]) {
           const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-          auto accumulation = reinterpret_cast<__m512i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
-  #elif defined(USE_AVX2)
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-  #elif defined(USE_SSE2)
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-  #elif defined(USE_MMX)
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-  #elif defined(USE_NEON)
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-  #else
           for (IndexType j = 0; j < kHalfDimensions; ++j)
             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
         }
+  #endif
       }
   #if defined(USE_MMX)
       _mm_empty();
   #endif
@@ -257,29 +296,55 @@
         bool reset[2];
         RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                           removed_indices, added_indices, reset);
-        for (Color perspective : { WHITE, BLACK }) {
-  #if defined(USE_AVX2)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-  #elif defined(USE_SSE2)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-  #elif defined(USE_MMX)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-  #elif defined(USE_NEON)
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-  #endif
+  #ifdef TILING
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+          for (Color perspective : { WHITE, BLACK }) {
+            auto accTile = reinterpret_cast<vec_t*>(
+                &accumulator.accumulation[perspective][i][j * kTileHeight]);
+            vec_t acc[kNumRegs];
+
+            if (reset[perspective]) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (unsigned k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              auto prevAccTile = reinterpret_cast<const vec_t*>(
+                  &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_load(&prevAccTile[k]);
+
+              // Difference calculation for the deactivated features
+              for (const auto index : removed_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_sub_16(acc[k], column[k]);
+              }
+            }
+            { // Difference calculation for the activated features
+              for (const auto index : added_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+            }
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              vec_store(&accTile[k], acc[k]);
+          }
+        }
+  #if defined(USE_MMX)
+        _mm_empty();
+  #endif
+
+  #else
+        for (Color perspective : { WHITE, BLACK }) {
           if (reset[perspective]) {
             std::memcpy(accumulator.accumulation[perspective][i], biases_,
                 kHalfDimensions * sizeof(BiasType));
@@ -291,67 +356,19 @@ namespace Eval::NNUE {
             for (const auto index : removed_indices[perspective]) {
               const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX2)
-              auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-  #elif defined(USE_SSE2)
-              auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-  #elif defined(USE_MMX)
-              auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-  #elif defined(USE_NEON)
-              auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-  #else
               for (IndexType j = 0; j < kHalfDimensions; ++j)
                 accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-  #endif
             }
           }
           { // Difference calculation for the activated features
             for (const auto index : added_indices[perspective]) {
               const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX2)
-              auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-  #elif defined(USE_SSE2)
-              auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-  #elif defined(USE_MMX)
-              auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-  #elif defined(USE_NEON)
-              auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-              for (IndexType j = 0; j < kNumChunks; ++j)
-                accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-  #else
              for (IndexType j = 0; j < kHalfDimensions; ++j)
                 accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
             }
           }
         }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
+  #endif
       }
 
       accumulator.computed_accumulation = true;
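
In scalar terms (restating the #else fallback above, not new behavior), the
update computes, for every output element j of each half:

    acc[j] = prev[j]
           - sum over removed indices r of weights_[kHalfDimensions * r + j]
           + sum over added indices a of weights_[kHalfDimensions * a + j]

The tiling changes none of this arithmetic; it only blocks j into
kTileHeight-sized chunks so that each chunk stays in vector registers across
all removed and added features.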