mirror of
https://github.com/sockspls/badfish
synced 2025-07-11 19:49:14 +00:00
Optimize the most common update accumulator cases w/o tiling
In the most common case, where we only update a single state, it's faster not to use temporary accumulation registers and tiling. (Also includes a couple of small cleanups.) Passed STC: https://tests.stockfishchess.org/tests/view/651918e3cff46e538ee0023b LLR: 2.95 (-2.94,2.94) <0.00,2.00> Total: 34944 W: 8989 L: 8687 D: 17268 Ptnml(0-2): 88, 3743, 9512, 4037, 92. A simpler version, https://tests.stockfishchess.org/tests/view/65190dfacff46e538ee00155, also passed, but this version is stronger still: https://tests.stockfishchess.org/tests/view/6519b95fcff46e538ee00fa2. Closes https://github.com/official-stockfish/Stockfish/pull/4816. No functional change.
This commit is contained in:
parent
040dfedb34
commit
c17a657b04
2 changed files with 122 additions and 59 deletions
|
@ -87,6 +87,7 @@ public:
|
||||||
void push_back(const T& value) { values_[size_++] = value; }
|
void push_back(const T& value) { values_[size_++] = value; }
|
||||||
const T* begin() const { return values_; }
|
const T* begin() const { return values_; }
|
||||||
const T* end() const { return values_ + size_; }
|
const T* end() const { return values_ + size_; }
|
||||||
|
const T& operator[](int index) const { return values_[index]; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
T values_[MaxSize];
|
T values_[MaxSize];
|
||||||
|
|
|
@ -370,13 +370,13 @@ namespace Stockfish::Eval::NNUE {
|
||||||
while (states_to_update[i] == nullptr)
|
while (states_to_update[i] == nullptr)
|
||||||
--i;
|
--i;
|
||||||
|
|
||||||
StateInfo *st2 = states_to_update[i];
|
StateInfo* st2 = states_to_update[i];
|
||||||
|
|
||||||
for (; i >= 0; --i)
|
for (; i >= 0; --i)
|
||||||
{
|
{
|
||||||
states_to_update[i]->accumulator.computed[Perspective] = true;
|
states_to_update[i]->accumulator.computed[Perspective] = true;
|
||||||
|
|
||||||
StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
|
const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
|
||||||
|
|
||||||
for (; st2 != end_state; st2 = st2->previous)
|
for (; st2 != end_state; st2 = st2->previous)
|
||||||
FeatureSet::append_changed_indices<Perspective>(
|
FeatureSet::append_changed_indices<Perspective>(
|
||||||
|
@ -388,78 +388,140 @@ namespace Stockfish::Eval::NNUE {
|
||||||
|
|
||||||
// Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
|
// Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
|
||||||
#ifdef VECTOR
|
#ifdef VECTOR
|
||||||
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
|
|
||||||
|
if ( states_to_update[1] == nullptr
|
||||||
|
&& (removed[0].size() == 1 || removed[0].size() == 2)
|
||||||
|
&& added[0].size() == 1)
|
||||||
{
|
{
|
||||||
// Load accumulator
|
assert(states_to_update[0]);
|
||||||
auto accTile = reinterpret_cast<vec_t*>(
|
|
||||||
&st->accumulator.accumulation[Perspective][j * TileHeight]);
|
|
||||||
for (IndexType k = 0; k < NumRegs; ++k)
|
|
||||||
acc[k] = vec_load(&accTile[k]);
|
|
||||||
|
|
||||||
for (IndexType i = 0; states_to_update[i]; ++i)
|
auto accTileIn = reinterpret_cast<const vec_t*>(
|
||||||
{
|
&st->accumulator.accumulation[Perspective][0]);
|
||||||
// Difference calculation for the deactivated features
|
auto accTileOut = reinterpret_cast<vec_t*>(
|
||||||
for (const auto index : removed[i])
|
&states_to_update[0]->accumulator.accumulation[Perspective][0]);
|
||||||
|
|
||||||
|
const IndexType offsetR0 = HalfDimensions * removed[0][0];
|
||||||
|
auto columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]);
|
||||||
|
const IndexType offsetA = HalfDimensions * added[0][0];
|
||||||
|
auto columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
|
||||||
|
|
||||||
|
if (removed[0].size() == 1)
|
||||||
{
|
{
|
||||||
const IndexType offset = HalfDimensions * index + j * TileHeight;
|
for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); ++k)
|
||||||
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
|
accTileOut[k] = vec_add_16(vec_sub_16(accTileIn[k], columnR0[k]), columnA[k]);
|
||||||
for (IndexType k = 0; k < NumRegs; ++k)
|
}
|
||||||
acc[k] = vec_sub_16(acc[k], column[k]);
|
else
|
||||||
|
{
|
||||||
|
const IndexType offsetR1 = HalfDimensions * removed[0][1];
|
||||||
|
auto columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
|
||||||
|
|
||||||
|
for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); ++k)
|
||||||
|
accTileOut[k] = vec_sub_16(
|
||||||
|
vec_add_16(accTileIn[k], columnA[k]),
|
||||||
|
vec_add_16(columnR0[k], columnR1[k]));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Difference calculation for the activated features
|
auto accTilePsqtIn = reinterpret_cast<const psqt_vec_t*>(
|
||||||
for (const auto index : added[i])
|
&st->accumulator.psqtAccumulation[Perspective][0]);
|
||||||
{
|
auto accTilePsqtOut = reinterpret_cast<psqt_vec_t*>(
|
||||||
const IndexType offset = HalfDimensions * index + j * TileHeight;
|
&states_to_update[0]->accumulator.psqtAccumulation[Perspective][0]);
|
||||||
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
|
|
||||||
for (IndexType k = 0; k < NumRegs; ++k)
|
|
||||||
acc[k] = vec_add_16(acc[k], column[k]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store accumulator
|
const IndexType offsetPsqtR0 = PSQTBuckets * removed[0][0];
|
||||||
accTile = reinterpret_cast<vec_t*>(
|
auto columnPsqtR0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR0]);
|
||||||
&states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]);
|
const IndexType offsetPsqtA = PSQTBuckets * added[0][0];
|
||||||
for (IndexType k = 0; k < NumRegs; ++k)
|
auto columnPsqtA = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA]);
|
||||||
vec_store(&accTile[k], acc[k]);
|
|
||||||
}
|
if (removed[0].size() == 1)
|
||||||
|
{
|
||||||
|
for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); ++k)
|
||||||
|
accTilePsqtOut[k] = vec_add_psqt_32(vec_sub_psqt_32(
|
||||||
|
accTilePsqtIn[k], columnPsqtR0[k]), columnPsqtA[k]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const IndexType offsetPsqtR1 = PSQTBuckets * removed[0][1];
|
||||||
|
auto columnPsqtR1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR1]);
|
||||||
|
|
||||||
|
for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); ++k)
|
||||||
|
accTilePsqtOut[k] = vec_sub_psqt_32(
|
||||||
|
vec_add_psqt_32(accTilePsqtIn[k], columnPsqtA[k]),
|
||||||
|
vec_add_psqt_32(columnPsqtR0[k], columnPsqtR1[k]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
|
|
||||||
{
|
{
|
||||||
// Load accumulator
|
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
|
||||||
auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
|
|
||||||
&st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
|
|
||||||
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
|
||||||
psqt[k] = vec_load_psqt(&accTilePsqt[k]);
|
|
||||||
|
|
||||||
for (IndexType i = 0; states_to_update[i]; ++i)
|
|
||||||
{
|
|
||||||
// Difference calculation for the deactivated features
|
|
||||||
for (const auto index : removed[i])
|
|
||||||
{
|
{
|
||||||
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
|
// Load accumulator
|
||||||
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
|
auto accTileIn = reinterpret_cast<const vec_t*>(
|
||||||
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
&st->accumulator.accumulation[Perspective][j * TileHeight]);
|
||||||
psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
|
for (IndexType k = 0; k < NumRegs; ++k)
|
||||||
|
acc[k] = vec_load(&accTileIn[k]);
|
||||||
|
|
||||||
|
for (IndexType i = 0; states_to_update[i]; ++i)
|
||||||
|
{
|
||||||
|
// Difference calculation for the deactivated features
|
||||||
|
for (const auto index : removed[i])
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * index + j * TileHeight;
|
||||||
|
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
|
||||||
|
for (IndexType k = 0; k < NumRegs; ++k)
|
||||||
|
acc[k] = vec_sub_16(acc[k], column[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Difference calculation for the activated features
|
||||||
|
for (const auto index : added[i])
|
||||||
|
{
|
||||||
|
const IndexType offset = HalfDimensions * index + j * TileHeight;
|
||||||
|
auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
|
||||||
|
for (IndexType k = 0; k < NumRegs; ++k)
|
||||||
|
acc[k] = vec_add_16(acc[k], column[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store accumulator
|
||||||
|
auto accTileOut = reinterpret_cast<vec_t*>(
|
||||||
|
&states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]);
|
||||||
|
for (IndexType k = 0; k < NumRegs; ++k)
|
||||||
|
vec_store(&accTileOut[k], acc[k]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Difference calculation for the activated features
|
for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
|
||||||
for (const auto index : added[i])
|
|
||||||
{
|
{
|
||||||
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
|
// Load accumulator
|
||||||
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
|
auto accTilePsqtIn = reinterpret_cast<const psqt_vec_t*>(
|
||||||
|
&st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
|
||||||
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
||||||
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
|
psqt[k] = vec_load_psqt(&accTilePsqtIn[k]);
|
||||||
}
|
|
||||||
|
|
||||||
// Store accumulator
|
for (IndexType i = 0; states_to_update[i]; ++i)
|
||||||
accTilePsqt = reinterpret_cast<psqt_vec_t*>(
|
{
|
||||||
&states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
|
// Difference calculation for the deactivated features
|
||||||
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
for (const auto index : removed[i])
|
||||||
vec_store_psqt(&accTilePsqt[k], psqt[k]);
|
{
|
||||||
}
|
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
|
||||||
|
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
|
||||||
|
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
||||||
|
psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Difference calculation for the activated features
|
||||||
|
for (const auto index : added[i])
|
||||||
|
{
|
||||||
|
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
|
||||||
|
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
|
||||||
|
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
||||||
|
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store accumulator
|
||||||
|
auto accTilePsqtOut = reinterpret_cast<psqt_vec_t*>(
|
||||||
|
&states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
|
||||||
|
for (std::size_t k = 0; k < NumPsqtRegs; ++k)
|
||||||
|
vec_store_psqt(&accTilePsqtOut[k], psqt[k]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for (IndexType i = 0; states_to_update[i]; ++i)
|
for (IndexType i = 0; states_to_update[i]; ++i)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Reference in a new issue