mirror of
https://github.com/sockspls/badfish
synced 2025-04-30 08:43:09 +00:00
Prevent usage of AVX-512 for the last layer.
Add more static checks regarding the SIMD width match.
STC: https://tests.stockfishchess.org/tests/view/64f5c568a9bc5a78c669e70e
LLR: 2.95 (-2.94,2.94) <-1.75,0.25>
Total: 125216 W: 31756 L: 31636 D: 61824
Ptnml(0-2): 327, 13993, 33848, 14113, 327
Fixes a bug introduced in 2f2f45f
, where with AVX-512 the weights and input to
the last layer were being read out of bounds. Now AVX-512 is only used for the
layers it can be used for. Additional static assertions have been added to
prevent more errors like this in the future.
closes https://github.com/official-stockfish/Stockfish/pull/4773
No functional change
This commit is contained in:
parent
a8b4fd1671
commit
1461d861c8
1 changed files with 41 additions and 9 deletions
|
@ -210,6 +210,11 @@ namespace Stockfish::Eval::NNUE::Layers {
|
||||||
void propagate(
|
void propagate(
|
||||||
const InputType* input, OutputType* output) const {
|
const InputType* input, OutputType* output) const {
|
||||||
|
|
||||||
|
#if defined (USE_SSSE3)
|
||||||
|
|
||||||
|
if constexpr (OutputDimensions > 1)
|
||||||
|
{
|
||||||
|
|
||||||
#if defined (USE_AVX512)
|
#if defined (USE_AVX512)
|
||||||
using vec_t = __m512i;
|
using vec_t = __m512i;
|
||||||
#define vec_setzero _mm512_setzero_si512
|
#define vec_setzero _mm512_setzero_si512
|
||||||
|
@ -233,15 +238,10 @@ namespace Stockfish::Eval::NNUE::Layers {
|
||||||
#define vec_hadd Simd::m128_hadd
|
#define vec_hadd Simd::m128_hadd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined (USE_SSSE3)
|
static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
|
||||||
const auto inputVector = reinterpret_cast<const vec_t*>(input);
|
|
||||||
|
|
||||||
static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
|
static_assert(OutputDimensions % OutputSimdWidth == 0);
|
||||||
|
|
||||||
static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
|
|
||||||
|
|
||||||
if constexpr (OutputDimensions % OutputSimdWidth == 0)
|
|
||||||
{
|
|
||||||
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;
|
constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;
|
||||||
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
|
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
|
||||||
|
|
||||||
|
@ -264,10 +264,41 @@ namespace Stockfish::Eval::NNUE::Layers {
|
||||||
vec_t* outptr = reinterpret_cast<vec_t*>(output);
|
vec_t* outptr = reinterpret_cast<vec_t*>(output);
|
||||||
for (IndexType k = 0; k < NumRegs; ++k)
|
for (IndexType k = 0; k < NumRegs; ++k)
|
||||||
outptr[k] = acc[k];
|
outptr[k] = acc[k];
|
||||||
|
|
||||||
|
# undef vec_setzero
|
||||||
|
# undef vec_set_32
|
||||||
|
# undef vec_add_dpbusd_32
|
||||||
|
# undef vec_add_dpbusd_32x2
|
||||||
|
# undef vec_hadd
|
||||||
|
|
||||||
}
|
}
|
||||||
else if constexpr (OutputDimensions == 1)
|
else if constexpr (OutputDimensions == 1)
|
||||||
{
|
{
|
||||||
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
|
|
||||||
|
// We cannot use AVX512 for the last layer because there's only 32 inputs and the buffer is not padded to 64 elements.
|
||||||
|
#if defined (USE_AVX2)
|
||||||
|
using vec_t = __m256i;
|
||||||
|
#define vec_setzero _mm256_setzero_si256
|
||||||
|
#define vec_set_32 _mm256_set1_epi32
|
||||||
|
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
|
||||||
|
#define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
|
||||||
|
#define vec_hadd Simd::m256_hadd
|
||||||
|
#elif defined (USE_SSSE3)
|
||||||
|
using vec_t = __m128i;
|
||||||
|
#define vec_setzero _mm_setzero_si128
|
||||||
|
#define vec_set_32 _mm_set1_epi32
|
||||||
|
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
|
||||||
|
#define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
|
||||||
|
#define vec_hadd Simd::m128_hadd
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const auto inputVector = reinterpret_cast<const vec_t*>(input);
|
||||||
|
|
||||||
|
static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType);
|
||||||
|
|
||||||
|
static_assert(PaddedInputDimensions % InputSimdWidth == 0);
|
||||||
|
|
||||||
|
constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth;
|
||||||
vec_t sum0 = vec_setzero();
|
vec_t sum0 = vec_setzero();
|
||||||
const auto row0 = reinterpret_cast<const vec_t*>(&weights[0]);
|
const auto row0 = reinterpret_cast<const vec_t*>(&weights[0]);
|
||||||
|
|
||||||
|
@ -277,13 +308,14 @@ namespace Stockfish::Eval::NNUE::Layers {
|
||||||
vec_add_dpbusd_32(sum0, in, row0[j]);
|
vec_add_dpbusd_32(sum0, in, row0[j]);
|
||||||
}
|
}
|
||||||
output[0] = vec_hadd(sum0, biases[0]);
|
output[0] = vec_hadd(sum0, biases[0]);
|
||||||
}
|
|
||||||
|
|
||||||
# undef vec_setzero
|
# undef vec_setzero
|
||||||
# undef vec_set_32
|
# undef vec_set_32
|
||||||
# undef vec_add_dpbusd_32
|
# undef vec_add_dpbusd_32
|
||||||
# undef vec_add_dpbusd_32x2
|
# undef vec_add_dpbusd_32x2
|
||||||
# undef vec_hadd
|
# undef vec_hadd
|
||||||
|
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
// Use old implementation for the other architectures.
|
// Use old implementation for the other architectures.
|
||||||
affine_transform_non_ssse3<
|
affine_transform_non_ssse3<
|
||||||
|
|
Loading…
Add table
Reference in a new issue