From b976f0a101f80d8b80aa212e92d1cc04b12c6136 Mon Sep 17 00:00:00 2001
From: MinetaS
Date: Mon, 29 Jul 2024 12:10:34 +0900
Subject: [PATCH] Move DotProd code into optimized affine layer

This patch moves the DotProd code into the propagation function, which
has the sequential-access optimization.

To demonstrate the speedup, the comparison was done without the sparse
layer; with the sparse layer the effect is marginal (GCC 0.3%,
LLVM/Clang 0.1%). For both tests, the binary was compiled with GCC 14.1,
and each test consisted of 50 runs.

Sparse layer included:
```
speedup        = +0.0030
P(speedup > 0) =  1.0000
```

Sparse layer excluded:
```
speedup        = +0.0561
P(speedup > 0) =  1.0000
```

closes https://github.com/official-stockfish/Stockfish/pull/5520

No functional change
---
 src/nnue/layers/affine_transform.h | 58 +++++++++++++++---------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index ad9167c0..59a6149f 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -39,25 +39,26 @@
 
 namespace Stockfish::Eval::NNUE::Layers {
 
+#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
+    #define ENABLE_SEQ_OPT
+#endif
+
 // Fallback implementation for older/other architectures.
 // Requires the input to be padded to at least 16 values.
-#if !defined(USE_SSSE3)
+#ifndef ENABLE_SEQ_OPT
+
 template<IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
 static void affine_transform_non_ssse3(std::int32_t*       output,
                                        const std::int8_t*  weights,
                                        const std::int32_t* biases,
                                        const std::uint8_t* input) {
-    #if defined(USE_SSE2) || defined(USE_NEON_DOTPROD) || defined(USE_NEON)
+    #if defined(USE_SSE2) || defined(USE_NEON)
         #if defined(USE_SSE2)
     // At least a multiple of 16, with SSE2.
     constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
     const __m128i       Zeros       = _mm_setzero_si128();
     const auto          inputVector = reinterpret_cast<const __m128i*>(input);
 
-        #elif defined(USE_NEON_DOTPROD)
-    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
-    const auto          inputVector = reinterpret_cast<const int8x16_t*>(input);
-
         #elif defined(USE_NEON)
     constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
     const auto          inputVector = reinterpret_cast<const int8x8_t*>(input);
@@ -91,16 +92,8 @@ static void affine_transform_non_ssse3(std::int32_t*       output,
         sum            = _mm_add_epi32(sum, sum_second_32);
         output[i]      = _mm_cvtsi128_si32(sum);
 
-        #elif defined(USE_NEON_DOTPROD)
-        int32x4_t  sum = {biases[i]};
-        const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j)
-        {
-            sum = vdotq_s32(sum, inputVector[j], row[j]);
-        }
-        output[i] = vaddvq_s32(sum);
-
         #elif defined(USE_NEON)
+        int32x4_t  sum = {biases[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
         for (IndexType j = 0; j < NumChunks; ++j)
         {
@@ -127,7 +120,8 @@ static void affine_transform_non_ssse3(std::int32_t*       output,
     }
     #endif
 }
-#endif
+
+#endif  // !ENABLE_SEQ_OPT
 
 template<IndexType InDims, IndexType OutDims>
 class AffineTransform {
@@ -162,7 +156,7 @@ class AffineTransform {
     }
 
     static constexpr IndexType get_weight_index(IndexType i) {
-#if defined(USE_SSSE3)
+#ifdef ENABLE_SEQ_OPT
         return get_weight_index_scrambled(i);
 #else
         return i;
@@ -190,29 +184,28 @@ class AffineTransform {
 
     // Forward propagation
     void propagate(const InputType* input, OutputType* output) const {
-#if defined(USE_SSSE3)
+#ifdef ENABLE_SEQ_OPT
 
         if constexpr (OutputDimensions > 1)
         {
-
     #if defined(USE_AVX512)
             using vec_t = __m512i;
-        #define vec_setzero _mm512_setzero_si512
         #define vec_set_32 _mm512_set1_epi32
         #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
-        #define vec_hadd Simd::m512_hadd
     #elif defined(USE_AVX2)
             using vec_t = __m256i;
-        #define vec_setzero _mm256_setzero_si256
         #define vec_set_32 _mm256_set1_epi32
         #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
-        #define vec_hadd Simd::m256_hadd
     #elif defined(USE_SSSE3)
            using vec_t = __m128i;
-        #define vec_setzero _mm_setzero_si128
         #define vec_set_32 _mm_set1_epi32
         #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
-        #define vec_hadd Simd::m128_hadd
+    #elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+        #define vec_set_32 vdupq_n_s32
+        #define vec_add_dpbusd_32(acc, a, b) \
+            Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                                vreinterpretq_s8_s32(b))
     #endif
 
             static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
@@ -242,28 +235,33 @@ class AffineTransform {
             for (IndexType k = 0; k < NumRegs; ++k)
                 outptr[k] = acc[k];
 
-    #undef vec_setzero
     #undef vec_set_32
     #undef vec_add_dpbusd_32
-    #undef vec_hadd
         }
         else if constexpr (OutputDimensions == 1)
        {
-
    // We cannot use AVX512 for the last layer because there are only 32 inputs
    // and the buffer is not padded to 64 elements.
    #if defined(USE_AVX2)
            using vec_t = __m256i;
-        #define vec_setzero _mm256_setzero_si256
+        #define vec_setzero() _mm256_setzero_si256()
        #define vec_set_32 _mm256_set1_epi32
        #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
        #define vec_hadd Simd::m256_hadd
    #elif defined(USE_SSSE3)
            using vec_t = __m128i;
-        #define vec_setzero _mm_setzero_si128
+        #define vec_setzero() _mm_setzero_si128()
        #define vec_set_32 _mm_set1_epi32
        #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
        #define vec_hadd Simd::m128_hadd
+    #elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+        #define vec_setzero() vdupq_n_s32(0)
+        #define vec_set_32 vdupq_n_s32
+        #define vec_add_dpbusd_32(acc, a, b) \
+            Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                                vreinterpretq_s8_s32(b))
+        #define vec_hadd Simd::neon_m128_hadd
    #endif
 
            const auto inputVector = reinterpret_cast<const vec_t*>(input);
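
Background on the DotProd instruction the patch relies on: `vdotq_s32` multiplies 16 pairs of int8 values and accumulates each group of four products into one 32-bit lane, so a single instruction advances a dot product by 16 elements. The sketch below shows that accumulation pattern in isolation. It is a minimal illustration, not Stockfish code: the name `dotprod_row` and its parameters are invented here, and it assumes the input bytes are in 0..127 (as they are after the clipped ReLU), so reinterpreting them as signed is exact. Build with, e.g., `-march=armv8.2-a+dotprod`.

```
#include <arm_neon.h>
#include <cstdint>

// One output neuron: dot product of a uint8 input vector with one
// int8 weight row, 16 elements per vdotq_s32 instruction.
std::int32_t dotprod_row(const std::uint8_t* input,  // padded to a multiple of 16
                         const std::int8_t*  row,    // one row of the weight matrix
                         std::int32_t        bias,
                         int                 numChunks) {
    int32x4_t sum = {bias, 0, 0, 0};  // lane 0 carries the bias
    for (int j = 0; j < numChunks; ++j)
    {
        // Input values fit in 0..127, so the signed reinterpretation is lossless.
        const int8x16_t in = vreinterpretq_s8_u8(vld1q_u8(input + 16 * j));
        const int8x16_t wt = vld1q_s8(row + 16 * j);
        sum                = vdotq_s32(sum, in, wt);
    }
    return vaddvq_s32(sum);  // horizontal sum of the four lanes
}
```

This row-at-a-time loop is essentially what the deleted fallback branch did; the patch instead routes DotProd builds through the optimized `propagate()`, which reads the weights sequentially.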
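
The speedup itself comes from the memory-access pattern that ENABLE_SEQ_OPT selects rather than from the arithmetic: the optimized layer stores its weights "scrambled" (see `get_weight_index_scrambled`) so the hot loop reads them strictly forward instead of striding row by row. Below is a sketch of that pattern under simplifying assumptions: a toy layer with 4 outputs so one `int32x4_t` holds all accumulators, with `propagate4` and its pre-scrambled `weights` argument being hypothetical, for illustration only.

```
#include <arm_neon.h>
#include <cstdint>
#include <cstring>

// Toy 4-output affine layer. The weights are assumed pre-scrambled:
// for each 4-byte input group, the matching 4 weight bytes of every
// output neuron are stored back to back, so each load below is the
// next 16 sequential bytes of the weight array.
void propagate4(const std::uint8_t* input,    // inputDims bytes (values 0..127)
                const std::int8_t*  weights,  // scrambled, 4 * inputDims bytes
                const std::int32_t* biases,   // 4 entries
                std::int32_t*       output,   // 4 entries
                int                 inputDims) {
    int32x4_t acc = vld1q_s32(biases);
    for (int i = 0; i < inputDims / 4; ++i)
    {
        // Broadcast one 4-byte input group to all four lanes ...
        std::int32_t group;
        std::memcpy(&group, input + 4 * i, 4);
        const int8x16_t in = vreinterpretq_s8_s32(vdupq_n_s32(group));
        // ... and dot it against one 4x4 weight block, one output per lane.
        const int8x16_t wt = vld1q_s8(weights + 16 * i);
        acc                = vdotq_s32(acc, in, wt);
    }
    vst1q_s32(output, acc);
}
```

The real `propagate()` runs the same loop with `NumRegs` accumulators behind the `vec_*` macros, which is why the patch only needs to supply NEON DotProd definitions of `vec_set_32` and `vec_add_dpbusd_32` for DotProd builds to reuse it.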