diff --git a/.github/ci/libcxx17.imp b/.github/ci/libcxx17.imp index 7bdcf5bc..d3a262b5 100644 --- a/.github/ci/libcxx17.imp +++ b/.github/ci/libcxx17.imp @@ -7,6 +7,7 @@ { include: [ "<__fwd/sstream.h>", private, "", public ] }, { include: [ "<__fwd/streambuf.h>", private, "", public ] }, { include: [ "<__fwd/string_view.h>", private, "", public ] }, + { include: [ "<__system_error/errc.h>", private, "", public ] }, # Mappings for includes between public headers { include: [ "", public, "", public ] }, diff --git a/.github/workflows/arm_compilation.yml b/.github/workflows/arm_compilation.yml index 3934ac2d..5bf2a93e 100644 --- a/.github/workflows/arm_compilation.yml +++ b/.github/workflows/arm_compilation.yml @@ -26,6 +26,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + persist-credentials: false - name: Download required linux packages if: runner.os == 'Linux' @@ -91,4 +92,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ matrix.config.simple_name }} ${{ matrix.binaries }} - path: . + path: | + . + !.git + !.output diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index e20e0d5d..452c2f2a 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -11,6 +11,10 @@ on: paths: - "**.cpp" - "**.h" + +permissions: + pull-requests: write + jobs: Clang-Format: name: Clang-Format @@ -25,7 +29,7 @@ jobs: id: clang-format continue-on-error: true with: - clang-format-version: "17" + clang-format-version: "18" exclude-regex: "incbin" - name: Comment on PR @@ -33,12 +37,13 @@ jobs: uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # @v2.5.0 with: message: | - clang-format 17 needs to be run on this PR. + clang-format 18 needs to be run on this PR. If you do not have clang-format installed, the maintainer will run it when merging. - For the exact version please see https://packages.ubuntu.com/mantic/clang-format-17. + For the exact version please see https://packages.ubuntu.com/noble/clang-format-18. _(execution **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_ comment_tag: execution + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Comment on PR if: steps.clang-format.outcome != 'failure' @@ -49,3 +54,4 @@ jobs: create_if_not_exists: false comment_tag: execution mode: delete + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index d949a5a7..d01ed41f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -30,6 +30,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + persist-credentials: false # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/compilation.yml b/.github/workflows/compilation.yml index 3524d5e9..5878adec 100644 --- a/.github/workflows/compilation.yml +++ b/.github/workflows/compilation.yml @@ -25,6 +25,8 @@ jobs: shell: ${{ matrix.config.shell }} steps: - uses: actions/checkout@v4 + with: + persist-credentials: false - name: Install fixed GCC on Linux if: runner.os == 'Linux' @@ -86,4 +88,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ matrix.config.simple_name }} ${{ matrix.binaries }} - path: . + path: | + . 
+ !.git + !.output diff --git a/.github/workflows/games.yml b/.github/workflows/games.yml new file mode 100644 index 00000000..f0bca442 --- /dev/null +++ b/.github/workflows/games.yml @@ -0,0 +1,43 @@ +# This workflow will play games with a debug enabled SF using the PR + +name: Games +on: + workflow_call: +jobs: + Matetrack: + name: Games + runs-on: ubuntu-22.04 + steps: + - name: Checkout SF repo + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: Stockfish + persist-credentials: false + + - name: build debug enabled version of SF + working-directory: Stockfish/src + run: make -j build debug=yes + + - name: Checkout fast-chess repo + uses: actions/checkout@v4 + with: + repository: Disservin/fast-chess + path: fast-chess + ref: d54af1910d5479c669dc731f1f54f9108a251951 + persist-credentials: false + + - name: fast-chess build + working-directory: fast-chess + run: make -j + + - name: Run games + working-directory: fast-chess + run: | + ./fast-chess -rounds 4 -games 2 -repeat -concurrency 4 -openings file=app/tests/data/openings.epd format=epd order=random -srand $RANDOM\ + -engine name=sf1 cmd=/home/runner/work/Stockfish/Stockfish/Stockfish/src/stockfish\ + -engine name=sf2 cmd=/home/runner/work/Stockfish/Stockfish/Stockfish/src/stockfish\ + -ratinginterval 1 -report penta=true -each proto=uci tc=4+0.04 -log file=fast.log | tee fast.out + cat fast.log + ! grep "Assertion" fast.log > /dev/null + ! grep "disconnect" fast.out > /dev/null diff --git a/.github/workflows/iwyu.yml b/.github/workflows/iwyu.yml index 0552a598..f8898b1c 100644 --- a/.github/workflows/iwyu.yml +++ b/.github/workflows/iwyu.yml @@ -14,6 +14,7 @@ jobs: uses: actions/checkout@v4 with: path: Stockfish + persist-credentials: false - name: Checkout include-what-you-use uses: actions/checkout@v4 @@ -21,6 +22,7 @@ jobs: repository: include-what-you-use/include-what-you-use ref: f25caa280dc3277c4086ec345ad279a2463fea0f path: include-what-you-use + persist-credentials: false - name: Download required linux packages run: | diff --git a/.github/workflows/matetrack.yml b/.github/workflows/matetrack.yml new file mode 100644 index 00000000..dc8dff8d --- /dev/null +++ b/.github/workflows/matetrack.yml @@ -0,0 +1,54 @@ +# This workflow will run matetrack on the PR + +name: Matetrack +on: + workflow_call: +jobs: + Matetrack: + name: Matetrack + runs-on: ubuntu-22.04 + steps: + - name: Checkout SF repo + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: Stockfish + persist-credentials: false + + - name: build SF + working-directory: Stockfish/src + run: make -j profile-build + + - name: Checkout matetrack repo + uses: actions/checkout@v4 + with: + repository: vondele/matetrack + path: matetrack + ref: 814160f82e6428ed2f6522dc06c2a6fa539cd413 + persist-credentials: false + + - name: matetrack install deps + working-directory: matetrack + run: pip install -r requirements.txt + + - name: cache syzygy + id: cache-syzygy + uses: actions/cache@v4 + with: + path: | + matetrack/3-4-5-wdl/ + matetrack/3-4-5-dtz/ + key: key-syzygy + + - name: download syzygy 3-4-5 if needed + working-directory: matetrack + if: steps.cache-syzygy.outputs.cache-hit != 'true' + run: | + wget --no-verbose -r -nH --cut-dirs=2 --no-parent --reject="index.html*" -e robots=off https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/ + wget --no-verbose -r -nH --cut-dirs=2 --no-parent --reject="index.html*" -e robots=off https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/ + + - name: 
Run matetrack + working-directory: matetrack + run: | + python matecheck.py --syzygyPath 3-4-5-wdl/:3-4-5-dtz/ --engine /home/runner/work/Stockfish/Stockfish/Stockfish/src/stockfish --epdFile mates2000.epd --nodes 100000 | tee matecheckout.out + ! grep "issues were detected" matecheckout.out > /dev/null diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index b75c06cf..946a81ce 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -40,6 +40,8 @@ jobs: shell: ${{ matrix.config.shell }} steps: - uses: actions/checkout@v4 + with: + persist-credentials: false - name: Download required linux packages run: | @@ -73,4 +75,4 @@ jobs: export CXXFLAGS="-O1 -fno-inline" make clean make -j4 ARCH=x86-64-sse41-popcnt ${{ matrix.sanitizers.make_option }} debug=yes optimize=no build > /dev/null - ../tests/instrumented.sh --${{ matrix.sanitizers.instrumented_option }} + python3 ../tests/instrumented.py --${{ matrix.sanitizers.instrumented_option }} ./stockfish diff --git a/.github/workflows/stockfish.yml b/.github/workflows/stockfish.yml index 13d57f9e..1f87e061 100644 --- a/.github/workflows/stockfish.yml +++ b/.github/workflows/stockfish.yml @@ -15,8 +15,12 @@ jobs: Prerelease: if: github.repository == 'official-stockfish/Stockfish' && (github.ref == 'refs/heads/master' || (startsWith(github.ref_name, 'sf_') && github.ref_type == 'tag')) runs-on: ubuntu-latest + permissions: + contents: write # For deleting/creating a prerelease steps: - uses: actions/checkout@v4 + with: + persist-credentials: false # returns null if no pre-release exists - name: Get Commit SHA of Latest Pre-release @@ -66,6 +70,8 @@ jobs: arm_matrix: ${{ steps.set-arm-matrix.outputs.arm_matrix }} steps: - uses: actions/checkout@v4 + with: + persist-credentials: false - id: set-matrix run: | TASKS=$(echo $(cat .github/ci/matrix.json) ) @@ -90,15 +96,27 @@ jobs: uses: ./.github/workflows/sanitizers.yml Tests: uses: ./.github/workflows/tests.yml + Matetrack: + uses: ./.github/workflows/matetrack.yml + Games: + uses: ./.github/workflows/games.yml Binaries: if: github.repository == 'official-stockfish/Stockfish' needs: [Matrix, Prerelease, Compilation] uses: ./.github/workflows/upload_binaries.yml with: matrix: ${{ needs.Matrix.outputs.matrix }} + permissions: + contents: write # For deleting/creating a (pre)release + secrets: + token: ${{ secrets.GITHUB_TOKEN }} ARM_Binaries: if: github.repository == 'official-stockfish/Stockfish' needs: [Matrix, Prerelease, ARMCompilation] uses: ./.github/workflows/upload_binaries.yml with: matrix: ${{ needs.Matrix.outputs.arm_matrix }} + permissions: + contents: write # For deleting/creating a (pre)release + secrets: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 328c9cf9..b97aaa29 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -106,6 +106,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + persist-credentials: false - name: Download required linux packages if: runner.os == 'Linux' @@ -138,16 +139,16 @@ jobs: - name: Build Docker container if: matrix.config.base_image run: | - docker buildx build --load -t sf_builder - << EOF + docker buildx build --platform ${{ matrix.config.platform }} --load -t sf_builder - << EOF FROM ${{ matrix.config.base_image }} WORKDIR /app RUN apk update && apk add make g++ - CMD ["sh", "script.sh"] + CMD ["sh", "src/script.sh"] EOF - name: Download required macOS packages if: runner.os == 'macOS' - run: brew 
install coreutils + run: brew install coreutils gcc@11 - name: Setup msys and install required packages if: runner.os == 'Windows' @@ -175,7 +176,7 @@ jobs: $COMPCXX -v else echo "$COMPCXX -v" > script.sh - docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}/src:/app sf_builder + docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}:/app sf_builder fi - name: Test help target @@ -341,8 +342,8 @@ jobs: - name: Test riscv64 build if: matrix.config.run_riscv64_tests run: | - echo "export LDFLAGS='-static' && make clean && make -j4 ARCH=riscv64 build" > script.sh - docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}/src:/app sf_builder + echo "cd src && export LDFLAGS='-static' && make clean && make -j4 ARCH=riscv64 build" > script.sh + docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}:/app sf_builder ../tests/signature.sh $benchref # ppc64 tests @@ -350,8 +351,8 @@ jobs: - name: Test ppc64 build if: matrix.config.run_ppc64_tests run: | - echo "export LDFLAGS='-static' && make clean && make -j4 ARCH=ppc-64 build" > script.sh - docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}/src:/app sf_builder + echo "cd src && export LDFLAGS='-static' && make clean && make -j4 ARCH=ppc-64 build" > script.sh + docker run --rm --platform ${{ matrix.config.platform }} -v ${{ github.workspace }}:/app sf_builder ../tests/signature.sh $benchref # Other tests diff --git a/.github/workflows/upload_binaries.yml b/.github/workflows/upload_binaries.yml index acf91a8f..1067f6e7 100644 --- a/.github/workflows/upload_binaries.yml +++ b/.github/workflows/upload_binaries.yml @@ -5,6 +5,9 @@ on: matrix: type: string required: true + secrets: + token: + required: true jobs: Artifacts: @@ -25,6 +28,8 @@ jobs: shell: ${{ matrix.config.shell }} steps: - uses: actions/checkout@v4 + with: + persist-credentials: false - name: Download artifact from compilation uses: actions/download-artifact@v4 @@ -54,6 +59,7 @@ jobs: mv "${{ matrix.config.simple_name }} ${{ matrix.binaries }}" stockfish-workflow cd stockfish-workflow cp -r src ../stockfish/ + cp -r scripts ../stockfish/ cp stockfish-$NAME-$BINARY$EXT ../stockfish/ cp "Top CPU Contributors.txt" ../stockfish/ cp Copying.txt ../stockfish/ @@ -78,6 +84,7 @@ jobs: uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981 with: files: stockfish-${{ matrix.config.simple_name }}-${{ matrix.binaries }}.${{ matrix.config.archive_ext }} + token: ${{ secrets.token }} - name: Get last commit sha id: last_commit @@ -104,3 +111,4 @@ jobs: tag_name: stockfish-dev-${{ env.COMMIT_DATE }}-${{ env.COMMIT_SHA }} prerelease: true files: stockfish-${{ matrix.config.simple_name }}-${{ matrix.binaries }}.${{ matrix.config.archive_ext }} + token: ${{ secrets.token }} diff --git a/.gitignore b/.gitignore index 8981efca..2fc80d48 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,8 @@ src/-lstdc++.res # Neural network for the NNUE evaluation **/*.nnue +# Files generated by the instrumented tests +tsan.supp +__pycache__/ +tests/syzygy +tests/bench_tmp.epd \ No newline at end of file diff --git a/AUTHORS b/AUTHORS index 36b2b6f7..ddc53ec0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -20,6 +20,7 @@ Alexander Kure Alexander Pagel (Lolligerhans) Alfredo Menezes (lonfom169) Ali AlZhrani (Cooffe) +Andreas Jan van der Meulen (Andyson007) Andreas Matthies (Matthies) Andrei Vetrov (proukornew) Andrew Grant (AndyGrant) @@ -44,6 +45,7 @@ Bruno de 
Melo Costa (BM123499) Bruno Pellanda (pellanda) Bryan Cross (crossbr) candirufish +Carlos Esparza Sánchez (ces42) Chess13234 Chris Cain (ceebo) Ciekce @@ -68,9 +70,11 @@ Douglas Matos Gomes (dsmsgms) Dubslow Eduardo Cáceres (eduherminio) Eelco de Groot (KingDefender) +Ehsan Rashid (erashid) Elvin Liu (solarlight2) erbsenzaehler Ernesto Gatti +evqsx Fabian Beuke (madnight) Fabian Fichter (ianfab) Fanael Linithien (Fanael) @@ -127,6 +131,7 @@ Kojirion Krystian Kuzniarek (kuzkry) Leonardo Ljubičić (ICCF World Champion) Leonid Pechenik (lp--) +Li Ying (yl25946) Liam Keegan (lkeegan) Linmiao Xu (linrock) Linus Arver (listx) @@ -139,6 +144,7 @@ Maciej Żenczykowski (zenczykowski) Malcolm Campbell (xoto10) Mark Tenzer (31m059) marotear +Mathias Parnaudeau (mparnaudeau) Matt Ginsberg (mattginsberg) Matthew Lai (matthewlai) Matthew Sullivan (Matt14916) @@ -167,10 +173,12 @@ Niklas Fiekas (niklasf) Nikolay Kostov (NikolayIT) Norman Schmidt (FireFather) notruck +Nour Berakdar (Nonlinear) Ofek Shochat (OfekShochat, ghostway) Ondrej Mosnáček (WOnder93) Ondřej Mišina (AndrovT) Oskar Werkelin Ahlin +Ömer Faruk Tutkun (OmerFarukTutkun) Pablo Vazquez Panthee Pascal Romaret @@ -232,6 +240,7 @@ Unai Corzo (unaiic) Uri Blass (uriblass) Vince Negri (cuddlestmonkey) Viren +Wencey Wang windfishballad xefoci7612 Xiang Wang (KatyushaScarlet) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cf9cecda..caffc916 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ discussion._ Changes to Stockfish C++ code should respect our coding style defined by [.clang-format](.clang-format). You can format your changes by running -`make format`. This requires clang-format version 17 to be installed on your system. +`make format`. This requires clang-format version 18 to be installed on your system. ## Navigate diff --git a/README.md b/README.md index 1530264b..0a5e8eb4 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ descriptions. An example suitable for most Intel and AMD chips: ``` cd src -make -j profile-build ARCH=x86-64-avx2 +make -j profile-build ``` Detailed compilation instructions for all platforms can be found in our @@ -122,6 +122,11 @@ where the source code can be found) to generate the exact binary you are distributing. If you make any changes to the source code, these changes must also be made available under GPL v3. +## Acknowledgements + +Stockfish uses neural networks trained on [data provided by the Leela Chess Zero +project][lc0-data-link], which is made available under the [Open Database License][odbl-link] (ODbL). + [authors-link]: https://github.com/official-stockfish/Stockfish/blob/master/AUTHORS [build-link]: https://github.com/official-stockfish/Stockfish/actions/workflows/stockfish.yml @@ -146,6 +151,8 @@ also be made available under GPL v3. 
[wiki-uci-link]: https://github.com/official-stockfish/Stockfish/wiki/UCI-&-Commands [wiki-usage-link]: https://github.com/official-stockfish/Stockfish/wiki/Download-and-usage [worker-link]: https://github.com/official-stockfish/fishtest/wiki/Running-the-worker +[lc0-data-link]: https://storage.lczero.org/files/training_data +[odbl-link]: https://opendatacommons.org/licenses/odbl/odbl-10.txt [build-badge]: https://img.shields.io/github/actions/workflow/status/official-stockfish/Stockfish/stockfish.yml?branch=master&style=for-the-badge&label=stockfish&logo=github [commits-badge]: https://img.shields.io/github/commits-since/official-stockfish/Stockfish/latest?style=for-the-badge diff --git a/Top CPU Contributors.txt b/Top CPU Contributors.txt index 11636e84..3d8c5236 100644 --- a/Top CPU Contributors.txt +++ b/Top CPU Contributors.txt @@ -1,106 +1,109 @@ -Contributors to Fishtest with >10,000 CPU hours, as of 2024-02-24. +Contributors to Fishtest with >10,000 CPU hours, as of 2024-08-31. Thank you! Username CPU Hours Games played ------------------------------------------------------------------ -noobpwnftw 39302472 3055513453 -technologov 20845762 994893444 -linrock 8616428 560281417 +noobpwnftw 40428649 3164740143 +technologov 23581394 1076895482 +vdv 19425375 718302718 +linrock 10034115 643194527 mlang 3026000 200065824 -okrout 2332151 222639518 -pemo 1800019 60274069 +okrout 2572676 237511408 +pemo 1836785 62226157 dew 1689162 100033738 -TueRens 1474943 75121774 -grandphish2 1463002 91616949 -JojoM 1109702 72927902 -olafm 978631 71037944 -sebastronomy 939955 44920556 +TueRens 1648780 77891164 +sebastronomy 1468328 60859092 +grandphish2 1466110 91776075 +JojoM 1130625 73666098 +olafm 1067009 74807270 tvijlbrief 796125 51897690 -gvreuls 711320 49142318 +oz 781847 53910686 +rpngn 768460 49812975 +gvreuls 751085 52177668 mibere 703840 46867607 -oz 646268 46293638 -rpngn 572571 38928563 -leszek 531858 39316505 -cw 518116 34894291 +leszek 566598 42024615 +cw 519601 34988161 fastgm 503862 30260818 CSU_Dynasty 468784 31385034 -ctoks 434591 28520597 -maximmasiutin 429983 27066286 +maximmasiutin 439192 27893522 +ctoks 435148 28541909 crunchy 427414 27371625 bcross 415724 29061187 +robal 371112 24642270 +mgrabiak 367963 26464704 velislav 342588 22140902 -mgrabiak 338763 23999170 +ncfish1 329039 20624527 Fisherman 327231 21829379 -robal 299836 20213182 Dantist 296386 18031762 -ncfish1 267604 17881149 +tolkki963 262050 22049676 +Sylvain27 255595 8864404 nordlandia 249322 16420192 +Fifis 237657 13065577 marrco 234581 17714473 -tolkki963 233490 19773930 +Calis007 217537 14450582 glinscott 208125 13277240 drabel 204167 13930674 mhoram 202894 12601997 bking_US 198894 11876016 -Calis007 188631 12795784 Thanar 179852 12365359 -Fifis 176209 10638245 -vdv 175544 9904472 +javran 169679 13481966 +armo9494 162863 10937118 spams 157128 10319326 -DesolatedDodo 156659 10210328 -armo9494 155355 10566898 +DesolatedDodo 156683 10211206 +Wencey 152308 8375444 sqrt2 147963 9724586 +vdbergh 140311 9225125 jcAEie 140086 10603658 -vdbergh 139746 9172061 CoffeeOne 137100 5024116 malala 136182 8002293 xoto 133759 9159372 +Dubslow 129614 8519312 davar 129023 8376525 DMBK 122960 8980062 dsmith 122059 7570238 -javran 121564 10144656 +CypressChess 120784 8672620 +sschnee 120526 7547722 +maposora 119734 10749710 amicic 119661 7938029 -sschnee 118107 7389266 -Wolfgang 114616 8070494 +Wolfgang 115713 8159062 Data 113305 8220352 BrunoBanani 112960 7436849 -Wencey 111502 5991676 -cuistot 108503 7006992 -CypressChess 108331 7759788 
+markkulix 112897 9133168 +cuistot 109802 7121030 skiminki 107583 7218170 +sterni1971 104431 5938282 MaZePallas 102823 6633619 -sterni1971 100532 5880772 sunu 100167 7040199 zeryl 99331 6221261 thirdlife 99156 2245320 ElbertoOne 99028 7023771 -Dubslow 98600 6903242 -markkulix 97010 7643900 -bigpen0r 94809 6529203 +megaman7de 98456 6675076 +Goatminola 96765 8257832 +bigpen0r 94825 6529241 brabos 92118 6186135 Maxim 90818 3283364 psk 89957 5984901 -megaman7de 88822 6052132 racerschmacer 85805 6122790 -maposora 85710 7778146 Vizvezdenec 83761 5344740 0x3C33 82614 5271253 +szupaw 82495 7151686 BRAVONE 81239 5054681 nssy 76497 5259388 +cody 76126 4492126 jromang 76106 5236025 +MarcusTullius 76103 5061991 +woutboat 76072 6022922 +Spprtr 75977 5252287 teddybaer 75125 5407666 Pking_cda 73776 5293873 -yurikvelo 73516 5036928 -MarcusTullius 71053 4803477 +yurikvelo 73611 5046822 +Mineta 71130 4711422 Bobo1239 70579 4794999 solarlight 70517 5028306 dv8silencer 70287 3883992 -Spprtr 69646 4806763 -Mineta 66325 4537742 manap 66273 4121774 -szupaw 65468 5669742 tinker 64333 4268790 qurashee 61208 3429862 -woutboat 59496 4906352 AGI 58195 4329580 robnjr 57262 4053117 Freja 56938 3733019 @@ -108,39 +111,45 @@ MaxKlaxxMiner 56879 3423958 ttruscott 56010 3680085 rkl 55132 4164467 jmdana 54697 4012593 +notchris 53936 4184018 renouve 53811 3501516 -notchris 52433 4044590 finfish 51360 3370515 eva42 51272 3599691 eastorwest 51117 3454811 -Goatminola 51004 4432492 rap 49985 3219146 pb00067 49733 3298934 GPUex 48686 3684998 OuaisBla 48626 3445134 ronaldjerum 47654 3240695 biffhero 46564 3111352 -oryx 45533 3539290 +oryx 45639 3546530 VoyagerOne 45476 3452465 speedycpu 43842 3003273 jbwiebe 43305 2805433 Antihistamine 41788 2761312 mhunt 41735 2691355 +jibarbosa 41640 4145702 homyur 39893 2850481 gri 39871 2515779 +DeepnessFulled 39020 3323102 Garf 37741 2999686 SC 37299 2731694 -Sylvain27 36520 1467082 +Gaster319 37118 3279678 +naclosagc 36562 1279618 csnodgrass 36207 2688994 -Gaster319 35655 3149442 strelock 34716 2074055 +gopeto 33717 2245606 EthanOConnor 33370 2090311 slakovv 32915 2021889 -gopeto 31884 2076712 +jojo2357 32890 2826662 +shawnxu 32019 2802552 Gelma 31771 1551204 +vidar808 31560 1351810 kdave 31157 2198362 manapbk 30987 1810399 -ZacHFX 30551 2238078 +ZacHFX 30966 2272416 +TataneSan 30713 1513402 +votoanthuan 30691 2460856 Prcuvu 30377 2170122 anst 30301 2190091 jkiiski 30136 1904470 @@ -149,14 +158,15 @@ hyperbolic.tom 29840 2017394 chuckstablers 29659 2093438 Pyafue 29650 1902349 belzedar94 28846 1811530 -votoanthuan 27978 2285818 -shawnxu 27438 2465810 +mecevdimitar 27610 1721382 chriswk 26902 1868317 xwziegtm 26897 2124586 achambord 26582 1767323 +somethingintheshadows 26496 2186404 Patrick_G 26276 1801617 yorkman 26193 1992080 -Ulysses 25397 1701264 +srowen 25743 1490684 +Ulysses 25413 1702830 Jopo12321 25227 1652482 SFTUser 25182 1675689 nabildanial 25068 1531665 @@ -164,66 +174,69 @@ Sharaf_DG 24765 1786697 rodneyc 24376 1416402 jsys14 24297 1721230 agg177 23890 1395014 -srowen 23842 1342508 +AndreasKrug 23754 1890115 Ente 23752 1678188 -jojo2357 23479 2061238 JanErik 23408 1703875 Isidor 23388 1680691 Norabor 23371 1603244 +WoodMan777 23253 2023048 +Nullvalue 23155 2022752 cisco2015 22920 1763301 Zirie 22542 1472937 -Nullvalue 22490 1970374 -AndreasKrug 22485 1769491 team-oh 22272 1636708 Roady 22220 1465606 MazeOfGalious 21978 1629593 -sg4032 21947 1643353 +sg4032 21950 1643373 +tsim67 21747 1330880 ianh2105 21725 1632562 +Skiff84 21711 1014212 xor12 21628 1680365 dex 21612 1467203 
nesoneg 21494 1463031 user213718 21454 1404128 +Serpensin 21452 1790510 sphinx 21211 1384728 -qoo_charly_cai 21135 1514907 +qoo_charly_cai 21136 1514927 +IslandLambda 21062 1220838 jjoshua2 21001 1423089 Zake9298 20938 1565848 horst.prack 20878 1465656 +fishtester 20729 1348888 0xB00B1ES 20590 1208666 -Serpensin 20487 1729674 -Dinde 20440 1292390 +ols 20477 1195945 +Dinde 20459 1292774 j3corre 20405 941444 Adrian.Schmidt123 20316 1281436 wei 19973 1745989 -fishtester 19617 1257388 +teenychess 19819 1762006 rstoesser 19569 1293588 eudhan 19274 1283717 vulcan 18871 1729392 +wizardassassin 18795 1376884 Karpovbot 18766 1053178 -WoodMan777 18556 1628264 jundery 18445 1115855 +mkstockfishtester 18350 1690676 ville 17883 1384026 chris 17698 1487385 purplefishies 17595 1092533 dju 17414 981289 -ols 17291 1042003 iisiraider 17275 1049015 -Skiff84 17111 950248 DragonLord 17014 1162790 +Karby 17008 1013160 +pirt 16965 1271519 redstone59 16842 1461780 -Karby 16839 1010124 Alb11747 16787 1213990 -pirt 16493 1237199 Naven94 16414 951718 -wizardassassin 16392 1148672 +scuzzi 16115 994341 IgorLeMasson 16064 1147232 -scuzzi 15757 968735 ako027ako 15671 1173203 +infinigon 15285 965966 Nikolay.IT 15154 1068349 Andrew Grant 15114 895539 OssumOpossum 14857 1007129 LunaticBFF57 14525 1190310 enedene 14476 905279 -IslandLambda 14393 958196 +Hjax 14394 1005013 bpfliegel 14233 882523 YELNAMRON 14230 1128094 mpx86 14019 759568 @@ -233,54 +246,56 @@ Nesa92 13806 1116101 crocogoat 13803 1117422 joster 13710 946160 mbeier 13650 1044928 -Hjax 13535 915487 +Pablohn26 13552 1088532 +wxt9861 13550 1312306 Dark_wizzie 13422 1007152 Rudolphous 13244 883140 Machariel 13010 863104 -infinigon 12991 943216 +nalanzeyu 12996 232590 mabichito 12903 749391 +Jackfish 12895 868928 thijsk 12886 722107 AdrianSA 12860 804972 Flopzee 12698 894821 +whelanh 12682 266404 mschmidt 12644 863193 korposzczur 12606 838168 -tsim67 12570 890180 -Jackfish 12553 836958 fatmurphy 12547 853210 -Oakwen 12503 853105 +Oakwen 12532 855759 +icewulf 12447 854878 SapphireBrand 12416 969604 deflectooor 12386 579392 modolief 12386 896470 -TataneSan 12358 609332 Farseer 12249 694108 +Hongildong 12201 648712 pgontarz 12151 848794 dbernier 12103 860824 -FormazChar 11989 907809 +szczur90 12035 942376 +FormazChar 12019 910409 +rensonthemove 11999 971993 stocky 11954 699440 -somethingintheshadows 11940 989472 -MooTheCow 11892 776126 +MooTheCow 11923 779432 3cho 11842 1036786 -whelanh 11557 245188 +ckaz 11792 732276 infinity 11470 727027 aga 11412 695127 torbjo 11395 729145 Thomas A. 
Anderson 11372 732094 savage84 11358 670860 +Def9Infinity 11345 696552 d64 11263 789184 ali-al-zhrani 11245 779246 -ckaz 11170 680866 +ImperiumAeternum 11155 952000 snicolet 11106 869170 dapper 11032 771402 Ethnikoi 10993 945906 Snuuka 10938 435504 -Karmatron 10859 678058 +Karmatron 10871 678306 basepi 10637 744851 -jibarbosa 10628 857100 Cubox 10621 826448 -mecevdimitar 10609 787318 +gerbil 10519 971688 michaelrpg 10509 739239 -Def9Infinity 10427 686978 OIVAS7572 10420 995586 -wxt9861 10412 1013864 Garruk 10365 706465 dzjp 10343 732529 +RickGroszkiewicz 10263 990798 diff --git a/scripts/get_native_properties.sh b/scripts/get_native_properties.sh index fb124021..ed5fc9af 100755 --- a/scripts/get_native_properties.sh +++ b/scripts/get_native_properties.sh @@ -26,6 +26,17 @@ check_znver_1_2() { [ "$vendor_id" = "AuthenticAMD" ] && [ "$cpu_family" = "23" ] && znver_1_2=true } +# Set the file CPU loongarch64 architecture +set_arch_loongarch64() { + if check_flags 'lasx'; then + true_arch='loongarch64-lasx' + elif check_flags 'lsx'; then + true_arch='lonngarch64-lsx' + else + true_arch='loongarch64' + fi +} + # Set the file CPU x86_64 architecture set_arch_x86_64() { if check_flags 'avx512vnni' 'avx512dq' 'avx512f' 'avx512bw' 'avx512vl'; then @@ -43,6 +54,20 @@ set_arch_x86_64() { fi } +set_arch_ppc_64() { + if $(grep -q -w "altivec" /proc/cpuinfo); then + power=$(grep -oP -m 1 'cpu\t+: POWER\K\d+' /proc/cpuinfo) + if [ "0$power" -gt 7 ]; then + # VSX started with POWER8 + true_arch='ppc-64-vsx' + else + true_arch='ppc-64-altivec' + fi + else + true_arch='ppc-64' + fi +} + # Check the system type uname_s=$(uname -s) uname_m=$(uname -m) @@ -76,6 +101,10 @@ case $uname_s in file_os='ubuntu' true_arch='x86-32' ;; + 'ppc64'*) + file_os='ubuntu' + set_arch_ppc_64 + ;; 'aarch64') file_os='android' true_arch='armv8' @@ -90,6 +119,10 @@ case $uname_s in true_arch="$true_arch-neon" fi ;; + 'loongarch64'*) + file_os='linux' + set_arch_loongarch64 + ;; *) # Unsupported machine type, exit with error printf 'Unsupported machine type: %s\n' "$uname_m" exit 1 diff --git a/scripts/net.sh b/scripts/net.sh new file mode 100755 index 00000000..0bc57a19 --- /dev/null +++ b/scripts/net.sh @@ -0,0 +1,75 @@ +#!/bin/sh + +wget_or_curl=$( (command -v wget > /dev/null 2>&1 && echo "wget -qO-") || \ + (command -v curl > /dev/null 2>&1 && echo "curl -skL")) + +if [ -z "$wget_or_curl" ]; then + >&2 printf "%s\n" "Neither wget or curl is installed." \ + "Install one of these tools to download NNUE files automatically." + exit 1 +fi + +sha256sum=$( (command -v shasum > /dev/null 2>&1 && echo "shasum -a 256") || \ + (command -v sha256sum > /dev/null 2>&1 && echo "sha256sum")) + +if [ -z "$sha256sum" ]; then + >&2 echo "sha256sum not found, NNUE files will be assumed valid." +fi + +get_nnue_filename() { + grep "$1" evaluate.h | grep "#define" | sed "s/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/" +} + +validate_network() { + # If no sha256sum command is available, assume the file is always valid. 
+ if [ -n "$sha256sum" ] && [ -f "$1" ]; then + if [ "$1" != "nn-$($sha256sum "$1" | cut -c 1-12).nnue" ]; then + rm -f "$1" + return 1 + fi + fi +} + +fetch_network() { + _filename="$(get_nnue_filename "$1")" + + if [ -z "$_filename" ]; then + >&2 echo "NNUE file name not found for: $1" + return 1 + fi + + if [ -f "$_filename" ]; then + if validate_network "$_filename"; then + echo "Existing $_filename validated, skipping download" + return + else + echo "Removing invalid NNUE file: $_filename" + fi + fi + + for url in \ + "https://tests.stockfishchess.org/api/nn/$_filename" \ + "https://github.com/official-stockfish/networks/raw/master/$_filename"; do + echo "Downloading from $url ..." + if $wget_or_curl "$url" > "$_filename"; then + if validate_network "$_filename"; then + echo "Successfully validated $_filename" + else + echo "Downloaded $_filename is invalid" + continue + fi + else + echo "Failed to download from $url" + fi + if [ -f "$_filename" ]; then + return + fi + done + + # Download was not successful in the loop, return false. + >&2 echo "Failed to download $_filename" + return 1 +} + +fetch_network EvalFileDefaultNameBig && \ +fetch_network EvalFileDefaultNameSmall diff --git a/src/Makefile b/src/Makefile index 45f38b01..e7f8ce55 100644 --- a/src/Makefile +++ b/src/Makefile @@ -55,15 +55,15 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \ misc.cpp movegen.cpp movepick.cpp position.cpp \ search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ - nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp + nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp -HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ +HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h history.h \ nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \ nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h nnue/layers/simd.h \ nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h + tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h OBJS = $(notdir $(SRCS:.cpp=.o)) @@ -98,8 +98,12 @@ VPATH = syzygy:nnue:nnue/features # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 # vnni256 = yes/no --- -mavx256vnni --- Use Intel Vector Neural Network Instructions 512 with 256bit operands # vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 +# altivec = yes/no --- -maltivec --- Use PowerPC Altivec SIMD extension +# vsx = yes/no --- -mvsx --- Use POWER VSX SIMD extension # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions +# lsx = yes/no --- -mlsx --- Use Loongson SIMD eXtension +# lasx = yes/no --- -mlasx --- use Loongson Advanced SIMD eXtension # # Note that Makefile is space sensitive, so when adding new architectures # or modifying existing flags, you have to make sure there are no extra spaces @@ -124,8 +128,9 @@ endif ifeq ($(ARCH), $(filter $(ARCH), \ x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \ 
x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ - x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \ - armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 loongarch64)) + x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-64-altivec ppc-64-vsx ppc-32 e2k \ + armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 \ + loongarch64 loongarch64-lsx loongarch64-lasx)) SUPPORTED_ARCH=true else SUPPORTED_ARCH=false @@ -148,13 +153,17 @@ avxvnni = no avx512 = no vnni256 = no vnni512 = no +altivec = no +vsx = no neon = no dotprod = no arm_version = 0 +lsx = no +lasx = no STRIP = strip -ifneq ($(shell which clang-format-17 2> /dev/null),) - CLANG-FORMAT = clang-format-17 +ifneq ($(shell which clang-format-18 2> /dev/null),) + CLANG-FORMAT = clang-format-18 else CLANG-FORMAT = clang-format endif @@ -355,6 +364,20 @@ ifeq ($(ARCH),ppc-64) prefetch = yes endif +ifeq ($(ARCH),ppc-64-altivec) + arch = ppc64 + popcnt = yes + prefetch = yes + altivec = yes +endif + +ifeq ($(ARCH),ppc-64-vsx) + arch = ppc64 + popcnt = yes + prefetch = yes + vsx = yes +endif + ifeq ($(findstring e2k,$(ARCH)),e2k) arch = e2k mmx = yes @@ -370,8 +393,19 @@ ifeq ($(ARCH),riscv64) arch = riscv64 endif -ifeq ($(ARCH),loongarch64) +ifeq ($(findstring loongarch64,$(ARCH)),loongarch64) arch = loongarch64 + prefetch = yes + +ifeq ($(findstring -lasx,$(ARCH)),-lasx) + lsx = yes + lasx = yes +endif + +ifeq ($(findstring -lsx,$(ARCH)),-lsx) + lsx = yes +endif + endif endif @@ -408,7 +442,7 @@ ifeq ($(COMP),gcc) ifeq ($(ARCH),riscv64) CXXFLAGS += -latomic endif - else ifeq ($(ARCH),loongarch64) + else ifeq ($(arch),loongarch64) CXXFLAGS += -latomic else CXXFLAGS += -m$(bits) @@ -480,7 +514,7 @@ ifeq ($(COMP),clang) ifeq ($(ARCH),riscv64) CXXFLAGS += -latomic endif - else ifeq ($(ARCH),loongarch64) + else ifeq ($(arch),loongarch64) CXXFLAGS += -latomic else CXXFLAGS += -m$(bits) @@ -489,8 +523,8 @@ ifeq ($(COMP),clang) endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -mmacosx-version-min=10.14 - LDFLAGS += -mmacosx-version-min=10.14 + CXXFLAGS += -mmacosx-version-min=10.15 + LDFLAGS += -mmacosx-version-min=10.15 ifneq ($(arch),any) CXXFLAGS += -arch $(arch) LDFLAGS += -arch $(arch) @@ -634,7 +668,7 @@ else endif ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64)) + ifeq ($(arch),$(filter $(arch),ppc64 ppc64-altivec ppc64-vsx armv7 armv8 arm64)) CXXFLAGS += -DUSE_POPCNT else CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT @@ -704,6 +738,20 @@ ifeq ($(mmx),yes) endif endif +ifeq ($(altivec),yes) + CXXFLAGS += -maltivec + ifeq ($(COMP),gcc) + CXXFLAGS += -mabi=altivec + endif +endif + +ifeq ($(vsx),yes) + CXXFLAGS += -mvsx + ifeq ($(COMP),gcc) + CXXFLAGS += -DNO_WARN_X86_INTRINSICS -DUSE_SSE2 + endif +endif + ifeq ($(neon),yes) CXXFLAGS += -DUSE_NEON=$(arm_version) ifeq ($(KERNEL),Linux) @@ -719,6 +767,18 @@ ifeq ($(dotprod),yes) CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD endif +ifeq ($(lasx),yes) + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mlasx + endif +endif + +ifeq ($(lsx),yes) + ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) + CXXFLAGS += -mlsx + endif +endif + ### 3.7 pext ifeq ($(pext),yes) CXXFLAGS += -DUSE_PEXT @@ -791,71 +851,75 @@ endif ### ========================================================================== help: - @echo "" - @echo "To compile stockfish, type: " - @echo "" - @echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" - @echo "" - @echo 
"Supported targets:" - @echo "" - @echo "help > Display architecture details" - @echo "profile-build > standard build with profile-guided optimization" - @echo "build > skip profile-guided optimization" - @echo "net > Download the default nnue nets" - @echo "strip > Strip executable" - @echo "install > Install executable" - @echo "clean > Clean up" - @echo "" - @echo "Supported archs:" - @echo "" - @echo "native > select the best architecture for the host processor (default)" - @echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support" - @echo "x86-64-vnni256 > x86 64-bit with vnni 512bit support, limit operands to 256bit wide" - @echo "x86-64-avx512 > x86 64-bit with avx512 support" - @echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support" - @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" - @echo "x86-64-avx2 > x86 64-bit with avx2 support" - @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" - @echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt" - @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" - @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support" - @echo "x86-64 > x86 64-bit generic (with sse2 support)" - @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" - @echo "x86-32-sse2 > x86 32-bit with sse2 support" - @echo "x86-32 > x86 32-bit generic (with mmx compile support)" - @echo "ppc-64 > PPC 64-bit" - @echo "ppc-32 > PPC 32-bit" - @echo "armv7 > ARMv7 32-bit" - @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" - @echo "armv8 > ARMv8 64-bit with popcnt and neon" - @echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support" - @echo "e2k > Elbrus 2000" - @echo "apple-silicon > Apple silicon ARM64" - @echo "general-64 > unspecified 64-bit" - @echo "general-32 > unspecified 32-bit" - @echo "riscv64 > RISC-V 64-bit" - @echo "loongarch64 > LoongArch 64-bit" - @echo "" - @echo "Supported compilers:" - @echo "" - @echo "gcc > GNU compiler (default)" - @echo "mingw > GNU compiler with MinGW under Windows" - @echo "clang > LLVM Clang compiler" - @echo "icx > Intel oneAPI DPC++/C++ Compiler" - @echo "ndk > Google NDK to cross-compile for Android" - @echo "" - @echo "Simple examples. 
If you don't know what to do, you likely want to run one of: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems " - @echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems " - @echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems " - @echo "" - @echo "Advanced examples, for experienced users: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avxvnni" - @echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" - @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" - @echo "" + @echo "" && \ + echo "To compile stockfish, type: " && \ + echo "" && \ + echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" && \ + echo "" && \ + echo "Supported targets:" && \ + echo "" && \ + echo "help > Display architecture details" && \ + echo "profile-build > standard build with profile-guided optimization" && \ + echo "build > skip profile-guided optimization" && \ + echo "net > Download the default nnue nets" && \ + echo "strip > Strip executable" && \ + echo "install > Install executable" && \ + echo "clean > Clean up" && \ + echo "" && \ + echo "Supported archs:" && \ + echo "" && \ + echo "native > select the best architecture for the host processor (default)" && \ + echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support" && \ + echo "x86-64-vnni256 > x86 64-bit with vnni 512bit support, limit operands to 256bit wide" && \ + echo "x86-64-avx512 > x86 64-bit with avx512 support" && \ + echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support" && \ + echo "x86-64-bmi2 > x86 64-bit with bmi2 support" && \ + echo "x86-64-avx2 > x86 64-bit with avx2 support" && \ + echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" && \ + echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt" && \ + echo "x86-64-ssse3 > x86 64-bit with ssse3 support" && \ + echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support" && \ + echo "x86-64 > x86 64-bit generic (with sse2 support)" && \ + echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" && \ + echo "x86-32-sse2 > x86 32-bit with sse2 support" && \ + echo "x86-32 > x86 32-bit generic (with mmx compile support)" && \ + echo "ppc-64 > PPC 64-bit" && \ + echo "ppc-64-altivec > PPC 64-bit with altivec support" && \ + echo "ppc-64-vsx > PPC 64-bit with vsx support" && \ + echo "ppc-32 > PPC 32-bit" && \ + echo "armv7 > ARMv7 32-bit" && \ + echo "armv7-neon > ARMv7 32-bit with popcnt and neon" && \ + echo "armv8 > ARMv8 64-bit with popcnt and neon" && \ + echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support" && \ + echo "e2k > Elbrus 2000" && \ + echo "apple-silicon > Apple silicon ARM64" && \ + echo "general-64 > unspecified 64-bit" && \ + echo "general-32 > unspecified 32-bit" && \ + echo "riscv64 > RISC-V 64-bit" && \ + echo "loongarch64 > LoongArch 64-bit" && \ + echo "loongarch64-lsx > LoongArch 64-bit with SIMD eXtension" && \ + echo "loongarch64-lasx > LoongArch 64-bit with Advanced SIMD eXtension" && \ + echo "" && \ + echo "Supported compilers:" && \ + echo "" && \ + echo "gcc > GNU compiler (default)" && \ + echo "mingw > GNU compiler with MinGW under Windows" && \ + echo "clang > LLVM Clang compiler" && \ + echo "icx > Intel oneAPI DPC++/C++ Compiler" && \ + echo "ndk > Google NDK to cross-compile for Android" && \ + echo "" && \ + echo "Simple examples. 
If you don't know what to do, you likely want to run one of: " && \ + echo "" && \ + echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems " && \ + echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems " && \ + echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems " && \ + echo "" && \ + echo "Advanced examples, for experienced users: " && \ + echo "" && \ + echo "make -j profile-build ARCH=x86-64-avxvnni" && \ + echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" && \ + echo "make -j build ARCH=x86-64-ssse3 COMP=clang" && \ + echo "" ifneq ($(SUPPORTED_ARCH), true) @echo "Specify a supported architecture with the ARCH option for more details" @echo "" @@ -917,59 +981,9 @@ profileclean: @rm -f stockfish.res @rm -f ./-lstdc++.res -define fetch_network - @echo "Default net: $(nnuenet)" - @if [ "x$(curl_or_wget)" = "x" ]; then \ - echo "Neither curl nor wget is installed. Install one of these tools unless the net has been downloaded manually"; \ - fi - @if [ "x$(shasum_command)" = "x" ]; then \ - echo "shasum / sha256sum not found, skipping net validation"; \ - elif test -f "$(nnuenet)"; then \ - if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ - echo "Removing invalid network"; rm -f $(nnuenet); \ - fi; \ - fi; - @for nnuedownloadurl in "$(nnuedownloadurl1)" "$(nnuedownloadurl2)"; do \ - if test -f "$(nnuenet)"; then \ - echo "$(nnuenet) available : OK"; break; \ - else \ - if [ "x$(curl_or_wget)" != "x" ]; then \ - echo "Downloading $${nnuedownloadurl}"; $(curl_or_wget) $${nnuedownloadurl} > $(nnuenet);\ - else \ - echo "No net found and download not possible"; exit 1;\ - fi; \ - fi; \ - if [ "x$(shasum_command)" != "x" ]; then \ - if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ - echo "Removing failed download"; rm -f $(nnuenet); \ - fi; \ - fi; \ - done - @if ! 
test -f "$(nnuenet)"; then \ - echo "Failed to download $(nnuenet)."; \ - fi; - @if [ "x$(shasum_command)" != "x" ]; then \ - if [ "$(nnuenet)" = "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \ - echo "Network validated"; break; \ - fi; \ - fi; -endef - -# set up shell variables for the net stuff -define netvariables -$(eval nnuenet := $(shell grep $(1) evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/')) -$(eval nnuedownloadurl1 := https://tests.stockfishchess.org/api/nn/$(nnuenet)) -$(eval nnuedownloadurl2 := https://github.com/official-stockfish/networks/raw/master/$(nnuenet)) -$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi)) -$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi)) -endef - # evaluation network (nnue) net: - $(call netvariables, EvalFileDefaultNameBig) - $(call fetch_network) - $(call netvariables, EvalFileDefaultNameSmall) - $(call fetch_network) + @$(SHELL) ../scripts/net.sh format: $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file @@ -986,61 +1000,71 @@ all: $(EXE) .depend config-sanity: net @echo "" - @echo "Config:" - @echo "debug: '$(debug)'" - @echo "sanitize: '$(sanitize)'" - @echo "optimize: '$(optimize)'" - @echo "arch: '$(arch)'" - @echo "bits: '$(bits)'" - @echo "kernel: '$(KERNEL)'" - @echo "os: '$(OS)'" - @echo "prefetch: '$(prefetch)'" - @echo "popcnt: '$(popcnt)'" - @echo "pext: '$(pext)'" - @echo "sse: '$(sse)'" - @echo "mmx: '$(mmx)'" - @echo "sse2: '$(sse2)'" - @echo "ssse3: '$(ssse3)'" - @echo "sse41: '$(sse41)'" - @echo "avx2: '$(avx2)'" - @echo "avxvnni: '$(avxvnni)'" - @echo "avx512: '$(avx512)'" - @echo "vnni256: '$(vnni256)'" - @echo "vnni512: '$(vnni512)'" - @echo "neon: '$(neon)'" - @echo "dotprod: '$(dotprod)'" - @echo "arm_version: '$(arm_version)'" - @echo "target_windows: '$(target_windows)'" - @echo "" - @echo "Flags:" - @echo "CXX: $(CXX)" - @echo "CXXFLAGS: $(CXXFLAGS)" - @echo "LDFLAGS: $(LDFLAGS)" - @echo "" - @echo "Testing config sanity. If this fails, try 'make help' ..." 
- @echo "" - @test "$(debug)" = "yes" || test "$(debug)" = "no" - @test "$(optimize)" = "yes" || test "$(optimize)" = "no" - @test "$(SUPPORTED_ARCH)" = "true" - @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ + @echo "Config:" && \ + echo "debug: '$(debug)'" && \ + echo "sanitize: '$(sanitize)'" && \ + echo "optimize: '$(optimize)'" && \ + echo "arch: '$(arch)'" && \ + echo "bits: '$(bits)'" && \ + echo "kernel: '$(KERNEL)'" && \ + echo "os: '$(OS)'" && \ + echo "prefetch: '$(prefetch)'" && \ + echo "popcnt: '$(popcnt)'" && \ + echo "pext: '$(pext)'" && \ + echo "sse: '$(sse)'" && \ + echo "mmx: '$(mmx)'" && \ + echo "sse2: '$(sse2)'" && \ + echo "ssse3: '$(ssse3)'" && \ + echo "sse41: '$(sse41)'" && \ + echo "avx2: '$(avx2)'" && \ + echo "avxvnni: '$(avxvnni)'" && \ + echo "avx512: '$(avx512)'" && \ + echo "vnni256: '$(vnni256)'" && \ + echo "vnni512: '$(vnni512)'" && \ + echo "altivec: '$(altivec)'" && \ + echo "vsx: '$(vsx)'" && \ + echo "neon: '$(neon)'" && \ + echo "dotprod: '$(dotprod)'" && \ + echo "arm_version: '$(arm_version)'" && \ + echo "lsx: '$(lsx)'" && \ + echo "lasx: '$(lasx)'" && \ + echo "target_windows: '$(target_windows)'" && \ + echo "" && \ + echo "Flags:" && \ + echo "CXX: $(CXX)" && \ + echo "CXXFLAGS: $(CXXFLAGS)" && \ + echo "LDFLAGS: $(LDFLAGS)" && \ + echo "" && \ + echo "Testing config sanity. If this fails, try 'make help' ..." && \ + echo "" && \ + (test "$(debug)" = "yes" || test "$(debug)" = "no") && \ + (test "$(optimize)" = "yes" || test "$(optimize)" = "no") && \ + (test "$(SUPPORTED_ARCH)" = "true") && \ + (test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64" - @test "$(bits)" = "32" || test "$(bits)" = "64" - @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" - @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" - @test "$(pext)" = "yes" || test "$(pext)" = "no" - @test "$(sse)" = "yes" || test "$(sse)" = "no" - @test "$(mmx)" = "yes" || test "$(mmx)" = "no" - @test "$(sse2)" = "yes" || test "$(sse2)" = "no" - @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" - @test "$(sse41)" = "yes" || test "$(sse41)" = "no" - @test "$(avx2)" = "yes" || test "$(avx2)" = "no" - @test "$(avx512)" = "yes" || test "$(avx512)" = "no" - @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" - @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" - @test "$(neon)" = "yes" || test "$(neon)" = "no" - @test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ - || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" + test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || \ + test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64") && \ + (test "$(bits)" = "32" || test "$(bits)" = "64") && \ + (test "$(prefetch)" = "yes" || test "$(prefetch)" = "no") && \ + (test "$(popcnt)" = "yes" || test "$(popcnt)" = "no") && \ + (test "$(pext)" = "yes" || test "$(pext)" = "no") && \ + (test "$(sse)" = "yes" || test "$(sse)" = "no") && \ + (test "$(mmx)" = "yes" || test "$(mmx)" = "no") && \ + (test "$(sse2)" = "yes" || test "$(sse2)" = "no") && \ + (test "$(ssse3)" = "yes" || test "$(ssse3)" = "no") && \ + (test "$(sse41)" = "yes" || test "$(sse41)" = 
"no") && \ + (test "$(avx2)" = "yes" || test "$(avx2)" = "no") && \ + (test "$(avx512)" = "yes" || test "$(avx512)" = "no") && \ + (test "$(vnni256)" = "yes" || test "$(vnni256)" = "no") && \ + (test "$(vnni512)" = "yes" || test "$(vnni512)" = "no") && \ + (test "$(altivec)" = "yes" || test "$(altivec)" = "no") && \ + (test "$(vsx)" = "yes" || test "$(vsx)" = "no") && \ + (test "$(neon)" = "yes" || test "$(neon)" = "no") && \ + (test "$(lsx)" = "yes" || test "$(lsx)" = "no") && \ + (test "$(lasx)" = "yes" || test "$(lasx)" = "no") && \ + (test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || \ + test "$(comp)" = "clang" || test "$(comp)" = "armv7a-linux-androideabi16-clang" || \ + test "$(comp)" = "aarch64-linux-android21-clang") $(EXE): $(OBJS) +$(CXX) -o $@ $(OBJS) $(LDFLAGS) @@ -1051,14 +1075,14 @@ FORCE: clang-profile-make: $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-generate ' \ - EXTRALDFLAGS=' -fprofile-instr-generate' \ + EXTRACXXFLAGS='-fprofile-generate ' \ + EXTRALDFLAGS=' -fprofile-generate' \ all clang-profile-use: $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ + EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' \ EXTRALDFLAGS='-fprofile-use ' \ all diff --git a/src/benchmark.cpp b/src/benchmark.cpp index 3622ac8a..35ad3c18 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -17,6 +17,7 @@ */ #include "benchmark.h" +#include "numa.h" #include #include @@ -91,6 +92,282 @@ const std::vector Defaults = { }; // clang-format on +// clang-format off +// human-randomly picked 5 games with <60 moves from +// https://tests.stockfishchess.org/tests/view/665c71f9fd45fb0f907c21e0 +// only moves for one side +const std::vector> BenchmarkPositions = { + { + "rnbq1k1r/ppp1bppp/4pn2/8/2B5/2NP1N2/PPP2PPP/R1BQR1K1 b - - 2 8", + "rnbq1k1r/pp2bppp/4pn2/2p5/2B2B2/2NP1N2/PPP2PPP/R2QR1K1 b - - 1 9", + "r1bq1k1r/pp2bppp/2n1pn2/2p5/2B1NB2/3P1N2/PPP2PPP/R2QR1K1 b - - 3 10", + "r1bq1k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/R2QR1K1 b - - 0 11", + "r1b2k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/3RR1K1 b - - 0 12", + "r1b1k2r/pp2bppp/2n1p3/2p5/2B1PB2/2P2N2/PP3PPP/3RR1K1 b - - 0 13", + "r1b1k2r/1p2bppp/p1n1p3/2p5/4PB2/2P2N2/PP2BPPP/3RR1K1 b - - 1 14", + "r1b1k2r/4bppp/p1n1p3/1pp5/P3PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 15", + "r1b1k2r/4bppp/p1n1p3/1P6/2p1PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 16", + "r1b1k2r/4bppp/2n1p3/1p6/2p1PB2/1PP2N2/4BPPP/3RR1K1 b - - 0 17", + "r3k2r/3bbppp/2n1p3/1p6/2P1PB2/2P2N2/4BPPP/3RR1K1 b - - 0 18", + "r3k2r/3bbppp/2n1p3/8/1pP1P3/2P2N2/3BBPPP/3RR1K1 b - - 1 19", + "1r2k2r/3bbppp/2n1p3/8/1pPNP3/2P5/3BBPPP/3RR1K1 b - - 3 20", + "1r2k2r/3bbppp/2n1p3/8/2PNP3/2B5/4BPPP/3RR1K1 b - - 0 21", + "1r2k2r/3bb1pp/2n1pp2/1N6/2P1P3/2B5/4BPPP/3RR1K1 b - - 1 22", + "1r2k2r/3b2pp/2n1pp2/1N6/1BP1P3/8/4BPPP/3RR1K1 b - - 0 23", + "1r2k2r/3b2pp/4pp2/1N6/1nP1P3/8/3RBPPP/4R1K1 b - - 1 24", + "1r5r/3bk1pp/4pp2/1N6/1nP1PP2/8/3RB1PP/4R1K1 b - - 0 25", + "1r5r/3bk1pp/2n1pp2/1N6/2P1PP2/8/3RBKPP/4R3 b - - 2 26", + "1r5r/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/4R3 b - - 0 27", + "1r1r4/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/R7 b - - 2 28", + "1r1r4/N3k1pp/2n1bp2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 4 29", + "1r1r4/3bk1pp/2N2p2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 0 30", + "1r1R4/4k1pp/2b2p2/4p3/2P1PP2/6P1/4BK1P/R7 b - - 0 31", + "3r4/4k1pp/2b2p2/4P3/2P1P3/6P1/4BK1P/R7 b - - 0 32", + "3r4/R3k1pp/2b5/4p3/2P1P3/6P1/4BK1P/8 b - - 1 33", + "8/3rk1pp/2b5/R3p3/2P1P3/6P1/4BK1P/8 b 
- - 3 34", + "8/3r2pp/2bk4/R1P1p3/4P3/6P1/4BK1P/8 b - - 0 35", + "8/2kr2pp/2b5/R1P1p3/4P3/4K1P1/4B2P/8 b - - 2 36", + "1k6/3r2pp/2b5/RBP1p3/4P3/4K1P1/7P/8 b - - 4 37", + "8/1k1r2pp/2b5/R1P1p3/4P3/3BK1P1/7P/8 b - - 6 38", + "1k6/3r2pp/2b5/2P1p3/4P3/3BK1P1/7P/R7 b - - 8 39", + "1k6/r5pp/2b5/2P1p3/4P3/3BK1P1/7P/5R2 b - - 10 40", + "1k3R2/6pp/2b5/2P1p3/4P3/r2BK1P1/7P/8 b - - 12 41", + "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 14 42", + "5R2/2k3pp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 16 43", + "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 18 44", + "5R2/2k3pp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 20 45", + "8/2k2Rpp/2b5/2P1p3/4P3/r2B1KP1/7P/8 b - - 22 46", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 24 47", + "3k4/5Rpp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 26 48", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 28 49", + "3k4/5Rpp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 30 50", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 32 51", + "3k4/5Rpp/2b5/2P1p3/4P3/2KB2P1/r6P/8 b - - 34 52", + "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/2K4P/8 b - - 36 53", + "3k4/5Rpp/2b5/2P1p3/4P3/1K1B2P1/r6P/8 b - - 38 54", + "3k4/6Rp/2b5/2P1p3/4P3/1K1B2P1/7r/8 b - - 0 55", + "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 1 56", + "8/2k3R1/2b4p/2P1p3/4P3/1K1B2P1/7r/8 b - - 3 57", + "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 5 58", + "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/7r/8 b - - 7 59", + "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 9 60", + "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/6r1/8 b - - 11 61", + "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 13 62", + "8/2k5/2b3Rp/2P1p3/2K1P3/3B2P1/6r1/8 b - - 15 63", + "4b3/2k3R1/7p/2P1p3/2K1P3/3B2P1/6r1/8 b - - 17 64", + }, + { + "r1bqkbnr/npp1pppp/p7/3P4/4pB2/2N5/PPP2PPP/R2QKBNR w KQkq - 1 6", + "r1bqkb1r/npp1pppp/p4n2/3P4/4pB2/2N5/PPP1QPPP/R3KBNR w KQkq - 3 7", + "r2qkb1r/npp1pppp/p4n2/3P1b2/4pB2/2N5/PPP1QPPP/2KR1BNR w kq - 5 8", + "r2qkb1r/1pp1pppp/p4n2/1n1P1b2/4pB2/2N4P/PPP1QPP1/2KR1BNR w kq - 1 9", + "r2qkb1r/1pp1pppp/5n2/1p1P1b2/4pB2/7P/PPP1QPP1/2KR1BNR w kq - 0 10", + "r2qkb1r/1ppbpppp/5n2/1Q1P4/4pB2/7P/PPP2PP1/2KR1BNR w kq - 1 11", + "3qkb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/2KR1BNR w k - 0 12", + "q3kb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/1K1R1BNR w k - 2 13", + "r3kb1r/2pbpppp/5n2/3P4/4pB2/7P/1PP2PP1/1K1R1BNR w k - 0 14", + "r3kb1r/2Bb1ppp/4pn2/3P4/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 15", + "r3kb1r/2Bb2pp/4pn2/8/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 16", + "r3k2r/2Bb2pp/4pn2/2b5/4p3/7P/1PP1NPP1/1K1R1B1R w k - 2 17", + "r6r/2Bbk1pp/4pn2/2b5/3Np3/7P/1PP2PP1/1K1R1B1R w - - 4 18", + "r6r/b2bk1pp/4pn2/4B3/3Np3/7P/1PP2PP1/1K1R1B1R w - - 6 19", + "r1r5/b2bk1pp/4pn2/4B3/2BNp3/7P/1PP2PP1/1K1R3R w - - 8 20", + "r7/b2bk1pp/4pn2/2r1B3/2BNp3/1P5P/2P2PP1/1K1R3R w - - 1 21", + "rb6/3bk1pp/4pn2/2r1B3/2BNpP2/1P5P/2P3P1/1K1R3R w - - 1 22", + "1r6/3bk1pp/4pn2/2r5/2BNpP2/1P5P/2P3P1/1K1R3R w - - 0 23", + "1r6/3bk1p1/4pn1p/2r5/2BNpP2/1P5P/2P3P1/2KR3R w - - 0 24", + "8/3bk1p1/1r2pn1p/2r5/2BNpP1P/1P6/2P3P1/2KR3R w - - 1 25", + "8/3bk3/1r2pnpp/2r5/2BNpP1P/1P6/2P3P1/2K1R2R w - - 0 26", + "2b5/4k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R2R w - - 1 27", + "8/1b2k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R1R1 w - - 3 28", + "8/1b1nk3/1r2p1pp/2r5/2BNpPPP/1P6/2P5/2K1R1R1 w - - 1 29", + "8/1b2k3/1r2p1pp/2r1nP2/2BNp1PP/1P6/2P5/2K1R1R1 w - - 1 30", + "8/1b2k3/1r2p1p1/2r1nPp1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 31", + "8/1b2k3/1r2p1n1/2r3p1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 32", + "8/1b2k3/1r2p1n1/6r1/2BNp2P/1P6/2P5/2K1R3 w - - 0 33", + "8/1b2k3/1r2p3/4n1P1/2BNp3/1P6/2P5/2K1R3 w - - 1 34", + "8/1b2k3/1r2p3/4n1P1/2BN4/1P2p3/2P5/2K4R w - - 0 35", + "8/1b2k3/1r2p2R/6P1/2nN4/1P2p3/2P5/2K5 w - - 0 
36", + "8/1b2k3/3rp2R/6P1/2PN4/4p3/2P5/2K5 w - - 1 37", + "8/4k3/3rp2R/6P1/2PN4/2P1p3/6b1/2K5 w - - 1 38", + "8/4k3/r3p2R/2P3P1/3N4/2P1p3/6b1/2K5 w - - 1 39", + "8/3k4/r3p2R/2P2NP1/8/2P1p3/6b1/2K5 w - - 3 40", + "8/3k4/4p2R/2P3P1/8/2P1N3/6b1/r1K5 w - - 1 41", + "8/3k4/4p2R/2P3P1/8/2P1N3/3K2b1/6r1 w - - 3 42", + "8/3k4/4p2R/2P3P1/8/2PKNb2/8/6r1 w - - 5 43", + "8/4k3/4p1R1/2P3P1/8/2PKNb2/8/6r1 w - - 7 44", + "8/4k3/4p1R1/2P3P1/3K4/2P1N3/8/6rb w - - 9 45", + "8/3k4/4p1R1/2P1K1P1/8/2P1N3/8/6rb w - - 11 46", + "8/3k4/4p1R1/2P3P1/5K2/2P1N3/8/4r2b w - - 13 47", + "8/3k4/2b1p2R/2P3P1/5K2/2P1N3/8/4r3 w - - 15 48", + "8/3k4/2b1p3/2P3P1/5K2/2P1N2R/8/6r1 w - - 17 49", + "2k5/7R/2b1p3/2P3P1/5K2/2P1N3/8/6r1 w - - 19 50", + "2k5/7R/4p3/2P3P1/b1P2K2/4N3/8/6r1 w - - 1 51", + "2k5/3bR3/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 3 52", + "3k4/3b2R1/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 5 53", + "3kb3/6R1/4p1P1/2P5/2P2K2/4N3/8/6r1 w - - 1 54", + "3kb3/6R1/4p1P1/2P5/2P2KN1/8/8/2r5 w - - 3 55", + "3kb3/6R1/4p1P1/2P1N3/2P2K2/8/8/5r2 w - - 5 56", + "3kb3/6R1/4p1P1/2P1N3/2P5/4K3/8/4r3 w - - 7 57", + }, + { + "rnbq1rk1/ppp1npb1/4p1p1/3P3p/3PP3/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 8", + "rnbq1rk1/ppp1npb1/6p1/3pP2p/3P4/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 9", + "rn1q1rk1/ppp1npb1/6p1/3pP2p/3P2b1/2N2N2/PP2BPPP/R1BQR1K1 b - - 2 10", + "r2q1rk1/ppp1npb1/2n3p1/3pP2p/3P2bN/2N5/PP2BPPP/R1BQR1K1 b - - 4 11", + "r4rk1/pppqnpb1/2n3p1/3pP2p/3P2bN/2N4P/PP2BPP1/R1BQR1K1 b - - 0 12", + "r4rk1/pppqnpb1/2n3p1/3pP2p/3P3N/7P/PP2NPP1/R1BQR1K1 b - - 0 13", + "r4rk1/pppq1pb1/2n3p1/3pPN1p/3P4/7P/PP2NPP1/R1BQR1K1 b - - 0 14", + "r4rk1/ppp2pb1/2n3p1/3pPq1p/3P1N2/7P/PP3PP1/R1BQR1K1 b - - 1 15", + "r4rk1/pppq1pb1/2n3p1/3pP2p/P2P1N2/7P/1P3PP1/R1BQR1K1 b - - 0 16", + "r2n1rk1/pppq1pb1/6p1/3pP2p/P2P1N2/R6P/1P3PP1/2BQR1K1 b - - 2 17", + "r4rk1/pppq1pb1/4N1p1/3pP2p/P2P4/R6P/1P3PP1/2BQR1K1 b - - 0 18", + "r4rk1/ppp2pb1/4q1p1/3pP1Bp/P2P4/R6P/1P3PP1/3QR1K1 b - - 1 19", + "r3r1k1/ppp2pb1/4q1p1/3pP1Bp/P2P1P2/R6P/1P4P1/3QR1K1 b - - 0 20", + "r3r1k1/ppp3b1/4qpp1/3pP2p/P2P1P1B/R6P/1P4P1/3QR1K1 b - - 1 21", + "r3r1k1/ppp3b1/4q1p1/3pP2p/P4P1B/R6P/1P4P1/3QR1K1 b - - 0 22", + "r4rk1/ppp3b1/4q1p1/3pP1Bp/P4P2/R6P/1P4P1/3QR1K1 b - - 2 23", + "r4rk1/pp4b1/4q1p1/2ppP1Bp/P4P2/3R3P/1P4P1/3QR1K1 b - - 1 24", + "r4rk1/pp4b1/4q1p1/2p1P1Bp/P2p1PP1/3R3P/1P6/3QR1K1 b - - 0 25", + "r4rk1/pp4b1/4q1p1/2p1P1B1/P2p1PP1/3R4/1P6/3QR1K1 b - - 0 26", + "r5k1/pp3rb1/4q1p1/2p1P1B1/P2p1PP1/6R1/1P6/3QR1K1 b - - 2 27", + "5rk1/pp3rb1/4q1p1/2p1P1B1/P2pRPP1/6R1/1P6/3Q2K1 b - - 4 28", + "5rk1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/6R1/1P6/3Q2K1 b - - 0 29", + "4r1k1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 30", + "4r1k1/5rb1/pP2q1p1/2p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 31", + "4r1k1/5rb1/pq4p1/2p1P1B1/3pRPP1/1P4R1/4Q3/6K1 b - - 1 32", + "4r1k1/1r4b1/pq4p1/2p1P1B1/3pRPP1/1P4R1/2Q5/6K1 b - - 3 33", + "4r1k1/1r4b1/1q4p1/p1p1P1B1/3p1PP1/1P4R1/2Q5/4R1K1 b - - 1 34", + "4r1k1/3r2b1/1q4p1/p1p1P1B1/2Qp1PP1/1P4R1/8/4R1K1 b - - 3 35", + "4r1k1/3r2b1/4q1p1/p1p1P1B1/2Qp1PP1/1P4R1/5K2/4R3 b - - 5 36", + "4r1k1/3r2b1/6p1/p1p1P1B1/2Pp1PP1/6R1/5K2/4R3 b - - 0 37", + "4r1k1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/5K2/3R4 b - - 1 38", + "5rk1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/8/3RK3 b - - 3 39", + "5rk1/6b1/6p1/p1p1P1B1/2Pr1PP1/3R4/8/3RK3 b - - 0 40", + "5rk1/3R2b1/6p1/p1p1P1B1/2r2PP1/8/8/3RK3 b - - 1 41", + "5rk1/3R2b1/6p1/p1p1P1B1/4rPP1/8/3K4/3R4 b - - 3 42", + "1r4k1/3R2b1/6p1/p1p1P1B1/4rPP1/2K5/8/3R4 b - - 5 43", + "1r4k1/3R2b1/6p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 7 44", + "1r3bk1/8/3R2p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 9 45", + 
"1r3bk1/8/6R1/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 0 46", + "1r3b2/5k2/R7/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 2 47", + "5b2/1r3k2/R7/2p1P1B1/p1K2PP1/4r3/8/7R b - - 4 48", + "5b2/5k2/R7/2pKP1B1/pr3PP1/4r3/8/7R b - - 6 49", + "5b2/5k2/R1K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 8 50", + "8/R4kb1/2K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 10 51", + "8/R5b1/2K3k1/2p1PPB1/p2r2P1/4r3/8/7R b - - 0 52", + "8/6R1/2K5/2p1PPk1/p2r2P1/4r3/8/7R b - - 0 53", + "8/6R1/2K5/2p1PP2/p2r1kP1/4r3/8/5R2 b - - 2 54", + "8/6R1/2K2P2/2p1P3/p2r2P1/4r1k1/8/5R2 b - - 0 55", + "8/5PR1/2K5/2p1P3/p2r2P1/4r3/6k1/5R2 b - - 0 56", + }, + { + "rn1qkb1r/p1pbpppp/5n2/8/2pP4/2N5/1PQ1PPPP/R1B1KBNR w KQkq - 0 7", + "r2qkb1r/p1pbpppp/2n2n2/8/2pP4/2N2N2/1PQ1PPPP/R1B1KB1R w KQkq - 2 8", + "r2qkb1r/p1pbpppp/5n2/8/1npPP3/2N2N2/1PQ2PPP/R1B1KB1R w KQkq - 1 9", + "r2qkb1r/p1pb1ppp/4pn2/8/1npPP3/2N2N2/1P3PPP/R1BQKB1R w KQkq - 0 10", + "r2qk2r/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQK2R w KQkq - 1 11", + "r2q1rk1/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQ1RK1 w - - 3 12", + "r2q1rk1/2pbbppp/p3pn2/8/1nBPPB2/2N2N2/1P3PPP/R2Q1RK1 w - - 0 13", + "r2q1rk1/2p1bppp/p3pn2/1b6/1nBPPB2/2N2N2/1P3PPP/R2QR1K1 w - - 2 14", + "r2q1rk1/4bppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/5PPP/R2QR1K1 w - - 0 15", + "r4rk1/3qbppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/3Q1PPP/R3R1K1 w - - 2 16", + "r4rk1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/3Q1PP1/R3R1K1 w - - 1 17", + "r3r1k1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/4QPP1/R3R1K1 w - - 3 18", + "r3r1k1/1q1nbppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/4QPP1/3RR1K1 w - - 5 19", + "r3rbk1/1q1n1ppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R1K1 w - - 7 20", + "r3rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R2K w - - 9 21", + "2r1rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/1R5K w - - 11 22", + "2r1rbk1/1q4pp/pnp1pp2/1b6/1nBPPB2/1PN2N1P/4QPP1/1R1R3K w - - 0 23", + "2r1rbk1/5qpp/pnp1pp2/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R3K w - - 2 24", + "2r1rbk1/5qp1/pnp1pp1p/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R2K1 w - - 0 25", + "2r1rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/n3QPP1/1R1R2K1 w - - 0 26", + "r3rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/1R1R2K1 w - - 1 27", + "rr3bk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/R2R2K1 w - - 3 28", + "rr2qbk1/6p1/pnp1pp1p/1b6/2BPP3/1P2BN1P/4QPP1/R2R2K1 w - - 5 29", + "rr2qbk1/6p1/1np1pp1p/pb6/2BPP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 30", + "rr2qbk1/6p1/1n2pp1p/pp6/3PP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 31", + "rr2qbk1/6p1/1n2pp1p/1p1P4/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 0 32", + "rr2qbk1/3n2p1/3Ppp1p/1p6/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 1 33", + "rr3bk1/3n2p1/3Ppp1p/1p5q/pP2P3/3QBN1P/5PP1/R2R2K1 w - - 1 34", + "rr3bk1/3n2p1/3Ppp1p/1p5q/1P2P3/p2QBN1P/5PP1/2RR2K1 w - - 0 35", + "1r3bk1/3n2p1/r2Ppp1p/1p5q/1P2P3/pQ2BN1P/5PP1/2RR2K1 w - - 2 36", + "1r2qbk1/2Rn2p1/r2Ppp1p/1p6/1P2P3/pQ2BN1P/5PP1/3R2K1 w - - 4 37", + "1r2qbk1/2Rn2p1/r2Ppp1p/1pB5/1P2P3/1Q3N1P/p4PP1/3R2K1 w - - 0 38", + "1r2q1k1/2Rn2p1/r2bpp1p/1pB5/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 39", + "1r2q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 40", + "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 1 41", + "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 3 42", + "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 5 43", + "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 7 44", + "1rq3k1/R2n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 9 45", + "2q3k1/Rr1n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 11 46", + "Rrq3k1/3n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 13 47", + }, + { + "rn1qkb1r/1pp2ppp/p4p2/3p1b2/5P2/1P2PN2/P1PP2PP/RN1QKB1R b KQkq - 1 6", + "r2qkb1r/1pp2ppp/p1n2p2/3p1b2/3P1P2/1P2PN2/P1P3PP/RN1QKB1R b KQkq - 0 
7", + "r2qkb1r/1pp2ppp/p4p2/3p1b2/1n1P1P2/1P1BPN2/P1P3PP/RN1QK2R b KQkq - 2 8", + "r2qkb1r/1pp2ppp/p4p2/3p1b2/3P1P2/1P1PPN2/P5PP/RN1QK2R b KQkq - 0 9", + "r2qk2r/1pp2ppp/p2b1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2QK2R b KQkq - 2 10", + "r2qk2r/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2Q1RK1 b kq - 1 11", + "r2q1rk1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P2Q2PP/R4RK1 b - - 3 12", + "r2qr1k1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1P1PPN2/P2QN1PP/R4RK1 b - - 5 13", + "r3r1k1/1p3ppp/pqpb1p2/3p1b2/3P1P2/1P1PPNN1/P2Q2PP/R4RK1 b - - 7 14", + "r3r1k1/1p3ppp/pqp2p2/3p1b2/1b1P1P2/1P1PPNN1/P1Q3PP/R4RK1 b - - 9 15", + "r3r1k1/1p1b1ppp/pqp2p2/3p4/1b1P1P2/1P1PPNN1/P4QPP/R4RK1 b - - 11 16", + "2r1r1k1/1p1b1ppp/pqp2p2/3p4/1b1PPP2/1P1P1NN1/P4QPP/R4RK1 b - - 0 17", + "2r1r1k1/1p1b1ppp/pq3p2/2pp4/1b1PPP2/PP1P1NN1/5QPP/R4RK1 b - - 0 18", + "2r1r1k1/1p1b1ppp/pq3p2/2Pp4/4PP2/PPbP1NN1/5QPP/R4RK1 b - - 0 19", + "2r1r1k1/1p1b1ppp/p4p2/2Pp4/4PP2/PqbP1NN1/5QPP/RR4K1 b - - 1 20", + "2r1r1k1/1p1b1ppp/p4p2/2Pp4/q3PP2/P1bP1NN1/R4QPP/1R4K1 b - - 3 21", + "2r1r1k1/1p3ppp/p4p2/1bPP4/q4P2/P1bP1NN1/R4QPP/1R4K1 b - - 0 22", + "2r1r1k1/1p3ppp/p4p2/2PP4/q4P2/P1bb1NN1/R4QPP/2R3K1 b - - 1 23", + "2r1r1k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R3K1 b - - 0 24", + "2rr2k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R4K b - - 2 25", + "2rr2k1/1p3ppp/p2P1p2/2Q5/5P2/P1bb1NN1/R5PP/2R4K b - - 0 26", + "3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1bb1N2/R3N1PP/2R4K b - - 1 27", + "3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1b2N2/4R1PP/2R4K b - - 0 28", + "3r2k1/1p3ppp/p2P1p2/2r5/1b3P2/P4N2/4R1PP/3R3K b - - 2 29", + "3r2k1/1p2Rppp/p2P1p2/b1r5/5P2/P4N2/6PP/3R3K b - - 4 30", + "3r2k1/1R3ppp/p1rP1p2/b7/5P2/P4N2/6PP/3R3K b - - 0 31", + "3r2k1/1R3ppp/p2R1p2/b7/5P2/P4N2/6PP/7K b - - 0 32", + "6k1/1R3ppp/p2r1p2/b7/5P2/P4NP1/7P/7K b - - 0 33", + "6k1/1R3p1p/p2r1pp1/b7/5P1P/P4NP1/8/7K b - - 0 34", + "6k1/3R1p1p/pr3pp1/b7/5P1P/P4NP1/8/7K b - - 2 35", + "6k1/5p2/pr3pp1/b2R3p/5P1P/P4NP1/8/7K b - - 1 36", + "6k1/5p2/pr3pp1/7p/5P1P/P1bR1NP1/8/7K b - - 3 37", + "6k1/5p2/p1r2pp1/7p/5P1P/P1bR1NP1/6K1/8 b - - 5 38", + "6k1/5p2/p1r2pp1/b2R3p/5P1P/P4NP1/6K1/8 b - - 7 39", + "6k1/5p2/p4pp1/b2R3p/5P1P/P4NPK/2r5/8 b - - 9 40", + "6k1/2b2p2/p4pp1/7p/5P1P/P2R1NPK/2r5/8 b - - 11 41", + "6k1/2b2p2/5pp1/p6p/3N1P1P/P2R2PK/2r5/8 b - - 1 42", + "6k1/2b2p2/5pp1/p6p/3N1P1P/P1R3PK/r7/8 b - - 3 43", + "6k1/5p2/1b3pp1/p6p/5P1P/P1R3PK/r1N5/8 b - - 5 44", + "8/5pk1/1bR2pp1/p6p/5P1P/P5PK/r1N5/8 b - - 7 45", + "3b4/5pk1/2R2pp1/p4P1p/7P/P5PK/r1N5/8 b - - 0 46", + "8/4bpk1/2R2pp1/p4P1p/6PP/P6K/r1N5/8 b - - 0 47", + "8/5pk1/2R2pP1/p6p/6PP/b6K/r1N5/8 b - - 0 48", + "8/6k1/2R2pp1/p6P/7P/b6K/r1N5/8 b - - 0 49", + "8/6k1/2R2p2/p6p/7P/b5K1/r1N5/8 b - - 1 50", + "8/8/2R2pk1/p6p/7P/b4K2/r1N5/8 b - - 3 51", + "8/8/2R2pk1/p6p/7P/4NK2/rb6/8 b - - 5 52", + "2R5/8/5pk1/7p/p6P/4NK2/rb6/8 b - - 1 53", + "6R1/8/5pk1/7p/p6P/4NK2/1b6/r7 b - - 3 54", + "R7/5k2/5p2/7p/p6P/4NK2/1b6/r7 b - - 5 55", + "R7/5k2/5p2/7p/7P/p3N3/1b2K3/r7 b - - 1 56", + "8/R4k2/5p2/7p/7P/p3N3/1b2K3/7r b - - 3 57", + "8/8/5pk1/7p/R6P/p3N3/1b2K3/7r b - - 5 58", + "8/8/5pk1/7p/R6P/p7/4K3/2bN3r b - - 7 59", + "8/8/5pk1/7p/R6P/p7/4KN1r/2b5 b - - 9 60", + "8/8/5pk1/7p/R6P/p3K3/1b3N1r/8 b - - 11 61", + "8/8/R4pk1/7p/7P/p1b1K3/5N1r/8 b - - 13 62", + "8/8/5pk1/7p/7P/2b1K3/R4N1r/8 b - - 0 63", + "8/8/5pk1/7p/3K3P/8/R4N1r/4b3 b - - 2 64", + } +}; +// clang-format on + } // namespace namespace Stockfish::Benchmark { @@ -160,4 +437,76 @@ std::vector setup_bench(const std::string& currentFen, std::istream return list; } +BenchmarkSetup setup_benchmark(std::istream& is) { + // TT_SIZE_PER_THREAD 
is chosen such that roughly half of the hash is used all positions + // for the current sequence have been searched. + static constexpr int TT_SIZE_PER_THREAD = 128; + + static constexpr int DEFAULT_DURATION_S = 150; + + BenchmarkSetup setup{}; + + // Assign default values to missing arguments + int desiredTimeS; + + if (!(is >> setup.threads)) + setup.threads = get_hardware_concurrency(); + else + setup.originalInvocation += std::to_string(setup.threads); + + if (!(is >> setup.ttSize)) + setup.ttSize = TT_SIZE_PER_THREAD * setup.threads; + else + setup.originalInvocation += " " + std::to_string(setup.ttSize); + + if (!(is >> desiredTimeS)) + desiredTimeS = DEFAULT_DURATION_S; + else + setup.originalInvocation += " " + std::to_string(desiredTimeS); + + setup.filledInvocation += std::to_string(setup.threads) + " " + std::to_string(setup.ttSize) + + " " + std::to_string(desiredTimeS); + + auto getCorrectedTime = [&](int ply) { + // time per move is fit roughly based on LTC games + // seconds = 50/{ply+15} + // ms = 50000/{ply+15} + // with this fit 10th move gets 2000ms + // adjust for desired 10th move time + return 50000.0 / (static_cast(ply) + 15.0); + }; + + float totalTime = 0; + for (const auto& game : BenchmarkPositions) + { + setup.commands.emplace_back("ucinewgame"); + int ply = 1; + for (int i = 0; i < static_cast(game.size()); ++i) + { + const float correctedTime = getCorrectedTime(ply); + totalTime += correctedTime; + ply += 1; + } + } + + float timeScaleFactor = static_cast(desiredTimeS * 1000) / totalTime; + + for (const auto& game : BenchmarkPositions) + { + setup.commands.emplace_back("ucinewgame"); + int ply = 1; + for (const std::string& fen : game) + { + setup.commands.emplace_back("position fen " + fen); + + const int correctedTime = static_cast(getCorrectedTime(ply) * timeScaleFactor); + setup.commands.emplace_back("go movetime " + std::to_string(correctedTime)); + + ply += 1; + } + } + + return setup; +} + } // namespace Stockfish \ No newline at end of file diff --git a/src/benchmark.h b/src/benchmark.h index b1eba40f..eb3a52d8 100644 --- a/src/benchmark.h +++ b/src/benchmark.h @@ -27,6 +27,16 @@ namespace Stockfish::Benchmark { std::vector setup_bench(const std::string&, std::istream&); +struct BenchmarkSetup { + int ttSize; + int threads; + std::vector commands; + std::string originalInvocation; + std::string filledInvocation; +}; + +BenchmarkSetup setup_benchmark(std::istream&); + } // namespace Stockfish #endif // #ifndef BENCHMARK_H_INCLUDED diff --git a/src/bitboard.cpp b/src/bitboard.cpp index 32c626d4..deda6da2 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -34,15 +34,14 @@ Bitboard BetweenBB[SQUARE_NB][SQUARE_NB]; Bitboard PseudoAttacks[PIECE_TYPE_NB][SQUARE_NB]; Bitboard PawnAttacks[COLOR_NB][SQUARE_NB]; -Magic RookMagics[SQUARE_NB]; -Magic BishopMagics[SQUARE_NB]; +alignas(64) Magic Magics[SQUARE_NB][2]; namespace { Bitboard RookTable[0x19000]; // To store rook attacks Bitboard BishopTable[0x1480]; // To store bishop attacks -void init_magics(PieceType pt, Bitboard table[], Magic magics[]); +void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]); // Returns the bitboard of target square for the given step // from the given square. If the step is off the board, returns empty bitboard. 
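// [Editor's note: illustrative sketch, not part of the patch. The
// setup_benchmark() hunk above allots think time per position with the fit
// ms = 50000 / (ply + 15) (its comment notes the 10th move then gets 2000 ms)
// and rescales all movetimes so the whole run lasts roughly the requested
// number of seconds. The same arithmetic as a tiny standalone program, with
// made-up plies:]
#include <cstdio>
#include <vector>

int main() {
    const int        desiredTimeS = 150;                // default used by the patch
    std::vector<int> plies        = {1, 2, 3, 10, 40};  // hypothetical game plies

    auto rawTimeMs = [](int ply) { return 50000.0 / (ply + 15.0); };

    double total = 0;
    for (int ply : plies)
        total += rawTimeMs(ply);

    const double scale = desiredTimeS * 1000.0 / total;  // timeScaleFactor

    for (int ply : plies)
        std::printf("ply %2d -> go movetime %d\n", ply, int(rawTimeMs(ply) * scale));
}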
@@ -82,8 +81,8 @@ void Bitboards::init() { for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2) SquareDistance[s1][s2] = std::max(distance(s1, s2), distance(s1, s2)); - init_magics(ROOK, RookTable, RookMagics); - init_magics(BISHOP, BishopTable, BishopMagics); + init_magics(ROOK, RookTable, Magics); + init_magics(BISHOP, BishopTable, Magics); for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) { @@ -124,8 +123,14 @@ Bitboard sliding_attack(PieceType pt, Square sq, Bitboard occupied) { for (Direction d : (pt == ROOK ? RookDirections : BishopDirections)) { Square s = sq; - while (safe_destination(s, d) && !(occupied & s)) + while (safe_destination(s, d)) + { attacks |= (s += d); + if (occupied & s) + { + break; + } + } } return attacks; @@ -134,41 +139,49 @@ Bitboard sliding_attack(PieceType pt, Square sq, Bitboard occupied) { // Computes all rook and bishop attacks at startup. Magic // bitboards are used to look up attacks of sliding pieces. As a reference see -// www.chessprogramming.org/Magic_Bitboards. In particular, here we use the so -// called "fancy" approach. -void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { +// https://www.chessprogramming.org/Magic_Bitboards. In particular, here we use +// the so called "fancy" approach. +void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]) { +#ifndef USE_PEXT // Optimal PRNG seeds to pick the correct magics in the shortest time int seeds[][RANK_NB] = {{8977, 44560, 54343, 38998, 5731, 95205, 104912, 17020}, {728, 10316, 55013, 32803, 12281, 15100, 16645, 255}}; - Bitboard occupancy[4096], reference[4096], edges, b; - int epoch[4096] = {}, cnt = 0, size = 0; + Bitboard occupancy[4096]; + int epoch[4096] = {}, cnt = 0; +#endif + Bitboard reference[4096]; + int size = 0; for (Square s = SQ_A1; s <= SQ_H8; ++s) { // Board edges are not considered in the relevant occupancies - edges = ((Rank1BB | Rank8BB) & ~rank_bb(s)) | ((FileABB | FileHBB) & ~file_bb(s)); + Bitboard edges = ((Rank1BB | Rank8BB) & ~rank_bb(s)) | ((FileABB | FileHBB) & ~file_bb(s)); // Given a square 's', the mask is the bitboard of sliding attacks from // 's' computed on an empty board. The index must be big enough to contain // all the attacks for each possible subset of the mask and so is 2 power // the number of 1s of the mask. Hence we deduce the size of the shift to // apply to the 64 or 32 bits word to get the index. - Magic& m = magics[s]; + Magic& m = magics[s][pt - BISHOP]; m.mask = sliding_attack(pt, s, 0) & ~edges; - m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask); - +#ifndef USE_PEXT + m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask); +#endif // Set the offset for the attacks table of the square. We have individual // table sizes for each square with "Fancy Magic Bitboards". - m.attacks = s == SQ_A1 ? table : magics[s - 1].attacks + size; + m.attacks = s == SQ_A1 ? table : magics[s - 1][pt - BISHOP].attacks + size; + size = 0; // Use Carry-Rippler trick to enumerate all subsets of masks[s] and // store the corresponding sliding attack bitboard in reference[]. 
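// [Editor's note: illustrative sketch, not part of the patch. The do/while
// loop that follows in init_magics() relies on the Carry-Rippler trick:
// starting from b = 0, the update b = (b - mask) & mask visits every subset
// of `mask` exactly once before wrapping back to 0, i.e. 2^popcount(mask)
// occupancies. A minimal standalone demonstration:]
#include <cstdint>
#include <cstdio>

int main() {
    const std::uint64_t mask  = 0b101100;  // any relevant-occupancy mask
    std::uint64_t       b     = 0;
    int                 count = 0;
    do
    {
        std::printf("subset: 0x%llx\n", (unsigned long long) b);
        ++count;
        b = (b - mask) & mask;
    } while (b);
    std::printf("%d subsets enumerated\n", count);  // 8 = 2^3 for this mask
}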
- b = size = 0; + Bitboard b = 0; do { +#ifndef USE_PEXT occupancy[size] = b; +#endif reference[size] = sliding_attack(pt, s, b); if (HasPext) @@ -178,9 +191,7 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { b = (b - m.mask) & m.mask; } while (b); - if (HasPext) - continue; - +#ifndef USE_PEXT PRNG rng(seeds[Is64Bit][rank_of(s)]); // Find a magic for square 's' picking up an (almost) random number @@ -209,6 +220,7 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { break; } } +#endif } } } diff --git a/src/bitboard.h b/src/bitboard.h index cdff4c75..c4bf18b5 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -67,27 +67,31 @@ extern Bitboard PawnAttacks[COLOR_NB][SQUARE_NB]; // Magic holds all magic bitboards relevant data for a single square struct Magic { Bitboard mask; - Bitboard magic; Bitboard* attacks; - unsigned shift; +#ifndef USE_PEXT + Bitboard magic; + unsigned shift; +#endif // Compute the attack's index using the 'magic bitboards' approach unsigned index(Bitboard occupied) const { - if (HasPext) - return unsigned(pext(occupied, mask)); - +#ifdef USE_PEXT + return unsigned(pext(occupied, mask)); +#else if (Is64Bit) return unsigned(((occupied & mask) * magic) >> shift); unsigned lo = unsigned(occupied) & unsigned(mask); unsigned hi = unsigned(occupied >> 32) & unsigned(mask >> 32); return (lo * unsigned(magic) ^ hi * unsigned(magic >> 32)) >> shift; +#endif } + + Bitboard attacks_bb(Bitboard occupied) const { return attacks[index(occupied)]; } }; -extern Magic RookMagics[SQUARE_NB]; -extern Magic BishopMagics[SQUARE_NB]; +extern Magic Magics[SQUARE_NB][2]; constexpr Bitboard square_bb(Square s) { assert(is_ok(s)); @@ -229,9 +233,8 @@ inline Bitboard attacks_bb(Square s, Bitboard occupied) { switch (Pt) { case BISHOP : - return BishopMagics[s].attacks[BishopMagics[s].index(occupied)]; case ROOK : - return RookMagics[s].attacks[RookMagics[s].index(occupied)]; + return Magics[s][Pt - BISHOP].attacks_bb(occupied); case QUEEN : return attacks_bb(s, occupied) | attacks_bb(s, occupied); default : diff --git a/src/engine.cpp b/src/engine.cpp index e8da24aa..85c84099 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -18,15 +18,15 @@ #include "engine.h" +#include #include +#include #include #include +#include #include #include #include -#include -#include -#include #include "evaluate.h" #include "misc.h" @@ -44,16 +44,76 @@ namespace Stockfish { namespace NN = Eval::NNUE; -constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; +constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; +constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048; -Engine::Engine(std::string path) : - binaryDirectory(CommandLine::get_binary_directory(path)), +Engine::Engine(std::optional path) : + binaryDirectory(path ? 
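// [Editor's note: illustrative sketch, not part of the patch. After this
// change both sliders share one table, Magics[s][pt - BISHOP] (0 for BISHOP,
// 1 for ROOK, consistent with the 2-wide array above), and a lookup is
// attacks[index(occupied)]. Without PEXT, index() is the classic fancy-magic
// hash; the shape of that computation on made-up numbers:]
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical values, only to show the data flow of Magic::index()
    std::uint64_t occupied = 0x0000000010080420ULL;
    std::uint64_t mask     = 0x00003C3C3C000000ULL;  // 12 relevant bits
    std::uint64_t magic    = 0x0204000810204081ULL;  // made-up multiplier
    unsigned      shift    = 64 - 12;                // 64 - popcount(mask)

    unsigned index = unsigned(((occupied & mask) * magic) >> shift);
    std::printf("attacks table slot %u of %d\n", index, 1 << 12);
}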
CommandLine::get_binary_directory(*path) : ""), + numaContext(NumaConfig::from_system()), states(new std::deque(1)), - networks(NN::Networks( - NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), - NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { + threads(), + networks( + numaContext, + NN::Networks( + NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), + NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { pos.set(StartFEN, false, &states->back()); capSq = SQ_NONE; + + options["Debug Log File"] << Option("", [](const Option& o) { + start_logger(o); + return std::nullopt; + }); + + options["NumaPolicy"] << Option("auto", [this](const Option& o) { + set_numa_config_from_option(o); + return numa_config_information_as_string() + "\n" + + thread_allocation_information_as_string(); + }); + + options["Threads"] << Option(1, 1, 1024, [this](const Option&) { + resize_threads(); + return thread_allocation_information_as_string(); + }); + + options["Hash"] << Option(16, 1, MaxHashMB, [this](const Option& o) { + set_tt_size(o); + return std::nullopt; + }); + + options["Clear Hash"] << Option([this](const Option&) { + search_clear(); + return std::nullopt; + }); + options["Ponder"] << Option(false); + options["MultiPV"] << Option(1, 1, MAX_MOVES); + options["Skill Level"] << Option(20, 0, 20); + options["Move Overhead"] << Option(10, 0, 5000); + options["nodestime"] << Option(0, 0, 10000); + options["UCI_Chess960"] << Option(false); + options["UCI_LimitStrength"] << Option(false); + options["UCI_Elo"] << Option(Stockfish::Search::Skill::LowestElo, + Stockfish::Search::Skill::LowestElo, + Stockfish::Search::Skill::HighestElo); + options["UCI_ShowWDL"] << Option(false); + options["SyzygyPath"] << Option("", [](const Option& o) { + Tablebases::init(o); + return std::nullopt; + }); + options["SyzygyProbeDepth"] << Option(1, 1, 100); + options["Syzygy50MoveRule"] << Option(true); + options["SyzygyProbeLimit"] << Option(7, 0, 7); + options["EvalFile"] << Option(EvalFileDefaultNameBig, [this](const Option& o) { + load_big_network(o); + return std::nullopt; + }); + options["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, [this](const Option& o) { + load_small_network(o); + return std::nullopt; + }); + + load_networks(); + resize_threads(); } std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) { @@ -74,7 +134,7 @@ void Engine::stop() { threads.stop = true; } void Engine::search_clear() { wait_for_search_finished(); - tt.clear(options["Threads"]); + tt.clear(threads); threads.clear(); // @TODO wont work with multiple instances @@ -97,6 +157,10 @@ void Engine::set_on_bestmove(std::function&& f) { + onVerifyNetworks = std::move(f); +} + void Engine::wait_for_search_finished() { threads.main_thread()->wait_for_search_finished(); } void Engine::set_position(const std::string& fen, const std::vector& moves) { @@ -124,11 +188,42 @@ void Engine::set_position(const std::string& fen, const std::vector // modifiers -void Engine::resize_threads() { threads.set({options, threads, tt, networks}, updateContext); } +void Engine::set_numa_config_from_option(const std::string& o) { + if (o == "auto" || o == "system") + { + numaContext.set_numa_config(NumaConfig::from_system()); + } + else if (o == "hardware") + { + // Don't respect affinity set in the system. 
+ numaContext.set_numa_config(NumaConfig::from_system(false)); + } + else if (o == "none") + { + numaContext.set_numa_config(NumaConfig{}); + } + else + { + numaContext.set_numa_config(NumaConfig::from_string(o)); + } + + // Force reallocation of threads in case affinities need to change. + resize_threads(); + threads.ensure_network_replicated(); +} + +void Engine::resize_threads() { + threads.wait_for_search_finished(); + threads.set(numaContext.get_numa_config(), {options, threads, tt, networks}, updateContext); + + // Reallocate the hash with the new threadpool size + set_tt_size(options["Hash"]); + threads.ensure_network_replicated(); +} void Engine::set_tt_size(size_t mb) { wait_for_search_finished(); - tt.resize(mb, options["Threads"]); + tt.resize(mb, threads); } void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } @@ -136,28 +231,38 @@ void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; } // network related void Engine::verify_networks() const { - networks.big.verify(options["EvalFile"]); - networks.small.verify(options["EvalFileSmall"]); + networks->big.verify(options["EvalFile"], onVerifyNetworks); + networks->small.verify(options["EvalFileSmall"], onVerifyNetworks); } void Engine::load_networks() { - load_big_network(options["EvalFile"]); - load_small_network(options["EvalFileSmall"]); + networks.modify_and_replicate([this](NN::Networks& networks_) { + networks_.big.load(binaryDirectory, options["EvalFile"]); + networks_.small.load(binaryDirectory, options["EvalFileSmall"]); + }); + threads.clear(); + threads.ensure_network_replicated(); } void Engine::load_big_network(const std::string& file) { - networks.big.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); }); threads.clear(); + threads.ensure_network_replicated(); } void Engine::load_small_network(const std::string& file) { - networks.small.load(binaryDirectory, file); + networks.modify_and_replicate( + [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); }); threads.clear(); + threads.ensure_network_replicated(); } void Engine::save_network(const std::pair, std::string> files[2]) { - networks.big.save(files[0].first); - networks.small.save(files[1].first); + networks.modify_and_replicate([&files](NN::Networks& networks_) { + networks_.big.save(files[0].first); + networks_.small.save(files[1].first); + }); } // utility functions @@ -169,10 +274,11 @@ void Engine::trace_eval() const { verify_networks(); - sync_cout << "\n" << Eval::trace(p, networks) << sync_endl; + sync_cout << "\n" << Eval::trace(p, *networks) << sync_endl; } -OptionsMap& Engine::get_options() { return options; } +const OptionsMap& Engine::get_options() const { return options; } +OptionsMap& Engine::get_options() { return options; } std::string Engine::fen() const { return pos.fen(); } @@ -184,4 +290,63 @@ std::string Engine::visualize() const { return ss.str(); } +int Engine::get_hashfull(int maxAge) const { return tt.hashfull(maxAge); } + +std::vector> Engine::get_bound_thread_count_by_numa_node() const { + auto counts = threads.get_bound_thread_count_by_numa_node(); + const NumaConfig& cfg = numaContext.get_numa_config(); + std::vector> ratios; + NumaIndex n = 0; + for (; n < counts.size(); ++n) + ratios.emplace_back(counts[n], cfg.num_cpus_in_numa_node(n)); + if (!counts.empty()) + for (; n < cfg.num_numa_nodes(); ++n) + ratios.emplace_back(0, cfg.num_cpus_in_numa_node(n)); + return 
ratios; +} + +std::string Engine::get_numa_config_as_string() const { + return numaContext.get_numa_config().to_string(); +} + +std::string Engine::numa_config_information_as_string() const { + auto cfgStr = get_numa_config_as_string(); + return "Available processors: " + cfgStr; +} + +std::string Engine::thread_binding_information_as_string() const { + auto boundThreadsByNode = get_bound_thread_count_by_numa_node(); + std::stringstream ss; + if (boundThreadsByNode.empty()) + return ss.str(); + + bool isFirst = true; + + for (auto&& [current, total] : boundThreadsByNode) + { + if (!isFirst) + ss << ":"; + ss << current << "/" << total; + isFirst = false; + } + + return ss.str(); +} + +std::string Engine::thread_allocation_information_as_string() const { + std::stringstream ss; + + size_t threadsSize = threads.size(); + ss << "Using " << threadsSize << (threadsSize > 1 ? " threads" : " thread"); + + auto boundThreadsByNodeStr = thread_binding_information_as_string(); + if (boundThreadsByNodeStr.empty()) + return ss.str(); + + ss << " with NUMA node thread binding: "; + ss << boundThreadsByNodeStr; + + return ss.str(); +} + } diff --git a/src/engine.h b/src/engine.h index 64a814cb..25782693 100644 --- a/src/engine.h +++ b/src/engine.h @@ -29,6 +29,7 @@ #include #include "nnue/network.h" +#include "numa.h" #include "position.h" #include "search.h" #include "syzygy/tbprobe.h" // for Stockfish::Depth @@ -46,7 +47,14 @@ class Engine { using InfoFull = Search::InfoFull; using InfoIter = Search::InfoIteration; - Engine(std::string path = ""); + Engine(std::optional path = std::nullopt); + + // Cannot be movable due to components holding backreferences to fields + Engine(const Engine&) = delete; + Engine(Engine&&) = delete; + Engine& operator=(const Engine&) = delete; + Engine& operator=(Engine&&) = delete; + ~Engine() { wait_for_search_finished(); } std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960); @@ -63,6 +71,7 @@ class Engine { // modifiers + void set_numa_config_from_option(const std::string& o); void resize_threads(); void set_tt_size(size_t mb); void set_ponderhit(bool); @@ -72,6 +81,7 @@ class Engine { void set_on_update_full(std::function&&); void set_on_iter(std::function&&); void set_on_bestmove(std::function&&); + void set_on_verify_networks(std::function&&); // network related @@ -83,25 +93,38 @@ class Engine { // utility functions - void trace_eval() const; - OptionsMap& get_options(); - std::string fen() const; - void flip(); - std::string visualize() const; + void trace_eval() const; + + const OptionsMap& get_options() const; + OptionsMap& get_options(); + + int get_hashfull(int maxAge = 0) const; + + std::string fen() const; + void flip(); + std::string visualize() const; + std::vector> get_bound_thread_count_by_numa_node() const; + std::string get_numa_config_as_string() const; + std::string numa_config_information_as_string() const; + std::string thread_allocation_information_as_string() const; + std::string thread_binding_information_as_string() const; private: const std::string binaryDirectory; + NumaReplicationContext numaContext; + Position pos; StateListPtr states; Square capSq; - OptionsMap options; - ThreadPool threads; - TranspositionTable tt; - Eval::NNUE::Networks networks; + OptionsMap options; + ThreadPool threads; + TranspositionTable tt; + LazyNumaReplicated networks; - Search::SearchManager::UpdateContext updateContext; + Search::SearchManager::UpdateContext updateContext; + std::function onVerifyNetworks; }; } // namespace Stockfish diff 
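// [Editor's note: illustrative sketch, not part of the patch. The two helper
// strings above come out as "Using 16 threads with NUMA node thread binding:
// 8/32:8/32", i.e. one "boundThreads/cpusInNode" entry per NUMA node joined
// by ':'. A standalone reconstruction with hypothetical numbers:]
#include <cstddef>
#include <cstdio>
#include <sstream>
#include <utility>
#include <vector>

int main() {
    // hypothetical: 16 threads bound 8+8 across two 32-CPU NUMA nodes
    std::vector<std::pair<std::size_t, std::size_t>> boundByNode = {{8, 32}, {8, 32}};
    std::size_t threads = 16;

    std::stringstream ss;
    ss << "Using " << threads << (threads > 1 ? " threads" : " thread");
    if (!boundByNode.empty())
    {
        ss << " with NUMA node thread binding: ";
        bool first = true;
        for (auto&& [bound, total] : boundByNode)
        {
            if (!first)
                ss << ":";
            ss << bound << "/" << total;
            first = false;
        }
    }
    std::printf("%s\n", ss.str().c_str());
}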
--git a/src/evaluate.cpp b/src/evaluate.cpp index dd0480fa..5e580da5 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -24,8 +24,9 @@ #include #include #include -#include #include +#include +#include #include "nnue/network.h" #include "nnue/nnue_misc.h" @@ -44,6 +45,10 @@ int Eval::simple_eval(const Position& pos, Color c) { + (pos.non_pawn_material(c) - pos.non_pawn_material(~c)); } +bool Eval::use_smallnet(const Position& pos) { + int simpleEval = simple_eval(pos, pos.side_to_move()); + return std::abs(simpleEval) > 962; +} // Evaluate is the evaluator for the outer world. It returns a static evaluation // of the position from the point of view of the side to move. @@ -54,34 +59,30 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, assert(!pos.checkers()); - int simpleEval = simple_eval(pos, pos.side_to_move()); - bool smallNet = std::abs(simpleEval) > SmallNetThreshold; - int nnueComplexity; - int v; + bool smallNet = use_smallnet(pos); + auto [psqt, positional] = smallNet ? networks.small.evaluate(pos, &caches.small) + : networks.big.evaluate(pos, &caches.big); - Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity) - : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity); + Value nnue = (125 * psqt + 131 * positional) / 128; - const auto adjustEval = [&](int nnueDiv, int pawnCountConstant, int pawnCountMul, - int npmConstant, int evalDiv, int shufflingConstant) { - // Blend optimism and eval with nnue complexity and material imbalance - optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / 584; - nnue -= nnue * (nnueComplexity * 5 / 3) / nnueDiv; + // Re-evaluate the position when higher eval accuracy is worth the time spent + if (smallNet && (std::abs(nnue) < 236)) + { + std::tie(psqt, positional) = networks.big.evaluate(pos, &caches.big); + nnue = (125 * psqt + 131 * positional) / 128; + smallNet = false; + } - int npm = pos.non_pawn_material() / 64; - v = (nnue * (npm + pawnCountConstant + pawnCountMul * pos.count()) - + optimism * (npmConstant + npm)) - / evalDiv; + // Blend optimism and eval with nnue complexity + int nnueComplexity = std::abs(psqt - positional); + optimism += optimism * nnueComplexity / 468; + nnue -= nnue * nnueComplexity / (smallNet ? 20233 : 17879); - // Damp down the evaluation linearly when shuffling - int shuffling = pos.rule50_count(); - v = v * (shufflingConstant - shuffling) / 207; - }; + int material = (smallNet ? 553 : 532) * pos.count() + pos.non_pawn_material(); + int v = (nnue * (77777 + material) + optimism * (7777 + material)) / 77777; - if (!smallNet) - adjustEval(32395, 942, 11, 139, 1058, 178); - else - adjustEval(32793, 944, 9, 140, 1067, 206); + // Damp down the evaluation linearly when shuffling + v -= v * pos.rule50_count() / 212; // Guarantee evaluation does not hit the tablebase range v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); @@ -108,8 +109,9 @@ std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) { ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); - Value v = networks.big.evaluate(pos, &caches->big, false); - v = pos.side_to_move() == WHITE ? v : -v; + auto [psqt, positional] = networks.big.evaluate(pos, &caches->big); + Value v = psqt + positional; + v = pos.side_to_move() == WHITE ? 
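// [Editor's note: illustrative sketch, not part of the patch. The rewritten
// Eval::evaluate() above blends the network's psqt/positional outputs, the
// optimism term, material and the 50-move counter with plain integer
// arithmetic. The same formula re-run on hypothetical inputs (constants
// copied from the hunk above):]
#include <cstdio>
#include <cstdlib>

int main() {
    // hypothetical network outputs / search state, in internal value units
    int  psqt = 150, positional = 90, optimism = 40;
    bool smallNet        = false;
    int  pawnCount       = 12;
    int  nonPawnMaterial = 5000;
    int  rule50          = 8;

    int nnue = (125 * psqt + 131 * positional) / 128;

    int nnueComplexity = std::abs(psqt - positional);
    optimism += optimism * nnueComplexity / 468;
    nnue -= nnue * nnueComplexity / (smallNet ? 20233 : 17879);

    int material = (smallNet ? 553 : 532) * pawnCount + nonPawnMaterial;
    int v        = (nnue * (77777 + material) + optimism * (7777 + material)) / 77777;

    v -= v * rule50 / 212;  // damp down linearly when shuffling

    std::printf("blended eval: %d\n", v);
}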
v : -v; ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n"; v = evaluate(networks, pos, *caches, VALUE_ZERO); diff --git a/src/evaluate.h b/src/evaluate.h index 2d244ff6..4604321d 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -29,14 +29,12 @@ class Position; namespace Eval { -constexpr inline int SmallNetThreshold = 1274; - // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro or the location where this macro is defined, as it is used // in the Makefile/Fishtest. -#define EvalFileDefaultNameBig "nn-ae6a388e4a1a.nnue" -#define EvalFileDefaultNameSmall "nn-baff1ede1f90.nnue" +#define EvalFileDefaultNameBig "nn-1c0000000000.nnue" +#define EvalFileDefaultNameSmall "nn-37f18f62d772.nnue" namespace NNUE { struct Networks; @@ -46,6 +44,7 @@ struct AccumulatorCaches; std::string trace(Position& pos, const Eval::NNUE::Networks& networks); int simple_eval(const Position& pos, Color c); +bool use_smallnet(const Position& pos); Value evaluate(const NNUE::Networks& networks, const Position& pos, Eval::NNUE::AccumulatorCaches& caches, diff --git a/src/history.h b/src/history.h new file mode 100644 index 00000000..8d14a7a7 --- /dev/null +++ b/src/history.h @@ -0,0 +1,185 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef HISTORY_H_INCLUDED +#define HISTORY_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: keep + +#include "position.h" + +namespace Stockfish { + +constexpr int PAWN_HISTORY_SIZE = 512; // has to be a power of 2 +constexpr int CORRECTION_HISTORY_SIZE = 32768; // has to be a power of 2 +constexpr int CORRECTION_HISTORY_LIMIT = 1024; +constexpr int LOW_PLY_HISTORY_SIZE = 4; + +static_assert((PAWN_HISTORY_SIZE & (PAWN_HISTORY_SIZE - 1)) == 0, + "PAWN_HISTORY_SIZE has to be a power of 2"); + +static_assert((CORRECTION_HISTORY_SIZE & (CORRECTION_HISTORY_SIZE - 1)) == 0, + "CORRECTION_HISTORY_SIZE has to be a power of 2"); + +enum PawnHistoryType { + Normal, + Correction +}; + +template +inline int pawn_structure_index(const Position& pos) { + return pos.pawn_key() & ((T == Normal ? PAWN_HISTORY_SIZE : CORRECTION_HISTORY_SIZE) - 1); +} + +inline int major_piece_index(const Position& pos) { + return pos.major_piece_key() & (CORRECTION_HISTORY_SIZE - 1); +} + +inline int minor_piece_index(const Position& pos) { + return pos.minor_piece_key() & (CORRECTION_HISTORY_SIZE - 1); +} + +template +inline int non_pawn_index(const Position& pos) { + return pos.non_pawn_key(c) & (CORRECTION_HISTORY_SIZE - 1); +} + +// StatsEntry stores the stat table value. It is usually a number but could +// be a move or even a nested history. 
We use a class instead of a naked value +// to directly call history update operator<<() on the entry so to use stats +// tables at caller sites as simple multi-dim arrays. +template +class StatsEntry { + + T entry; + + public: + void operator=(const T& v) { entry = v; } + T* operator&() { return &entry; } + T* operator->() { return &entry; } + operator const T&() const { return entry; } + + void operator<<(int bonus) { + static_assert(D <= std::numeric_limits::max(), "D overflows T"); + + // Make sure that bonus is in range [-D, D] + int clampedBonus = std::clamp(bonus, -D, D); + entry += clampedBonus - entry * std::abs(clampedBonus) / D; + + assert(std::abs(entry) <= D); + } +}; + +// Stats is a generic N-dimensional array used to store various statistics. +// The first template parameter T is the base type of the array, and the second +// template parameter D limits the range of updates in [-D, D] when we update +// values with the << operator, while the last parameters (Size and Sizes) +// encode the dimensions of the array. +template +struct Stats: public std::array, Size> { + using stats = Stats; + + void fill(const T& v) { + + // For standard-layout 'this' points to the first struct member + assert(std::is_standard_layout_v); + + using entry = StatsEntry; + entry* p = reinterpret_cast(this); + std::fill(p, p + sizeof(*this) / sizeof(entry), v); + } +}; + +template +struct Stats: public std::array, Size> {}; + +// In stats table, D=0 means that the template parameter is not used +enum StatsParams { + NOT_USED = 0 +}; +enum StatsType { + NoCaptures, + Captures +}; + +// ButterflyHistory records how often quiet moves have been successful or unsuccessful +// during the current search, and is used for reduction and move ordering decisions. +// It uses 2 tables (one for each color) indexed by the move's from and to squares, +// see https://www.chessprogramming.org/Butterfly_Boards (~11 elo) +using ButterflyHistory = Stats; + +// LowPlyHistory is adressed by play and move's from and to squares, used +// to improve move ordering near the root +using LowPlyHistory = Stats; + +// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type] +using CapturePieceToHistory = Stats; + +// PieceToHistory is like ButterflyHistory but is addressed by a move's [piece][to] +using PieceToHistory = Stats; + +// ContinuationHistory is the combined history of a given pair of moves, usually +// the current one given a previous one. The nested history table is based on +// PieceToHistory instead of ButterflyBoards. +// (~63 elo) +using ContinuationHistory = Stats; + +// PawnHistory is addressed by the pawn structure and a move's [piece][to] +using PawnHistory = Stats; + +// Correction histories record differences between the static evaluation of +// positions and their search score. It is used to improve the static evaluation +// used by some search heuristics. 
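// [Editor's note: illustrative sketch, not part of the patch. The
// StatsEntry::operator<<() above is a saturating update: with
// entry += bonus - entry * |bonus| / D the stored value drifts towards the
// sign of the applied bonuses and can never leave [-D, D]. A tiny standalone
// check of that property, with a hypothetical limit D:]
#include <algorithm>
#include <cstdio>
#include <cstdlib>

int main() {
    const int D     = 8000;  // hypothetical update limit
    int       entry = 0;

    auto update = [&](int bonus) {
        int clamped = std::clamp(bonus, -D, D);
        entry += clamped - entry * std::abs(clamped) / D;
    };

    for (int i = 0; i < 100; ++i)
        update(2000);  // repeated positive bonuses saturate below D

    std::printf("entry after 100 updates: %d (limit %d)\n", entry, D);
}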
+// see https://www.chessprogramming.org/Static_Evaluation_Correction_History +enum CorrHistType { + Pawn, // By color and pawn structure + Major, // By color and positions of major pieces (Queen, Rook) and King + Minor, // By color and positions of minor pieces (Knight, Bishop) and King + NonPawn, // By color and non-pawn material positions + PieceTo, // By [piece][to] move + Continuation, // Combined history of move pairs +}; + +template +struct CorrHistTypedef { + using type = Stats; +}; + +template<> +struct CorrHistTypedef { + using type = Stats; +}; + +template<> +struct CorrHistTypedef { + using type = Stats::type, NOT_USED, PIECE_NB, SQUARE_NB>; +}; + +template +using CorrectionHistory = typename CorrHistTypedef::type; + +} // namespace Stockfish + +#endif // #ifndef HISTORY_H_INCLUDED diff --git a/src/memory.cpp b/src/memory.cpp new file mode 100644 index 00000000..47c901b4 --- /dev/null +++ b/src/memory.cpp @@ -0,0 +1,268 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "memory.h" + +#include + +#if __has_include("features.h") + #include +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + #include +#endif + +#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \ + || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \ + || defined(__e2k__) + #define POSIXALIGNEDALLOC + #include +#endif + +#ifdef _WIN32 + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + + #ifndef NOMINMAX + #define NOMINMAX + #endif + + #include // std::hex, std::dec + #include // std::cerr + #include // std::endl + #include + +// The needed Windows API for processor groups could be missed from old Windows +// versions, so instead of calling them directly (forcing the linker to resolve +// the calls at compile time), try to load them at runtime. To do this we need +// first to define the corresponding function pointers. + +extern "C" { +using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE); +using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID); +using AdjustTokenPrivileges_t = + bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD); +} +#endif + + +namespace Stockfish { + +// Wrappers for systems where the c++17 implementation does not guarantee the +// availability of aligned_alloc(). Memory allocated with std_aligned_alloc() +// must be freed with std_aligned_free(). 
+ +void* std_aligned_alloc(size_t alignment, size_t size) { +#if defined(_ISOC11_SOURCE) + return aligned_alloc(alignment, size); +#elif defined(POSIXALIGNEDALLOC) + void* mem = nullptr; + posix_memalign(&mem, alignment, size); + return mem; +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + return _mm_malloc(size, alignment); +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + return std::aligned_alloc(alignment, size); +#endif +} + +void std_aligned_free(void* ptr) { + +#if defined(POSIXALIGNEDALLOC) + free(ptr); +#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) + _mm_free(ptr); +#elif defined(_WIN32) + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +// aligned_large_pages_alloc() will return suitably aligned memory, +// if possible using large pages. + +#if defined(_WIN32) + +static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) { + + #if !defined(_WIN64) + return nullptr; + #else + + HANDLE hProcessToken{}; + LUID luid{}; + void* mem = nullptr; + + const size_t largePageSize = GetLargePageMinimum(); + if (!largePageSize) + return nullptr; + + // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges + + HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll")); + + if (!hAdvapi32) + hAdvapi32 = LoadLibrary(TEXT("advapi32.dll")); + + auto OpenProcessToken_f = + OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken")); + if (!OpenProcessToken_f) + return nullptr; + auto LookupPrivilegeValueA_f = + LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA")); + if (!LookupPrivilegeValueA_f) + return nullptr; + auto AdjustTokenPrivileges_f = + AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges")); + if (!AdjustTokenPrivileges_f) + return nullptr; + + // We need SeLockMemoryPrivilege, so try to enable it for the process + + if (!OpenProcessToken_f( // OpenProcessToken() + GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) + return nullptr; + + if (LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid)) + { + TOKEN_PRIVILEGES tp{}; + TOKEN_PRIVILEGES prevTp{}; + DWORD prevTpLen = 0; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Luid = luid; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() + // succeeds, we still need to query GetLastError() to ensure that the privileges + // were actually obtained. 
+ + if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, + &prevTpLen) + && GetLastError() == ERROR_SUCCESS) + { + // Round up size to full pages and allocate + allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); + mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, + PAGE_READWRITE); + + // Privilege no longer needed, restore previous state + AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr); + } + } + + CloseHandle(hProcessToken); + + return mem; + + #endif +} + +void* aligned_large_pages_alloc(size_t allocSize) { + + // Try to allocate large pages + void* mem = aligned_large_pages_alloc_windows(allocSize); + + // Fall back to regular, page-aligned, allocation if necessary + if (!mem) + mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + + return mem; +} + +#else + +void* aligned_large_pages_alloc(size_t allocSize) { + + #if defined(__linux__) + constexpr size_t alignment = 2 * 1024 * 1024; // 2MB page size assumed + #else + constexpr size_t alignment = 4096; // small page size assumed + #endif + + // Round up to multiples of alignment + size_t size = ((allocSize + alignment - 1) / alignment) * alignment; + void* mem = std_aligned_alloc(alignment, size); + #if defined(MADV_HUGEPAGE) + madvise(mem, size, MADV_HUGEPAGE); + #endif + return mem; +} + +#endif + +bool has_large_pages() { + +#if defined(_WIN32) + + constexpr size_t page_size = 2 * 1024 * 1024; // 2MB page size assumed + void* mem = aligned_large_pages_alloc_windows(page_size); + if (mem == nullptr) + { + return false; + } + else + { + aligned_large_pages_free(mem); + return true; + } + +#elif defined(__linux__) + + #if defined(MADV_HUGEPAGE) + return true; + #else + return false; + #endif + +#else + + return false; + +#endif +} + + +// aligned_large_pages_free() will free the previously memory allocated +// by aligned_large_pages_alloc(). The effect is a nop if mem == nullptr. + +#if defined(_WIN32) + +void aligned_large_pages_free(void* mem) { + + if (mem && !VirtualFree(mem, 0, MEM_RELEASE)) + { + DWORD err = GetLastError(); + std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err + << std::dec << std::endl; + exit(EXIT_FAILURE); + } +} + +#else + +void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } + +#endif +} // namespace Stockfish diff --git a/src/memory.h b/src/memory.h new file mode 100644 index 00000000..eaf0261a --- /dev/null +++ b/src/memory.h @@ -0,0 +1,218 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
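// [Editor's note: illustrative sketch, not part of the patch. Typical use of
// the allocator pair above: memory from aligned_large_pages_alloc() is at
// least page-aligned (2MB-rounded and madvise'd MADV_HUGEPAGE on Linux when
// available, VirtualAlloc with MEM_LARGE_PAGES on Windows when permitted) and
// must be released with aligned_large_pages_free(). A hypothetical caller,
// assuming it is built alongside the new src/memory.h below:]
#include <cstddef>
#include <cstdio>
#include <cstring>

#include "memory.h"

int main() {
    const std::size_t bytes = 64 * 1024 * 1024;  // e.g. a 64 MB hash table
    void* mem = Stockfish::aligned_large_pages_alloc(bytes);
    if (!mem)
        return 1;

    std::memset(mem, 0, bytes);  // touch the pages
    std::printf("large pages available: %s\n", Stockfish::has_large_pages() ? "yes" : "no");

    Stockfish::aligned_large_pages_free(mem);
}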
+*/ + +#ifndef MEMORY_H_INCLUDED +#define MEMORY_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +namespace Stockfish { + +void* std_aligned_alloc(size_t alignment, size_t size); +void std_aligned_free(void* ptr); + +// Memory aligned by page size, min alignment: 4096 bytes +void* aligned_large_pages_alloc(size_t size); +void aligned_large_pages_free(void* mem); + +bool has_large_pages(); + +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. +template +void memory_deleter(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + // Explicitly needed to call the destructor + if constexpr (!std::is_trivially_destructible_v) + ptr->~T(); + + free_func(ptr); + return; +} + +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. +template +void memory_deleter_array(T* ptr, FREE_FUNC free_func) { + if (!ptr) + return; + + + // Move back on the pointer to where the size is allocated + const size_t array_offset = std::max(sizeof(size_t), alignof(T)); + char* raw_memory = reinterpret_cast(ptr) - array_offset; + + if constexpr (!std::is_trivially_destructible_v) + { + const size_t size = *reinterpret_cast(raw_memory); + + // Explicitly call the destructor for each element in reverse order + for (size_t i = size; i-- > 0;) + ptr[i].~T(); + } + + free_func(raw_memory); +} + +// Allocates memory for a single object and places it there with placement new +template +inline std::enable_if_t, T*> memory_allocator(ALLOC_FUNC alloc_func, + Args&&... args) { + void* raw_memory = alloc_func(sizeof(T)); + ASSERT_ALIGNED(raw_memory, alignof(T)); + return new (raw_memory) T(std::forward(args)...); +} + +// Allocates memory for an array of unknown bound and places it there with placement new +template +inline std::enable_if_t, std::remove_extent_t*> +memory_allocator(ALLOC_FUNC alloc_func, size_t num) { + using ElementType = std::remove_extent_t; + + const size_t array_offset = std::max(sizeof(size_t), alignof(ElementType)); + + // Save the array size in the memory location + char* raw_memory = + reinterpret_cast(alloc_func(array_offset + num * sizeof(ElementType))); + ASSERT_ALIGNED(raw_memory, alignof(T)); + + new (raw_memory) size_t(num); + + for (size_t i = 0; i < num; ++i) + new (raw_memory + array_offset + i * sizeof(ElementType)) ElementType(); + + // Need to return the pointer at the start of the array so that + // the indexing in unique_ptr works. + return reinterpret_cast(raw_memory + array_offset); +} + +// +// +// aligned large page unique ptr +// +// + +template +struct LargePageDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, aligned_large_pages_free); } +}; + +template +struct LargePageArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, aligned_large_pages_free); } +}; + +template +using LargePagePtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_large_page for single objects +template +std::enable_if_t, LargePagePtr> make_unique_large_page(Args&&... 
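// [Editor's note: illustrative sketch, not part of the patch. The
// memory_allocator()/memory_deleter_array() pair above hand-rolls arrays of
// unknown bound: the element count is stored in front of the array, at an
// offset of max(sizeof(size_t), alignof(T)) bytes, so the deleter can walk
// back, read the count and destroy elements in reverse. The same idea in
// miniature with plain malloc/free and a hypothetical Widget type:]
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <new>

struct Widget {
    int id = 0;
    ~Widget() { std::printf("~Widget %d\n", id); }
};

int main() {
    const std::size_t num    = 3;
    const std::size_t offset = std::max(sizeof(std::size_t), alignof(Widget));

    char* raw = static_cast<char*>(std::malloc(offset + num * sizeof(Widget)));
    new (raw) std::size_t(num);  // remember the bound in front of the array
    Widget* arr = reinterpret_cast<Widget*>(raw + offset);
    for (std::size_t i = 0; i < num; ++i)
        new (arr + i) Widget{int(i)};

    // the "deleter": recover the count, destroy back-to-front, free the block
    std::size_t size = *reinterpret_cast<std::size_t*>(raw);
    for (std::size_t i = size; i-- > 0;)
        arr[i].~Widget();
    std::free(raw);
}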
args) { + static_assert(alignof(T) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + T* obj = memory_allocator(aligned_large_pages_alloc, std::forward(args)...); + + return LargePagePtr(obj); +} + +// make_unique_large_page for arrays of unknown bound +template +std::enable_if_t, LargePagePtr> make_unique_large_page(size_t num) { + using ElementType = std::remove_extent_t; + + static_assert(alignof(ElementType) <= 4096, + "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); + + ElementType* memory = memory_allocator(aligned_large_pages_alloc, num); + + return LargePagePtr(memory); +} + +// +// +// aligned unique ptr +// +// + +template +struct AlignedDeleter { + void operator()(T* ptr) const { return memory_deleter(ptr, std_aligned_free); } +}; + +template +struct AlignedArrayDeleter { + void operator()(T* ptr) const { return memory_deleter_array(ptr, std_aligned_free); } +}; + +template +using AlignedPtr = + std::conditional_t, + std::unique_ptr>>, + std::unique_ptr>>; + +// make_unique_aligned for single objects +template +std::enable_if_t, AlignedPtr> make_unique_aligned(Args&&... args) { + const auto func = [](size_t size) { return std_aligned_alloc(alignof(T), size); }; + T* obj = memory_allocator(func, std::forward(args)...); + + return AlignedPtr(obj); +} + +// make_unique_aligned for arrays of unknown bound +template +std::enable_if_t, AlignedPtr> make_unique_aligned(size_t num) { + using ElementType = std::remove_extent_t; + + const auto func = [](size_t size) { return std_aligned_alloc(alignof(ElementType), size); }; + ElementType* memory = memory_allocator(func, num); + + return AlignedPtr(memory); +} + + +// Get the first aligned element of an array. +// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, +// where N is the number of elements in the array. +template +T* align_ptr_up(T* ptr) { + static_assert(alignof(T) < Alignment); + + const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); + return reinterpret_cast( + reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); +} + + +} // namespace Stockfish + +#endif // #ifndef MEMORY_H_INCLUDED diff --git a/src/misc.cpp b/src/misc.cpp index 1abb81b1..10c86b7a 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -18,58 +18,21 @@ #include "misc.h" -#ifdef _WIN32 - #if _WIN32_WINNT < 0x0601 - #undef _WIN32_WINNT - #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes - #endif - - #ifndef NOMINMAX - #define NOMINMAX - #endif - - #include -// The needed Windows API for processor groups could be missed from old Windows -// versions, so instead of calling them directly (forcing the linker to resolve -// the calls at compile time), try to load them at runtime. To do this we need -// first to define the corresponding function pointers. 
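// [Editor's note: illustrative sketch, not part of the patch. align_ptr_up()
// above simply rounds an address up to the next multiple of Alignment with
// (p + A - 1) / A * A. The same rounding on plain integer addresses, for a
// hypothetical 64-byte alignment:]
#include <cstdint>
#include <cstdio>

int main() {
    const std::uintptr_t A    = 64;
    const std::uintptr_t ps[] = {1000, 1024, 1025};

    for (std::uintptr_t p : ps)
        std::printf("%llu -> %llu\n", (unsigned long long) p,
                    (unsigned long long) ((p + A - 1) / A * A));
    // prints: 1000 -> 1024, 1024 -> 1024, 1025 -> 1088
}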
-extern "C" { -using fun1_t = bool (*)(LOGICAL_PROCESSOR_RELATIONSHIP, - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, - PDWORD); -using fun2_t = bool (*)(USHORT, PGROUP_AFFINITY); -using fun3_t = bool (*)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); -using fun4_t = bool (*)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT); -using fun5_t = WORD (*)(); -using fun6_t = bool (*)(HANDLE, DWORD, PHANDLE); -using fun7_t = bool (*)(LPCSTR, LPCSTR, PLUID); -using fun8_t = bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD); -} -#endif - #include +#include #include #include #include #include #include +#include +#include #include #include #include #include "types.h" -#if defined(__linux__) && !defined(__ANDROID__) - #include -#endif - -#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \ - || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \ - || defined(__e2k__) - #define POSIXALIGNEDALLOC - #include -#endif - namespace Stockfish { namespace { @@ -149,15 +112,17 @@ class Logger { // Returns the full name of the current Stockfish version. -// For local dev compiles we try to append the commit sha and commit date -// from git if that fails only the local compilation date is set and "nogit" is specified: -// Stockfish dev-YYYYMMDD-SHA -// or -// Stockfish dev-YYYYMMDD-nogit +// +// For local dev compiles we try to append the commit SHA and +// commit date from git. If that fails only the local compilation +// date is set and "nogit" is specified: +// Stockfish dev-YYYYMMDD-SHA +// or +// Stockfish dev-YYYYMMDD-nogit // // For releases (non-dev builds) we only include the version number: -// Stockfish version -std::string engine_info(bool to_uci) { +// Stockfish version +std::string engine_version_info() { std::stringstream ss; ss << "Stockfish " << version << std::setfill('0'); @@ -168,8 +133,9 @@ std::string engine_info(bool to_uci) { ss << stringify(GIT_DATE); #else constexpr std::string_view months("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec"); - std::string month, day, year; - std::stringstream date(__DATE__); // From compiler, format is "Sep 21 2008" + + std::string month, day, year; + std::stringstream date(__DATE__); // From compiler, format is "Sep 21 2008" date >> month >> day >> year; ss << year << std::setw(2) << std::setfill('0') << (1 + months.find(month) / 4) @@ -185,11 +151,14 @@ std::string engine_info(bool to_uci) { #endif } - ss << (to_uci ? "\nid author " : " by ") << "the Stockfish developers (see AUTHORS file)"; - return ss.str(); } +std::string engine_info(bool to_uci) { + return engine_version_info() + (to_uci ? 
"\nid author " : " by ") + + "the Stockfish developers (see AUTHORS file)"; +} + // Returns a string trying to describe the compiler we use std::string compiler_info() { @@ -318,13 +287,21 @@ template struct DebugInfo { std::atomic data[N] = {0}; - constexpr inline std::atomic& operator[](int index) { return data[index]; } + constexpr std::atomic& operator[](int index) { return data[index]; } }; -DebugInfo<2> hit[MaxDebugSlots]; -DebugInfo<2> mean[MaxDebugSlots]; -DebugInfo<3> stdev[MaxDebugSlots]; -DebugInfo<6> correl[MaxDebugSlots]; +struct DebugExtremes: public DebugInfo<3> { + DebugExtremes() { + data[1] = std::numeric_limits::min(); + data[2] = std::numeric_limits::max(); + } +}; + +DebugInfo<2> hit[MaxDebugSlots]; +DebugInfo<2> mean[MaxDebugSlots]; +DebugInfo<3> stdev[MaxDebugSlots]; +DebugInfo<6> correl[MaxDebugSlots]; +DebugExtremes extremes[MaxDebugSlots]; } // namespace @@ -348,6 +325,18 @@ void dbg_stdev_of(int64_t value, int slot) { stdev[slot][2] += value * value; } +void dbg_extremes_of(int64_t value, int slot) { + ++extremes[slot][0]; + + int64_t current_max = extremes[slot][1].load(); + while (current_max < value && !extremes[slot][1].compare_exchange_weak(current_max, value)) + {} + + int64_t current_min = extremes[slot][2].load(); + while (current_min > value && !extremes[slot][2].compare_exchange_weak(current_min, value)) + {} +} + void dbg_correl_of(int64_t value1, int64_t value2, int slot) { ++correl[slot][0]; @@ -382,6 +371,13 @@ void dbg_print() { std::cerr << "Stdev #" << i << ": Total " << n << " Stdev " << r << std::endl; } + for (int i = 0; i < MaxDebugSlots; ++i) + if ((n = extremes[i][0])) + { + std::cerr << "Extremity #" << i << ": Total " << n << " Min " << extremes[i][2] + << " Max " << extremes[i][1] << std::endl; + } + for (int i = 0; i < MaxDebugSlots; ++i) if ((n = correl[i][0])) { @@ -408,6 +404,8 @@ std::ostream& operator<<(std::ostream& os, SyncCout sc) { return os; } +void sync_cout_start() { std::cout << IO_LOCK; } +void sync_cout_end() { std::cout << IO_UNLOCK; } // Trampoline helper to avoid moving Logger to misc.h void start_logger(const std::string& fname) { Logger::start(fname); } @@ -415,14 +413,14 @@ void start_logger(const std::string& fname) { Logger::start(fname); } #ifdef NO_PREFETCH -void prefetch(void*) {} +void prefetch(const void*) {} #else -void prefetch(void* addr) { +void prefetch(const void* addr) { #if defined(_MSC_VER) - _mm_prefetch((char*) addr, _MM_HINT_T0); + _mm_prefetch((char const*) addr, _MM_HINT_T0); #else __builtin_prefetch(addr); #endif @@ -430,291 +428,6 @@ void prefetch(void* addr) { #endif - -// Wrapper for systems where the c++17 implementation -// does not guarantee the availability of aligned_alloc(). Memory allocated with -// std_aligned_alloc() must be freed with std_aligned_free(). -void* std_aligned_alloc(size_t alignment, size_t size) { - -#if defined(POSIXALIGNEDALLOC) - void* mem; - return posix_memalign(&mem, alignment, size) ? 
nullptr : mem; -#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) - return _mm_malloc(size, alignment); -#elif defined(_WIN32) - return _aligned_malloc(size, alignment); -#else - return std::aligned_alloc(alignment, size); -#endif -} - -void std_aligned_free(void* ptr) { - -#if defined(POSIXALIGNEDALLOC) - free(ptr); -#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64) - _mm_free(ptr); -#elif defined(_WIN32) - _aligned_free(ptr); -#else - free(ptr); -#endif -} - -// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages. - -#if defined(_WIN32) - -static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) { - - #if !defined(_WIN64) - return nullptr; - #else - - HANDLE hProcessToken{}; - LUID luid{}; - void* mem = nullptr; - - const size_t largePageSize = GetLargePageMinimum(); - if (!largePageSize) - return nullptr; - - // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges - - HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll")); - - if (!hAdvapi32) - hAdvapi32 = LoadLibrary(TEXT("advapi32.dll")); - - auto fun6 = fun6_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken")); - if (!fun6) - return nullptr; - auto fun7 = fun7_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA")); - if (!fun7) - return nullptr; - auto fun8 = fun8_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges")); - if (!fun8) - return nullptr; - - // We need SeLockMemoryPrivilege, so try to enable it for the process - if (!fun6( // OpenProcessToken() - GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) - return nullptr; - - if (fun7( // LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, &luid) - nullptr, "SeLockMemoryPrivilege", &luid)) - { - TOKEN_PRIVILEGES tp{}; - TOKEN_PRIVILEGES prevTp{}; - DWORD prevTpLen = 0; - - tp.PrivilegeCount = 1; - tp.Privileges[0].Luid = luid; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - - // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds, - // we still need to query GetLastError() to ensure that the privileges were actually obtained. 
- if (fun8( // AdjustTokenPrivileges() - hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) - && GetLastError() == ERROR_SUCCESS) - { - // Round up size to full pages and allocate - allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); - mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, - PAGE_READWRITE); - - // Privilege no longer needed, restore previous state - fun8( // AdjustTokenPrivileges () - hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr); - } - } - - CloseHandle(hProcessToken); - - return mem; - - #endif -} - -void* aligned_large_pages_alloc(size_t allocSize) { - - // Try to allocate large pages - void* mem = aligned_large_pages_alloc_windows(allocSize); - - // Fall back to regular, page-aligned, allocation if necessary - if (!mem) - mem = VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); - - return mem; -} - -#else - -void* aligned_large_pages_alloc(size_t allocSize) { - - #if defined(__linux__) - constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size - #else - constexpr size_t alignment = 4096; // assumed small page size - #endif - - // Round up to multiples of alignment - size_t size = ((allocSize + alignment - 1) / alignment) * alignment; - void* mem = std_aligned_alloc(alignment, size); - #if defined(MADV_HUGEPAGE) - madvise(mem, size, MADV_HUGEPAGE); - #endif - return mem; -} - -#endif - - -// aligned_large_pages_free() will free the previously allocated ttmem - -#if defined(_WIN32) - -void aligned_large_pages_free(void* mem) { - - if (mem && !VirtualFree(mem, 0, MEM_RELEASE)) - { - DWORD err = GetLastError(); - std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err - << std::dec << std::endl; - exit(EXIT_FAILURE); - } -} - -#else - -void aligned_large_pages_free(void* mem) { std_aligned_free(mem); } - -#endif - - -namespace WinProcGroup { - -#ifndef _WIN32 - -void bind_this_thread(size_t) {} - -#else - -namespace { -// Retrieves logical processor information using Windows-specific -// API and returns the best node id for the thread with index idx. Original -// code from Texel by Peter Österlund. -int best_node(size_t idx) { - - int threads = 0; - int nodes = 0; - int cores = 0; - DWORD returnLength = 0; - DWORD byteOffset = 0; - - // Early exit if the needed API is not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun1 = (fun1_t) (void (*)()) GetProcAddress(k32, "GetLogicalProcessorInformationEx"); - if (!fun1) - return -1; - - // First call to GetLogicalProcessorInformationEx() to get returnLength. - // We expect the call to fail due to null buffer. - if (fun1(RelationAll, nullptr, &returnLength)) - return -1; - - // Once we know returnLength, allocate the buffer - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr; - ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) malloc(returnLength); - - // Second call to GetLogicalProcessorInformationEx(), now we expect to succeed - if (!fun1(RelationAll, buffer, &returnLength)) - { - free(buffer); - return -1; - } - - while (byteOffset < returnLength) - { - if (ptr->Relationship == RelationNumaNode) - nodes++; - - else if (ptr->Relationship == RelationProcessorCore) - { - cores++; - threads += (ptr->Processor.Flags == LTP_PC_SMT) ? 
2 : 1; - } - - assert(ptr->Size); - byteOffset += ptr->Size; - ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) (((char*) ptr) + ptr->Size); - } - - free(buffer); - - std::vector groups; - - // Run as many threads as possible on the same node until the core limit is - // reached, then move on to filling the next node. - for (int n = 0; n < nodes; n++) - for (int i = 0; i < cores / nodes; i++) - groups.push_back(n); - - // In case a core has more than one logical processor (we assume 2) and we - // still have threads to allocate, spread them evenly across available nodes. - for (int t = 0; t < threads - cores; t++) - groups.push_back(t % nodes); - - // If we still have more threads than the total number of logical processors - // then return -1 and let the OS to decide what to do. - return idx < groups.size() ? groups[idx] : -1; -} -} - - -// Sets the group affinity of the current thread -void bind_this_thread(size_t idx) { - - // Use only local variables to be thread-safe - int node = best_node(idx); - - if (node == -1) - return; - - // Early exit if the needed API are not available at runtime - HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); - auto fun2 = fun2_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMaskEx")); - auto fun3 = fun3_t((void (*)()) GetProcAddress(k32, "SetThreadGroupAffinity")); - auto fun4 = fun4_t((void (*)()) GetProcAddress(k32, "GetNumaNodeProcessorMask2")); - auto fun5 = fun5_t((void (*)()) GetProcAddress(k32, "GetMaximumProcessorGroupCount")); - - if (!fun2 || !fun3) - return; - - if (!fun4 || !fun5) - { - GROUP_AFFINITY affinity; - if (fun2(node, &affinity)) // GetNumaNodeProcessorMaskEx - fun3(GetCurrentThread(), &affinity, nullptr); // SetThreadGroupAffinity - } - else - { - // If a numa node has more than one processor group, we assume they are - // sized equal and we spread threads evenly across the groups. 
- USHORT elements, returnedElements; - elements = fun5(); // GetMaximumProcessorGroupCount - GROUP_AFFINITY* affinity = (GROUP_AFFINITY*) malloc(elements * sizeof(GROUP_AFFINITY)); - if (fun4(node, affinity, elements, &returnedElements)) // GetNumaNodeProcessorMask2 - fun3(GetCurrentThread(), &affinity[idx % returnedElements], - nullptr); // SetThreadGroupAffinity - free(affinity); - } -} - -#endif - -} // namespace WinProcGroup - #ifdef _WIN32 #include #define GETCWD _getcwd @@ -723,6 +436,27 @@ void bind_this_thread(size_t idx) { #define GETCWD getcwd #endif +size_t str_to_size_t(const std::string& s) { + unsigned long long value = std::stoull(s); + if (value > std::numeric_limits::max()) + std::exit(EXIT_FAILURE); + return static_cast(value); +} + +std::optional read_file_to_string(const std::string& path) { + std::ifstream f(path, std::ios_base::binary); + if (!f) + return std::nullopt; + return std::string(std::istreambuf_iterator(f), std::istreambuf_iterator()); +} + +void remove_whitespace(std::string& s) { + s.erase(std::remove_if(s.begin(), s.end(), [](char c) { return std::isspace(c); }), s.end()); +} + +bool is_whitespace(std::string_view s) { + return std::all_of(s.begin(), s.end(), [](char c) { return std::isspace(c); }); +} std::string CommandLine::get_binary_directory(std::string argv0) { std::string pathSeparator; diff --git a/src/misc.h b/src/misc.h index d75b236f..21093769 100644 --- a/src/misc.h +++ b/src/misc.h @@ -24,9 +24,11 @@ #include #include #include +#include #include -#include +#include #include +#include #include #define stringify2(x) #x @@ -34,49 +36,40 @@ namespace Stockfish { +std::string engine_version_info(); std::string engine_info(bool to_uci = false); std::string compiler_info(); // Preloads the given address in L1/L2 cache. This is a non-blocking // function that doesn't stall the CPU waiting for data to be loaded from memory, // which can be quite slow. -void prefetch(void* addr); +void prefetch(const void* addr); -void start_logger(const std::string& fname); -void* std_aligned_alloc(size_t alignment, size_t size); -void std_aligned_free(void* ptr); -// memory aligned by page size, min alignment: 4096 bytes -void* aligned_large_pages_alloc(size_t size); -// nop if mem == nullptr -void aligned_large_pages_free(void* mem); +void start_logger(const std::string& fname); -// Deleter for automating release of memory area -template -struct AlignedDeleter { - void operator()(T* ptr) const { - ptr->~T(); - std_aligned_free(ptr); +size_t str_to_size_t(const std::string& s); + +#if defined(__linux__) + +struct PipeDeleter { + void operator()(FILE* file) const { + if (file != nullptr) + { + pclose(file); + } } }; -template -struct LargePageDeleter { - void operator()(T* ptr) const { - ptr->~T(); - aligned_large_pages_free(ptr); - } -}; - -template -using AlignedPtr = std::unique_ptr>; - -template -using LargePagePtr = std::unique_ptr>; +#endif +// Reads the file as bytes. +// Returns std::nullopt if the file does not exist. 
+std::optional read_file_to_string(const std::string& path); void dbg_hit_on(bool cond, int slot = 0); void dbg_mean_of(int64_t value, int slot = 0); void dbg_stdev_of(int64_t value, int slot = 0); +void dbg_extremes_of(int64_t value, int slot = 0); void dbg_correl_of(int64_t value1, int64_t value2, int slot = 0); void dbg_print(); @@ -88,6 +81,30 @@ inline TimePoint now() { .count(); } +inline std::vector split(std::string_view s, std::string_view delimiter) { + std::vector res; + + if (s.empty()) + return res; + + size_t begin = 0; + for (;;) + { + const size_t end = s.find(delimiter, begin); + if (end == std::string::npos) + break; + + res.emplace_back(s.substr(begin, end - begin)); + begin = end + delimiter.size(); + } + + res.emplace_back(s.substr(begin)); + + return res; +} + +void remove_whitespace(std::string& s); +bool is_whitespace(std::string_view s); enum SyncCout { IO_LOCK, @@ -98,19 +115,8 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK - -// Get the first aligned element of an array. -// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, -// where N is the number of elements in the array. -template -T* align_ptr_up(T* ptr) { - static_assert(alignof(T) < Alignment); - - const uintptr_t ptrint = reinterpret_cast(reinterpret_cast(ptr)); - return reinterpret_cast( - reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); -} - +void sync_cout_start(); +void sync_cout_end(); // True if and only if the binary is compiled on a little-endian machine static inline const union { @@ -194,15 +200,6 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) { #endif } -// Under Windows it is not possible for a process to run on more than one -// logical processor group. This usually means being limited to using max 64 -// cores. To overcome this, some special platform-specific API should be -// called to set group affinity for each thread. Original code from Texel by -// Peter Österlund. -namespace WinProcGroup { -void bind_this_thread(size_t idx); -} - struct CommandLine { public: diff --git a/src/movegen.cpp b/src/movegen.cpp index e6923067..69b8fe6a 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -75,17 +75,6 @@ ExtMove* generate_pawn_moves(const Position& pos, ExtMove* moveList, Bitboard ta b2 &= target; } - if constexpr (Type == QUIET_CHECKS) - { - // To make a quiet check, you either make a direct check by pushing a pawn - // or push a blocker pawn that is not on the same file as the enemy king. - // Discovered check promotion has been already generated amongst the captures. - Square ksq = pos.square(Them); - Bitboard dcCandidatePawns = pos.blockers_for_king(Them) & ~file_bb(ksq); - b1 &= pawn_attacks_bb(Them, ksq) | shift(dcCandidatePawns); - b2 &= pawn_attacks_bb(Them, ksq) | shift(dcCandidatePawns); - } - while (b1) { Square to = pop_lsb(b1); @@ -158,7 +147,7 @@ ExtMove* generate_pawn_moves(const Position& pos, ExtMove* moveList, Bitboard ta } -template +template ExtMove* generate_moves(const Position& pos, ExtMove* moveList, Bitboard target) { static_assert(Pt != KING && Pt != PAWN, "Unsupported piece type in generate_moves()"); @@ -170,10 +159,6 @@ ExtMove* generate_moves(const Position& pos, ExtMove* moveList, Bitboard target) Square from = pop_lsb(bb); Bitboard b = attacks_bb(from, pos.pieces()) & target; - // To check, you either move freely a blocker or make a direct check. 
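// --------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): behaviour of the small string
// helpers added to misc.h/misc.cpp above. The option string, the hash value
// and the file name are made up for the example.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>

#include "misc.h"

using namespace Stockfish;

int main() {
    // split() keeps empty fields and always returns at least one element for a
    // non-empty input; the views point into the string passed in.
    auto fields = split("setoption name Hash value 32", " ");
    assert(fields.size() == 5 && fields[4] == "32");

    // str_to_size_t() parses a decimal string and exits if the value does not
    // fit into size_t.
    std::size_t hashMib = str_to_size_t(std::string(fields[4]));
    assert(hashMib == 32);

    // read_file_to_string() yields std::nullopt when the file cannot be opened.
    if (auto contents = read_file_to_string("does_not_exist.nnue"); !contents)
        std::cout << "file missing\n";

    std::string s = "  a b\tc\n";
    remove_whitespace(s);  // s is now "abc"
    assert(s == "abc" && !is_whitespace(s) && is_whitespace("  \t\n"));
    return 0;
}
// --------------------------------------------------------------------------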
- if (Checks && (Pt == QUEEN || !(pos.blockers_for_king(~Us) & from))) - b &= pos.check_squares(Pt); - while (b) *moveList++ = Move(from, pop_lsb(b)); } @@ -187,9 +172,8 @@ ExtMove* generate_all(const Position& pos, ExtMove* moveList) { static_assert(Type != LEGAL, "Unsupported type in generate_all()"); - constexpr bool Checks = Type == QUIET_CHECKS; // Reduce template instantiations - const Square ksq = pos.square(Us); - Bitboard target; + const Square ksq = pos.square(Us); + Bitboard target; // Skip generating non-king moves when in double check if (Type != EVASIONS || !more_than_one(pos.checkers())) @@ -197,29 +181,24 @@ ExtMove* generate_all(const Position& pos, ExtMove* moveList) { target = Type == EVASIONS ? between_bb(ksq, lsb(pos.checkers())) : Type == NON_EVASIONS ? ~pos.pieces(Us) : Type == CAPTURES ? pos.pieces(~Us) - : ~pos.pieces(); // QUIETS || QUIET_CHECKS + : ~pos.pieces(); // QUIETS moveList = generate_pawn_moves(pos, moveList, target); - moveList = generate_moves(pos, moveList, target); - moveList = generate_moves(pos, moveList, target); - moveList = generate_moves(pos, moveList, target); - moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); + moveList = generate_moves(pos, moveList, target); } - if (!Checks || pos.blockers_for_king(~Us) & ksq) - { - Bitboard b = attacks_bb(ksq) & (Type == EVASIONS ? ~pos.pieces(Us) : target); - if (Checks) - b &= ~attacks_bb(pos.square(~Us)); + Bitboard b = attacks_bb(ksq) & (Type == EVASIONS ? ~pos.pieces(Us) : target); - while (b) - *moveList++ = Move(ksq, pop_lsb(b)); + while (b) + *moveList++ = Move(ksq, pop_lsb(b)); - if ((Type == QUIETS || Type == NON_EVASIONS) && pos.can_castle(Us & ANY_CASTLING)) - for (CastlingRights cr : {Us & KING_SIDE, Us & QUEEN_SIDE}) - if (!pos.castling_impeded(cr) && pos.can_castle(cr)) - *moveList++ = Move::make(ksq, pos.castling_rook_square(cr)); - } + if ((Type == QUIETS || Type == NON_EVASIONS) && pos.can_castle(Us & ANY_CASTLING)) + for (CastlingRights cr : {Us & KING_SIDE, Us & QUEEN_SIDE}) + if (!pos.castling_impeded(cr) && pos.can_castle(cr)) + *moveList++ = Move::make(ksq, pos.castling_rook_square(cr)); return moveList; } @@ -231,8 +210,6 @@ ExtMove* generate_all(const Position& pos, ExtMove* moveList) { // Generates all pseudo-legal non-captures and underpromotions // Generates all pseudo-legal check evasions // Generates all pseudo-legal captures and non-captures -// Generates all pseudo-legal non-captures giving check, -// except castling and promotions // // Returns a pointer to the end of the move list. 
template @@ -251,7 +228,6 @@ ExtMove* generate(const Position& pos, ExtMove* moveList) { template ExtMove* generate(const Position&, ExtMove*); template ExtMove* generate(const Position&, ExtMove*); template ExtMove* generate(const Position&, ExtMove*); -template ExtMove* generate(const Position&, ExtMove*); template ExtMove* generate(const Position&, ExtMove*); diff --git a/src/movegen.h b/src/movegen.h index 5f650d2e..f067f880 100644 --- a/src/movegen.h +++ b/src/movegen.h @@ -31,7 +31,6 @@ class Position; enum GenType { CAPTURES, QUIETS, - QUIET_CHECKS, EVASIONS, NON_EVASIONS, LEGAL diff --git a/src/movepick.cpp b/src/movepick.cpp index 4a93662d..96f03171 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -18,10 +18,9 @@ #include "movepick.h" -#include +#include #include -#include -#include +#include #include "bitboard.h" #include "position.h" @@ -35,7 +34,6 @@ enum Stages { MAIN_TT, CAPTURE_INIT, GOOD_CAPTURE, - REFUTATION, QUIET_INIT, GOOD_QUIET, BAD_CAPTURE, @@ -54,13 +52,11 @@ enum Stages { // generate qsearch moves QSEARCH_TT, QCAPTURE_INIT, - QCAPTURE, - QCHECK_INIT, - QCHECK + QCAPTURE }; -// Sort moves in descending order up to and including -// a given limit. The order of moves smaller than the limit is left unspecified. +// Sort moves in descending order up to and including a given limit. +// The order of moves smaller than the limit is left unspecified. void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) { for (ExtMove *sortedEnd = begin, *p = begin + 1; p < end; ++p) @@ -78,56 +74,38 @@ void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) { // Constructors of the MovePicker class. As arguments, we pass information -// to help it return the (presumably) good moves first, to decide which -// moves to return (in the quiescence search, for instance, we only want to -// search captures, promotions, and some checks) and how important a good -// move ordering is at the current node. +// to decide which class of moves to emit, to help sorting the (presumably) +// good moves first, and how important move ordering is at the current node. -// MovePicker constructor for the main search +// MovePicker constructor for the main search and for the quiescence search MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHistory* mh, + const LowPlyHistory* lph, const CapturePieceToHistory* cph, const PieceToHistory** ch, const PawnHistory* ph, - Move cm, - const Move* killers) : + int pl) : pos(p), mainHistory(mh), + lowPlyHistory(lph), captureHistory(cph), continuationHistory(ch), pawnHistory(ph), ttMove(ttm), - refutations{{killers[0], 0}, {killers[1], 0}, {cm, 0}}, - depth(d) { - assert(d > 0); + depth(d), + ply(pl) { - stage = (pos.checkers() ? EVASION_TT : MAIN_TT) + !(ttm && pos.pseudo_legal(ttm)); + if (pos.checkers()) + stage = EVASION_TT + !(ttm && pos.pseudo_legal(ttm)); + + else + stage = (depth > 0 ? MAIN_TT : QSEARCH_TT) + !(ttm && pos.pseudo_legal(ttm)); } -// Constructor for quiescence search -MovePicker::MovePicker(const Position& p, - Move ttm, - Depth d, - const ButterflyHistory* mh, - const CapturePieceToHistory* cph, - const PieceToHistory** ch, - const PawnHistory* ph) : - pos(p), - mainHistory(mh), - captureHistory(cph), - continuationHistory(ch), - pawnHistory(ph), - ttMove(ttm), - depth(d) { - assert(d <= 0); - - stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) + !(ttm && pos.pseudo_legal(ttm)); -} - -// Constructor for ProbCut: we generate captures with SEE greater -// than or equal to the given threshold. 
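// --------------------------------------------------------------------------
// Illustrative sketch (not part of the diff): the contract of
// partial_insertion_sort() above, mirrored on plain ints. The values and the
// limit of 0 are made up; only elements >= limit end up sorted.
#include <cassert>

static void partial_insertion_sort_ints(int* begin, int* end, int limit) {
    for (int *sortedEnd = begin, *p = begin + 1; p < end; ++p)
        if (*p >= limit)
        {
            int tmp = *p, *q;
            *p      = *++sortedEnd;
            for (q = sortedEnd; q != begin && *(q - 1) < tmp; --q)
                *q = *(q - 1);
            *q = tmp;
        }
}

int main() {
    int v[] = {5, -7, 42, -3, 19, 7};
    partial_insertion_sort_ints(v, v + 6, 0);

    // Every element >= 0 is now at the front, in descending order ...
    assert(v[0] == 42 && v[1] == 19 && v[2] == 7 && v[3] == 5);
    // ... while elements below the limit stay behind it, in unspecified order.
    assert((v[4] == -7 && v[5] == -3) || (v[4] == -3 && v[5] == -7));
    return 0;
}
// --------------------------------------------------------------------------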
+// MovePicker constructor for ProbCut: we generate captures with Static Exchange +// Evaluation (SEE) greater than or equal to the given threshold. MovePicker::MovePicker(const Position& p, Move ttm, int th, const CapturePieceToHistory* cph) : pos(p), captureHistory(cph), @@ -139,9 +117,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, int th, const CapturePieceTo + !(ttm && pos.capture_stage(ttm) && pos.pseudo_legal(ttm) && pos.see_ge(ttm, threshold)); } -// Assigns a numerical value to each move in a list, used -// for sorting. Captures are ordered by Most Valuable Victim (MVV), preferring -// captures with a good history. Quiets moves are ordered using the history tables. +// Assigns a numerical value to each move in a list, used for sorting. +// Captures are ordered by Most Valuable Victim (MVV), preferring captures +// with a good history. Quiets moves are ordered using the history tables. template void MovePicker::score() { @@ -180,9 +158,9 @@ void MovePicker::score() { // histories m.value = 2 * (*mainHistory)[pos.side_to_move()][m.from_to()]; m.value += 2 * (*pawnHistory)[pawn_structure_index(pos)][pc][to]; - m.value += 2 * (*continuationHistory[0])[pc][to]; + m.value += (*continuationHistory[0])[pc][to]; m.value += (*continuationHistory[1])[pc][to]; - m.value += (*continuationHistory[2])[pc][to] / 4; + m.value += (*continuationHistory[2])[pc][to]; m.value += (*continuationHistory[3])[pc][to]; m.value += (*continuationHistory[5])[pc][to]; @@ -197,13 +175,12 @@ void MovePicker::score() { : 0; // malus for putting piece en prise - m.value -= !(threatenedPieces & from) - ? (pt == QUEEN ? bool(to & threatenedByRook) * 48150 - + bool(to & threatenedByMinor) * 10650 - : pt == ROOK ? bool(to & threatenedByMinor) * 24335 - : pt != PAWN ? bool(to & threatenedByPawn) * 14950 - : 0) - : 0; + m.value -= (pt == QUEEN ? bool(to & threatenedByRook) * 49000 + : pt == ROOK && bool(to & threatenedByMinor) ? 24335 + : 0); + + if (ply < LOW_PLY_HISTORY_SIZE) + m.value += 8 * (*lowPlyHistory)[ply][m.from_to()] / (1 + 2 * ply); } else // Type == EVASIONS @@ -219,27 +196,21 @@ void MovePicker::score() { } // Returns the next move satisfying a predicate function. -// It never returns the TT move. -template +// This never returns the TT move, as it was emitted before. +template Move MovePicker::select(Pred filter) { - while (cur < endMoves) - { - if constexpr (T == Best) - std::swap(*cur, *std::max_element(cur, endMoves)); - + for (; cur < endMoves; ++cur) if (*cur != ttMove && filter()) return *cur++; - cur++; - } return Move::none(); } -// Most important method of the MovePicker class. It -// returns a new pseudo-legal move every time it is called until there are no more -// moves left, picking the move with the highest score from a list of generated moves. -Move MovePicker::next_move(bool skipQuiets) { +// This is the most important method of the MovePicker class. We emit one +// new pseudo-legal move on every call until there are no more moves left, +// picking the move with the highest score from a list of generated moves. +Move MovePicker::next_move() { auto quiet_threshold = [](Depth d) { return -3560 * d; }; @@ -266,29 +237,13 @@ top: goto top; case GOOD_CAPTURE : - if (select([&]() { + if (select([&]() { // Move losing capture to endBadCaptures to be tried later return pos.see_ge(*cur, -cur->value / 18) ? 
true : (*endBadCaptures++ = *cur, false); })) return *(cur - 1); - // Prepare the pointers to loop over the refutations array - cur = std::begin(refutations); - endMoves = std::end(refutations); - - // If the countermove is the same as a killer, skip it - if (refutations[0] == refutations[2] || refutations[1] == refutations[2]) - --endMoves; - - ++stage; - [[fallthrough]]; - - case REFUTATION : - if (select([&]() { - return *cur != Move::none() && !pos.capture_stage(*cur) && pos.pseudo_legal(*cur); - })) - return *(cur - 1); ++stage; [[fallthrough]]; @@ -306,9 +261,7 @@ top: [[fallthrough]]; case GOOD_QUIET : - if (!skipQuiets && select([&]() { - return *cur != refutations[0] && *cur != refutations[1] && *cur != refutations[2]; - })) + if (!skipQuiets && select([]() { return true; })) { if ((cur - 1)->value > -7998 || (cur - 1)->value <= quiet_threshold(depth)) return *(cur - 1); @@ -325,7 +278,7 @@ top: [[fallthrough]]; case BAD_CAPTURE : - if (select([]() { return true; })) + if (select([]() { return true; })) return *(cur - 1); // Prepare the pointers to loop over the bad quiets @@ -337,9 +290,7 @@ top: case BAD_QUIET : if (!skipQuiets) - return select([&]() { - return *cur != refutations[0] && *cur != refutations[1] && *cur != refutations[2]; - }); + return select([]() { return true; }); return Move::none(); @@ -348,39 +299,22 @@ top: endMoves = generate(pos, cur); score(); + partial_insertion_sort(cur, endMoves, std::numeric_limits::min()); ++stage; [[fallthrough]]; case EVASION : - return select([]() { return true; }); + case QCAPTURE : + return select([]() { return true; }); case PROBCUT : - return select([&]() { return pos.see_ge(*cur, threshold); }); - - case QCAPTURE : - if (select([]() { return true; })) - return *(cur - 1); - - // If we did not find any move and we do not try checks, we have finished - if (depth != DEPTH_QS_CHECKS) - return Move::none(); - - ++stage; - [[fallthrough]]; - - case QCHECK_INIT : - cur = moves; - endMoves = generate(pos, cur); - - ++stage; - [[fallthrough]]; - - case QCHECK : - return select([]() { return true; }); + return select([&]() { return pos.see_ge(*cur, threshold); }); } assert(false); return Move::none(); // Silence warning } +void MovePicker::skip_quiet_moves() { skipQuiets = true; } + } // namespace Stockfish diff --git a/src/movepick.h b/src/movepick.h index b81f76e1..ab4e832f 100644 --- a/src/movepick.h +++ b/src/movepick.h @@ -19,141 +19,22 @@ #ifndef MOVEPICK_H_INCLUDED #define MOVEPICK_H_INCLUDED -#include -#include -#include -#include -#include -#include -#include -#include // IWYU pragma: keep - +#include "history.h" #include "movegen.h" -#include "position.h" #include "types.h" namespace Stockfish { -constexpr int PAWN_HISTORY_SIZE = 512; // has to be a power of 2 -constexpr int CORRECTION_HISTORY_SIZE = 16384; // has to be a power of 2 -constexpr int CORRECTION_HISTORY_LIMIT = 1024; +class Position; -static_assert((PAWN_HISTORY_SIZE & (PAWN_HISTORY_SIZE - 1)) == 0, - "PAWN_HISTORY_SIZE has to be a power of 2"); - -static_assert((CORRECTION_HISTORY_SIZE & (CORRECTION_HISTORY_SIZE - 1)) == 0, - "CORRECTION_HISTORY_SIZE has to be a power of 2"); - -enum PawnHistoryType { - Normal, - Correction -}; - -template -inline int pawn_structure_index(const Position& pos) { - return pos.pawn_key() & ((T == Normal ? PAWN_HISTORY_SIZE : CORRECTION_HISTORY_SIZE) - 1); -} - -// StatsEntry stores the stat table value. It is usually a number but could -// be a move or even a nested history. 
We use a class instead of a naked value -// to directly call history update operator<<() on the entry so to use stats -// tables at caller sites as simple multi-dim arrays. -template -class StatsEntry { - - T entry; - - public: - void operator=(const T& v) { entry = v; } - T* operator&() { return &entry; } - T* operator->() { return &entry; } - operator const T&() const { return entry; } - - void operator<<(int bonus) { - static_assert(D <= std::numeric_limits::max(), "D overflows T"); - - // Make sure that bonus is in range [-D, D] - int clampedBonus = std::clamp(bonus, -D, D); - entry += clampedBonus - entry * std::abs(clampedBonus) / D; - - assert(std::abs(entry) <= D); - } -}; - -// Stats is a generic N-dimensional array used to store various statistics. -// The first template parameter T is the base type of the array, and the second -// template parameter D limits the range of updates in [-D, D] when we update -// values with the << operator, while the last parameters (Size and Sizes) -// encode the dimensions of the array. -template -struct Stats: public std::array, Size> { - using stats = Stats; - - void fill(const T& v) { - - // For standard-layout 'this' points to the first struct member - assert(std::is_standard_layout_v); - - using entry = StatsEntry; - entry* p = reinterpret_cast(this); - std::fill(p, p + sizeof(*this) / sizeof(entry), v); - } -}; - -template -struct Stats: public std::array, Size> {}; - -// In stats table, D=0 means that the template parameter is not used -enum StatsParams { - NOT_USED = 0 -}; -enum StatsType { - NoCaptures, - Captures -}; - -// ButterflyHistory records how often quiet moves have been successful or unsuccessful -// during the current search, and is used for reduction and move ordering decisions. -// It uses 2 tables (one for each color) indexed by the move's from and to squares, -// see www.chessprogramming.org/Butterfly_Boards (~11 elo) -using ButterflyHistory = Stats; - -// CounterMoveHistory stores counter moves indexed by [piece][to] of the previous -// move, see www.chessprogramming.org/Countermove_Heuristic -using CounterMoveHistory = Stats; - -// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type] -using CapturePieceToHistory = Stats; - -// PieceToHistory is like ButterflyHistory but is addressed by a move's [piece][to] -using PieceToHistory = Stats; - -// ContinuationHistory is the combined history of a given pair of moves, usually -// the current one given a previous one. The nested history table is based on -// PieceToHistory instead of ButterflyBoards. -// (~63 elo) -using ContinuationHistory = Stats; - -// PawnHistory is addressed by the pawn structure and a move's [piece][to] -using PawnHistory = Stats; - -// CorrectionHistory is addressed by color and pawn structure -using CorrectionHistory = - Stats; - -// MovePicker class is used to pick one pseudo-legal move at a time from the -// current position. The most important method is next_move(), which returns a -// new pseudo-legal move each time it is called, until there are no moves left, -// when Move::none() is returned. In order to improve the efficiency of the -// alpha-beta algorithm, MovePicker attempts to return the moves which are most -// likely to get a cut-off first. +// The MovePicker class is used to pick one pseudo-legal move at a time from the +// current position. The most important method is next_move(), which emits one +// new pseudo-legal move on every call, until there are no moves left, when +// Move::none() is returned. 
In order to improve the efficiency of the alpha-beta +// algorithm, MovePicker attempts to return the moves which are most likely to get +// a cut-off first. class MovePicker { - enum PickType { - Next, - Best - }; - public: MovePicker(const MovePicker&) = delete; MovePicker& operator=(const MovePicker&) = delete; @@ -161,23 +42,17 @@ class MovePicker { Move, Depth, const ButterflyHistory*, + const LowPlyHistory*, const CapturePieceToHistory*, const PieceToHistory**, const PawnHistory*, - Move, - const Move*); - MovePicker(const Position&, - Move, - Depth, - const ButterflyHistory*, - const CapturePieceToHistory*, - const PieceToHistory**, - const PawnHistory*); + int); MovePicker(const Position&, Move, int, const CapturePieceToHistory*); - Move next_move(bool skipQuiets = false); + Move next_move(); + void skip_quiet_moves(); private: - template + template Move select(Pred); template void score(); @@ -186,15 +61,18 @@ class MovePicker { const Position& pos; const ButterflyHistory* mainHistory; + const LowPlyHistory* lowPlyHistory; const CapturePieceToHistory* captureHistory; const PieceToHistory** continuationHistory; const PawnHistory* pawnHistory; Move ttMove; - ExtMove refutations[3], *cur, *endMoves, *endBadCaptures, *beginBadQuiets, *endBadQuiets; - int stage; - int threshold; - Depth depth; - ExtMove moves[MAX_MOVES]; + ExtMove * cur, *endMoves, *endBadCaptures, *beginBadQuiets, *endBadQuiets; + int stage; + int threshold; + Depth depth; + int ply; + bool skipQuiets = false; + ExtMove moves[MAX_MOVES]; }; } // namespace Stockfish diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index ad9167c0..59a6149f 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -39,25 +39,26 @@ namespace Stockfish::Eval::NNUE::Layers { +#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) + #define ENABLE_SEQ_OPT +#endif + // Fallback implementation for older/other architectures. // Requires the input to be padded to at least 16 values. -#if !defined(USE_SSSE3) +#ifndef ENABLE_SEQ_OPT + template static void affine_transform_non_ssse3(std::int32_t* output, const std::int8_t* weights, const std::int32_t* biases, const std::uint8_t* input) { - #if defined(USE_SSE2) || defined(USE_NEON_DOTPROD) || defined(USE_NEON) + #if defined(USE_SSE2) || defined(USE_NEON) #if defined(USE_SSE2) // At least a multiple of 16, with SSE2. 
constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const __m128i Zeros = _mm_setzero_si128(); const auto inputVector = reinterpret_cast(input); - #elif defined(USE_NEON_DOTPROD) - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; - const auto inputVector = reinterpret_cast(input); - #elif defined(USE_NEON) constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; const auto inputVector = reinterpret_cast(input); @@ -91,16 +92,8 @@ static void affine_transform_non_ssse3(std::int32_t* output, sum = _mm_add_epi32(sum, sum_second_32); output[i] = _mm_cvtsi128_si32(sum); - #elif defined(USE_NEON_DOTPROD) - int32x4_t sum = {biases[i]}; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - sum = vdotq_s32(sum, inputVector[j], row[j]); - } - output[i] = vaddvq_s32(sum); - #elif defined(USE_NEON) + int32x4_t sum = {biases[i]}; const auto row = reinterpret_cast(&weights[offset]); for (IndexType j = 0; j < NumChunks; ++j) @@ -127,7 +120,8 @@ static void affine_transform_non_ssse3(std::int32_t* output, } #endif } -#endif + +#endif // !ENABLE_SEQ_OPT template class AffineTransform { @@ -162,7 +156,7 @@ class AffineTransform { } static constexpr IndexType get_weight_index(IndexType i) { -#if defined(USE_SSSE3) +#ifdef ENABLE_SEQ_OPT return get_weight_index_scrambled(i); #else return i; @@ -190,29 +184,28 @@ class AffineTransform { // Forward propagation void propagate(const InputType* input, OutputType* output) const { -#if defined(USE_SSSE3) +#ifdef ENABLE_SEQ_OPT if constexpr (OutputDimensions > 1) { - #if defined(USE_AVX512) using vec_t = __m512i; - #define vec_setzero _mm512_setzero_si512 #define vec_set_32 _mm512_set1_epi32 #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 - #define vec_hadd Simd::m512_hadd #elif defined(USE_AVX2) using vec_t = __m256i; - #define vec_setzero _mm256_setzero_si256 #define vec_set_32 _mm256_set1_epi32 #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #define vec_hadd Simd::m256_hadd #elif defined(USE_SSSE3) using vec_t = __m128i; - #define vec_setzero _mm_setzero_si128 #define vec_set_32 _mm_set1_epi32 #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #define vec_hadd Simd::m128_hadd + #elif defined(USE_NEON_DOTPROD) + using vec_t = int32x4_t; + #define vec_set_32 vdupq_n_s32 + #define vec_add_dpbusd_32(acc, a, b) \ + Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ + vreinterpretq_s8_s32(b)) #endif static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); @@ -242,28 +235,33 @@ class AffineTransform { for (IndexType k = 0; k < NumRegs; ++k) outptr[k] = acc[k]; - #undef vec_setzero #undef vec_set_32 #undef vec_add_dpbusd_32 - #undef vec_hadd } else if constexpr (OutputDimensions == 1) { - // We cannot use AVX512 for the last layer because there are only 32 inputs // and the buffer is not padded to 64 elements. 
#if defined(USE_AVX2) using vec_t = __m256i; - #define vec_setzero _mm256_setzero_si256 + #define vec_setzero() _mm256_setzero_si256() #define vec_set_32 _mm256_set1_epi32 #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 #define vec_hadd Simd::m256_hadd #elif defined(USE_SSSE3) using vec_t = __m128i; - #define vec_setzero _mm_setzero_si128 + #define vec_setzero() _mm_setzero_si128() #define vec_set_32 _mm_set1_epi32 #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 #define vec_hadd Simd::m128_hadd + #elif defined(USE_NEON_DOTPROD) + using vec_t = int32x4_t; + #define vec_setzero() vdupq_n_s32(0) + #define vec_set_32 vdupq_n_s32 + #define vec_add_dpbusd_32(acc, a, b) \ + Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ + vreinterpretq_s8_s32(b)) + #define vec_hadd Simd::neon_m128_hadd #endif const auto inputVector = reinterpret_cast(input); diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 813234c5..2ee378ad 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -65,41 +65,37 @@ class ClippedReLU { if constexpr (InputDimensions % SimdWidth == 0) { constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const __m256i Zero = _mm256_setzero_si256(); const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < NumChunks; ++i) { const __m256i words0 = - _mm256_srai_epi16(_mm256_packs_epi32(_mm256_load_si256(&in[i * 4 + 0]), - _mm256_load_si256(&in[i * 4 + 1])), + _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]), + _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits); const __m256i words1 = - _mm256_srai_epi16(_mm256_packs_epi32(_mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), + _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]), + _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits); - _mm256_store_si256( - &out[i], _mm256_permutevar8x32_epi32( - _mm256_max_epi8(_mm256_packs_epi16(words0, words1), Zero), Offsets)); + _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32( + _mm256_packs_epi16(words0, words1), Offsets)); } } else { constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); - const __m128i Zero = _mm_setzero_si128(); const auto in = reinterpret_cast(input); const auto out = reinterpret_cast<__m128i*>(output); for (IndexType i = 0; i < NumChunks; ++i) { - const __m128i words0 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), + const __m128i words0 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits); - const __m128i words1 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), + const __m128i words1 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits); - const __m128i packedbytes = _mm_packs_epi16(words0, words1); - _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero)); + _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); } } constexpr IndexType Start = InputDimensions % SimdWidth == 0 @@ -109,9 +105,7 @@ class ClippedReLU { #elif defined(USE_SSE2) constexpr IndexType NumChunks = InputDimensions / SimdWidth; - #ifdef USE_SSE41 - const __m128i Zero = _mm_setzero_si128(); - #else + #ifndef USE_SSE41 const __m128i k0x80s = _mm_set1_epi8(-128); #endif @@ 
-119,6 +113,15 @@ class ClippedReLU { const auto out = reinterpret_cast<__m128i*>(output); for (IndexType i = 0; i < NumChunks; ++i) { + #if defined(USE_SSE41) + const __m128i words0 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), + WeightScaleBits); + const __m128i words1 = _mm_srli_epi16( + _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), + WeightScaleBits); + _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); + #else const __m128i words0 = _mm_srai_epi16( _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits); @@ -126,15 +129,8 @@ class ClippedReLU { _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits); const __m128i packedbytes = _mm_packs_epi16(words0, words1); - _mm_store_si128(&out[i], - - #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, Zero) - #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) + _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); #endif - - ); } constexpr IndexType Start = NumChunks * SimdWidth; diff --git a/src/nnue/layers/simd.h b/src/nnue/layers/simd.h index cec41474..55cb7df1 100644 --- a/src/nnue/layers/simd.h +++ b/src/nnue/layers/simd.h @@ -43,39 +43,6 @@ namespace Stockfish::Simd { return _mm512_reduce_add_epi32(sum) + bias; } -/* - Parameters: - sum0 = [zmm0.i128[0], zmm0.i128[1], zmm0.i128[2], zmm0.i128[3]] - sum1 = [zmm1.i128[0], zmm1.i128[1], zmm1.i128[2], zmm1.i128[3]] - sum2 = [zmm2.i128[0], zmm2.i128[1], zmm2.i128[2], zmm2.i128[3]] - sum3 = [zmm3.i128[0], zmm3.i128[1], zmm3.i128[2], zmm3.i128[3]] - - Returns: - ret = [ - reduce_add_epi32(zmm0.i128[0]), reduce_add_epi32(zmm1.i128[0]), reduce_add_epi32(zmm2.i128[0]), reduce_add_epi32(zmm3.i128[0]), - reduce_add_epi32(zmm0.i128[1]), reduce_add_epi32(zmm1.i128[1]), reduce_add_epi32(zmm2.i128[1]), reduce_add_epi32(zmm3.i128[1]), - reduce_add_epi32(zmm0.i128[2]), reduce_add_epi32(zmm1.i128[2]), reduce_add_epi32(zmm2.i128[2]), reduce_add_epi32(zmm3.i128[2]), - reduce_add_epi32(zmm0.i128[3]), reduce_add_epi32(zmm1.i128[3]), reduce_add_epi32(zmm2.i128[3]), reduce_add_epi32(zmm3.i128[3]) - ] - */ -[[maybe_unused]] static __m512i -m512_hadd128x16_interleave(__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) { - - __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); - __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); - - __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3); - __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3); - - __m512i sum01 = _mm512_add_epi32(sum01a, sum01b); - __m512i sum23 = _mm512_add_epi32(sum23a, sum23b); - - __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); - __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); - - return _mm512_add_epi32(sum0123a, sum0123b); -} - [[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) { #if defined(USE_VNNI) diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index de2c7eca..a8e901a0 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -18,17 +18,17 @@ #include "network.h" -#include #include -#include #include #include +#include #include #include #include #include "../evaluate.h" #include "../incbin/incbin.h" +#include "../memory.h" #include "../misc.h" #include "../position.h" #include "../types.h" @@ -85,23 +85,6 @@ namespace Stockfish::Eval::NNUE { namespace Detail { -// Initialize the evaluation function parameters -template -void initialize(AlignedPtr& pointer) { 
- - pointer.reset(reinterpret_cast(std_aligned_alloc(alignof(T), sizeof(T)))); - std::memset(pointer.get(), 0, sizeof(T)); -} - -template -void initialize(LargePagePtr& pointer) { - - static_assert(alignof(T) <= 4096, - "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); - pointer.reset(reinterpret_cast(aligned_large_pages_alloc(sizeof(T)))); - std::memset(pointer.get(), 0, sizeof(T)); -} - // Read evaluation function parameters template bool read_parameters(std::istream& stream, T& reference) { @@ -123,6 +106,42 @@ bool write_parameters(std::ostream& stream, const T& reference) { } // namespace Detail +template +Network::Network(const Network& other) : + evalFile(other.evalFile), + embeddedType(other.embeddedType) { + + if (other.featureTransformer) + featureTransformer = make_unique_large_page(*other.featureTransformer); + + network = make_unique_aligned(LayerStacks); + + if (!other.network) + return; + + for (std::size_t i = 0; i < LayerStacks; ++i) + network[i] = other.network[i]; +} + +template +Network& +Network::operator=(const Network& other) { + evalFile = other.evalFile; + embeddedType = other.embeddedType; + + if (other.featureTransformer) + featureTransformer = make_unique_large_page(*other.featureTransformer); + + network = make_unique_aligned(LayerStacks); + + if (!other.network) + return *this; + + for (std::size_t i = 0; i < LayerStacks; ++i) + network[i] = other.network[i]; + + return *this; +} template void Network::load(const std::string& rootDirectory, std::string evalfilePath) { @@ -186,15 +205,13 @@ bool Network::save(const std::optional& filename template -Value Network::evaluate(const Position& pos, - AccumulatorCaches::Cache* cache, - bool adjusted, - int* complexity) const { +NetworkOutput +Network::evaluate(const Position& pos, + AccumulatorCaches::Cache* cache) const { // We manually align the arrays on the stack because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. 
constexpr uint64_t alignment = CacheLineSize; - constexpr int delta = 24; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType @@ -211,50 +228,50 @@ Value Network::evaluate(const Position& const int bucket = (pos.count() - 1) / 4; const auto psqt = featureTransformer->transform(pos, cache, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); - - if (complexity) - *complexity = std::abs(psqt - positional) / OutputScale; - - // Give more value to positional evaluation when adjusted flag is set - if (adjusted) - return static_cast(((1024 - delta) * psqt + (1024 + delta) * positional) - / (1024 * OutputScale)); - else - return static_cast((psqt + positional) / OutputScale); + const auto positional = network[bucket].propagate(transformedFeatures); + return {static_cast(psqt / OutputScale), static_cast(positional / OutputScale)}; } template -void Network::verify(std::string evalfilePath) const { +void Network::verify(std::string evalfilePath, + const std::function& f) const { if (evalfilePath.empty()) evalfilePath = evalFile.defaultName; if (evalFile.current != evalfilePath) { - std::string msg1 = - "Network evaluation parameters compatible with the engine must be available."; - std::string msg2 = "The network file " + evalfilePath + " was not loaded successfully."; - std::string msg3 = "The UCI option EvalFile might need to specify the full path, " - "including the directory name, to the network file."; - std::string msg4 = "The default net can be downloaded from: " - "https://tests.stockfishchess.org/api/nn/" - + evalFile.defaultName; - std::string msg5 = "The engine will be terminated now."; + if (f) + { + std::string msg1 = + "Network evaluation parameters compatible with the engine must be available."; + std::string msg2 = "The network file " + evalfilePath + " was not loaded successfully."; + std::string msg3 = "The UCI option EvalFile might need to specify the full path, " + "including the directory name, to the network file."; + std::string msg4 = "The default net can be downloaded from: " + "https://tests.stockfishchess.org/api/nn/" + + evalFile.defaultName; + std::string msg5 = "The engine will be terminated now."; + + std::string msg = "ERROR: " + msg1 + '\n' + "ERROR: " + msg2 + '\n' + "ERROR: " + msg3 + + '\n' + "ERROR: " + msg4 + '\n' + "ERROR: " + msg5 + '\n'; + + f(msg); + } - sync_cout << "info string ERROR: " << msg1 << sync_endl; - sync_cout << "info string ERROR: " << msg2 << sync_endl; - sync_cout << "info string ERROR: " << msg3 << sync_endl; - sync_cout << "info string ERROR: " << msg4 << sync_endl; - sync_cout << "info string ERROR: " << msg5 << sync_endl; exit(EXIT_FAILURE); } - size_t size = sizeof(*featureTransformer) + sizeof(*network) * LayerStacks; - sync_cout << "info string NNUE evaluation using " << evalfilePath << " (" - << size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", " - << network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS - << ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl; + if (f) + { + size_t size = sizeof(*featureTransformer) + sizeof(Arch) * LayerStacks; + f("info string NNUE evaluation using " + evalfilePath + " (" + + std::to_string(size / (1024 * 1024)) + "MiB, (" + + std::to_string(featureTransformer->InputDimensions) + ", " + + std::to_string(network[0].TransformedFeatureDimensions) + ", " + + std::to_string(network[0].FC_0_OUTPUTS) + ", " + std::to_string(network[0].FC_1_OUTPUTS) + + ", 1))"); + } } @@ -291,7 
+308,7 @@ Network::trace_evaluate(const Position& { const auto materialist = featureTransformer->transform(pos, cache, transformedFeatures, bucket); - const auto positional = network[bucket]->propagate(transformedFeatures); + const auto positional = network[bucket].propagate(transformedFeatures); t.psqt[bucket] = static_cast(materialist / OutputScale); t.positional[bucket] = static_cast(positional / OutputScale); @@ -344,9 +361,8 @@ void Network::load_internal() { template void Network::initialize() { - Detail::initialize(featureTransformer); - for (std::size_t i = 0; i < LayerStacks; ++i) - Detail::initialize(network[i]); + featureTransformer = make_unique_large_page(); + network = make_unique_aligned(LayerStacks); } @@ -413,7 +429,7 @@ bool Network::read_parameters(std::istream& stream, return false; for (std::size_t i = 0; i < LayerStacks; ++i) { - if (!Detail::read_parameters(stream, *(network[i]))) + if (!Detail::read_parameters(stream, network[i])) return false; } return stream && stream.peek() == std::ios::traits_type::eof(); @@ -429,7 +445,7 @@ bool Network::write_parameters(std::ostream& stream, return false; for (std::size_t i = 0; i < LayerStacks; ++i) { - if (!Detail::write_parameters(stream, *(network[i]))) + if (!Detail::write_parameters(stream, network[i])) return false; } return bool(stream); diff --git a/src/nnue/network.h b/src/nnue/network.h index 23f56663..95253595 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -20,18 +20,21 @@ #define NETWORK_H_INCLUDED #include +#include #include #include #include +#include +#include #include -#include "../misc.h" +#include "../memory.h" #include "../position.h" #include "../types.h" +#include "nnue_accumulator.h" #include "nnue_architecture.h" #include "nnue_feature_transformer.h" #include "nnue_misc.h" -#include "nnue_accumulator.h" namespace Stockfish::Eval::NNUE { @@ -40,6 +43,7 @@ enum class EmbeddedNNUEType { SMALL, }; +using NetworkOutput = std::tuple; template class Network { @@ -50,19 +54,23 @@ class Network { evalFile(file), embeddedType(type) {} + Network(const Network& other); + Network(Network&& other) = default; + + Network& operator=(const Network& other); + Network& operator=(Network&& other) = default; + void load(const std::string& rootDirectory, std::string evalfilePath); bool save(const std::optional& filename) const; - Value evaluate(const Position& pos, - AccumulatorCaches::Cache* cache, - bool adjusted = false, - int* complexity = nullptr) const; + NetworkOutput evaluate(const Position& pos, + AccumulatorCaches::Cache* cache) const; void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const; - void verify(std::string evalfilePath) const; + void verify(std::string evalfilePath, const std::function&) const; NnueEvalTrace trace_evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const; @@ -85,7 +93,7 @@ class Network { LargePagePtr featureTransformer; // Evaluation function - AlignedPtr network[LayerStacks]; + AlignedPtr network; EvalFile evalFile; EmbeddedNNUEType embeddedType; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index b8dcf1e4..b92901e4 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -80,11 +80,6 @@ struct AccumulatorCaches { entry.clear(network.featureTransformer->biases); } - void clear(const BiasType* biases) { - for (auto& entry : entries) - entry.clear(biases); - } - std::array& operator[](Square sq) { return entries[sq]; } std::array, SQUARE_NB> entries; diff --git 
a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 2b11adef..fa180678 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -55,14 +55,14 @@ using psqt_vec_t = __m256i; #define vec_store(a, b) _mm512_store_si512(a, b) #define vec_add_16(a, b) _mm512_add_epi16(a, b) #define vec_sub_16(a, b) _mm512_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm512_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b) #define vec_zero() _mm512_setzero_epi32() #define vec_set_16(a) _mm512_set1_epi16(a) #define vec_max_16(a, b) _mm512_max_epi16(a, b) #define vec_min_16(a, b) _mm512_min_epi16(a, b) + #define vec_slli_16(a, b) _mm512_slli_epi16(a, b) // Inverse permuted at load time - #define vec_msb_pack_16(a, b) \ - _mm512_packs_epi16(_mm512_srli_epi16(a, 7), _mm512_srli_epi16(b, 7)) + #define vec_packus_16(a, b) _mm512_packus_epi16(a, b) #define vec_load_psqt(a) _mm256_load_si256(a) #define vec_store_psqt(a, b) _mm256_store_si256(a, b) #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) @@ -78,14 +78,14 @@ using psqt_vec_t = __m256i; #define vec_store(a, b) _mm256_store_si256(a, b) #define vec_add_16(a, b) _mm256_add_epi16(a, b) #define vec_sub_16(a, b) _mm256_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm256_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b) #define vec_zero() _mm256_setzero_si256() #define vec_set_16(a) _mm256_set1_epi16(a) #define vec_max_16(a, b) _mm256_max_epi16(a, b) #define vec_min_16(a, b) _mm256_min_epi16(a, b) + #define vec_slli_16(a, b) _mm256_slli_epi16(a, b) // Inverse permuted at load time - #define vec_msb_pack_16(a, b) \ - _mm256_packs_epi16(_mm256_srli_epi16(a, 7), _mm256_srli_epi16(b, 7)) + #define vec_packus_16(a, b) _mm256_packus_epi16(a, b) #define vec_load_psqt(a) _mm256_load_si256(a) #define vec_store_psqt(a, b) _mm256_store_si256(a, b) #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) @@ -101,12 +101,13 @@ using psqt_vec_t = __m128i; #define vec_store(a, b) *(a) = (b) #define vec_add_16(a, b) _mm_add_epi16(a, b) #define vec_sub_16(a, b) _mm_sub_epi16(a, b) - #define vec_mul_16(a, b) _mm_mullo_epi16(a, b) + #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b) #define vec_zero() _mm_setzero_si128() #define vec_set_16(a) _mm_set1_epi16(a) #define vec_max_16(a, b) _mm_max_epi16(a, b) #define vec_min_16(a, b) _mm_min_epi16(a, b) - #define vec_msb_pack_16(a, b) _mm_packs_epi16(_mm_srli_epi16(a, 7), _mm_srli_epi16(b, 7)) + #define vec_slli_16(a, b) _mm_slli_epi16(a, b) + #define vec_packus_16(a, b) _mm_packus_epi16(a, b) #define vec_load_psqt(a) (*(a)) #define vec_store_psqt(a, b) *(a) = (b) #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b) @@ -122,18 +123,14 @@ using psqt_vec_t = int32x4_t; #define vec_store(a, b) *(a) = (b) #define vec_add_16(a, b) vaddq_s16(a, b) #define vec_sub_16(a, b) vsubq_s16(a, b) - #define vec_mul_16(a, b) vmulq_s16(a, b) + #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b) #define vec_zero() \ vec_t { 0 } #define vec_set_16(a) vdupq_n_s16(a) #define vec_max_16(a, b) vmaxq_s16(a, b) #define vec_min_16(a, b) vminq_s16(a, b) -inline vec_t vec_msb_pack_16(vec_t a, vec_t b) { - const int8x8_t shifta = vshrn_n_s16(a, 7); - const int8x8_t shiftb = vshrn_n_s16(b, 7); - const int8x16_t compacted = vcombine_s8(shifta, shiftb); - return *reinterpret_cast(&compacted); -} + #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b)) + #define vec_packus_16(a, b) reinterpret_cast(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) #define vec_load_psqt(a) (*(a)) #define 
vec_store_psqt(a, b) *(a) = (b) #define vec_add_psqt_32(a, b) vaddq_s32(a, b) @@ -281,6 +278,19 @@ class FeatureTransformer { #endif } + inline void scale_weights(bool read) const { + for (IndexType j = 0; j < InputDimensions; ++j) + { + WeightType* w = const_cast(&weights[j * HalfDimensions]); + for (IndexType i = 0; i < HalfDimensions; ++i) + w[i] = read ? w[i] * 2 : w[i] / 2; + } + + BiasType* b = const_cast(biases); + for (IndexType i = 0; i < HalfDimensions; ++i) + b[i] = read ? b[i] * 2 : b[i] / 2; + } + // Read network parameters bool read_parameters(std::istream& stream) { @@ -289,6 +299,7 @@ class FeatureTransformer { read_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); permute_weights(inverse_order_packs); + scale_weights(true); return !stream.fail(); } @@ -296,12 +307,14 @@ class FeatureTransformer { bool write_parameters(std::ostream& stream) const { permute_weights(order_packs); + scale_weights(false); write_leb_128(stream, biases, HalfDimensions); write_leb_128(stream, weights, HalfDimensions * InputDimensions); write_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); permute_weights(inverse_order_packs); + scale_weights(true); return !stream.fail(); } @@ -332,24 +345,86 @@ class FeatureTransformer { constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; const vec_t Zero = vec_zero(); - const vec_t One = vec_set_16(127); + const vec_t One = vec_set_16(127 * 2); const vec_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); const vec_t* in1 = reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); vec_t* out = reinterpret_cast(output + offset); + // Per the NNUE architecture, here we want to multiply pairs of + // clipped elements and divide the product by 128. To do this, + // we can naively perform min/max operation to clip each of the + // four int16 vectors, mullo pairs together, then pack them into + // one int8 vector. However, there exists a faster way. + + // The idea here is to use the implicit clipping from packus to + // save us two vec_max_16 instructions. This clipping works due + // to the fact that any int16 integer below zero will be zeroed + // on packus. + + // Consider the case where the second element is negative. + // If we do standard clipping, that element will be zero, which + // means our pairwise product is zero. If we perform packus and + // remove the lower-side clip for the second element, then our + // product before packus will be negative, and is zeroed on pack. + // The two operation produce equivalent results, but the second + // one (using packus) saves one max operation per pair. + + // But here we run into a problem: mullo does not preserve the + // sign of the multiplication. We can get around this by doing + // mulhi, which keeps the sign. But that requires an additional + // tweak. + + // mulhi cuts off the last 16 bits of the resulting product, + // which is the same as performing a rightward shift of 16 bits. + // We can use this to our advantage. Recall that we want to + // divide the final product by 128, which is equivalent to a + // 7-bit right shift. Intuitively, if we shift the clipped + // value left by 9, and perform mulhi, which shifts the product + // right by 16 bits, then we will net a right shift of 7 bits. + // However, this won't work as intended. Since we clip the + // values to have a maximum value of 127, shifting it by 9 bits + // might occupy the signed bit, resulting in some positive + // values being interpreted as negative after the shift. 
+ + // There is a way, however, to get around this limitation. When + // loading the network, scale accumulator weights and biases by + // 2. To get the same pairwise multiplication result as before, + // we need to divide the product by 128 * 2 * 2 = 512, which + // amounts to a right shift of 9 bits. So now we only have to + // shift left by 7 bits, perform mulhi (shifts right by 16 bits) + // and net a 9 bit right shift. Since we scaled everything by + // two, the values are clipped at 127 * 2 = 254, which occupies + // 8 bits. Shifting it by 7 bits left will no longer occupy the + // signed bit, so we are safe. + + // Note that on NEON processors, we shift left by 6 instead + // because the instruction "vqdmulhq_s16" also doubles the + // return value after the multiplication, adding an extra shift + // to the left by 1, so we compensate by shifting less before + // the multiplication. + + constexpr int shift = + #if defined(USE_SSE2) + 7; + #else + 6; + #endif + for (IndexType j = 0; j < NumOutputChunks; ++j) { - const vec_t sum0a = vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero); - const vec_t sum0b = vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero); - const vec_t sum1a = vec_max_16(vec_min_16(in1[j * 2 + 0], One), Zero); - const vec_t sum1b = vec_max_16(vec_min_16(in1[j * 2 + 1], One), Zero); + const vec_t sum0a = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift); + const vec_t sum0b = + vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift); + const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One); + const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One); - const vec_t pa = vec_mul_16(sum0a, sum1a); - const vec_t pb = vec_mul_16(sum0b, sum1b); + const vec_t pa = vec_mulhi_16(sum0a, sum1a); + const vec_t pb = vec_mulhi_16(sum0b, sum1b); - out[j] = vec_msb_pack_16(pa, pb); + out[j] = vec_packus_16(pa, pb); } #else @@ -359,9 +434,9 @@ class FeatureTransformer { BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; BiasType sum1 = accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; - sum0 = std::clamp(sum0, 0, 127); - sum1 = std::clamp(sum1, 0, 127); - output[offset + j] = static_cast(unsigned(sum0 * sum1) / 128); + sum0 = std::clamp(sum0, 0, 127 * 2); + sum1 = std::clamp(sum1, 0, 127 * 2); + output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); } #endif @@ -378,11 +453,10 @@ class FeatureTransformer { private: template - [[nodiscard]] std::pair - try_find_computed_accumulator(const Position& pos) const { + StateInfo* try_find_computed_accumulator(const Position& pos) const { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. - StateInfo *st = pos.state(), *next = nullptr; + StateInfo* st = pos.state(); int gain = FeatureSet::refresh_cost(pos); while (st->previous && !(st->*accPtr).computed[Perspective]) { @@ -391,236 +465,205 @@ class FeatureTransformer { if (FeatureSet::requires_refresh(st, Perspective) || (gain -= FeatureSet::update_cost(st) + 1) < 0) break; - next = st; - st = st->previous; + st = st->previous; } - return {st, next}; + return st; } - // NOTE: The parameter states_to_update is an array of position states. - // All states must be sequential, that is states_to_update[i] must either be reachable - // by repeatedly applying ->previous from states_to_update[i+1]. - // computed_st must be reachable by repeatedly applying ->previous on - // states_to_update[0]. 
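The arithmetic described in the comments above can be sanity-checked with a small scalar model. The snippet below is purely illustrative and not part of the patch: pairwise_product() mirrors the SSE/AVX path (clip, shift left by 7, mulhi, packus), assuming weights and biases have already been doubled at load time, and reference() is the scalar fallback from the same hunk; for every pair of int16 inputs the two should produce the same byte.

    #include <algorithm>
    #include <cstdint>

    // Scalar stand-in for vec_min_16/vec_max_16 + vec_slli_16(..., 7) + vec_mulhi_16 + vec_packus_16
    std::uint8_t pairwise_product(std::int16_t in0, std::int16_t in1) {
        std::int16_t a  = std::clamp<std::int16_t>(in0, 0, 254) << 7;  // 254 == 127 * 2; 254 << 7 still fits in int16
        std::int16_t b  = std::min<std::int16_t>(in1, 254);            // only the upper clip; packus supplies the lower one
        std::int16_t hi = static_cast<std::int16_t>((std::int32_t(a) * b) >> 16);  // mulhi keeps the sign
        return static_cast<std::uint8_t>(std::clamp<int>(hi, 0, 255)); // packus: negative products become 0
    }

    // The scalar fallback from this hunk: clamp both halves to [0, 254] and divide by 512
    std::uint8_t reference(std::int16_t in0, std::int16_t in1) {
        int s0 = std::clamp<int>(in0, 0, 254);
        int s1 = std::clamp<int>(in1, 0, 254);
        return static_cast<std::uint8_t>(unsigned(s0 * s1) / 512);
    }

On NEON the shift would be 6 instead of 7, since vqdmulhq_s16 already doubles the product, as the comment above notes.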
- template - void update_accumulator_incremental(const Position& pos, - StateInfo* computed_st, - StateInfo* states_to_update[N]) const { - static_assert(N > 0); - assert([&]() { - for (size_t i = 0; i < N; ++i) - { - if (states_to_update[i] == nullptr) - return false; - } - return true; - }()); + // It computes the accumulator of the next position, or updates the + // current position's accumulator if CurrentOnly is true. + template + void update_accumulator_incremental(const Position& pos, StateInfo* computed) const { + assert((computed->*accPtr).computed[Perspective]); + assert(computed->next != nullptr); #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch + // is defined in the VECTOR code below, once in each branch. vec_t acc[NumRegs]; psqt_vec_t psqt[NumPsqtRegs]; #endif - // Update incrementally going back through states_to_update. - // Gather all features to be updated. const Square ksq = pos.square(Perspective); // The size must be enough to contain the largest possible update. // That might depend on the feature set and generally relies on the // feature set's update cost calculation to be correct and never allow // updates with more added/removed features than MaxActiveDimensions. - FeatureSet::IndexList removed[N], added[N]; + FeatureSet::IndexList removed, added; - for (int i = N - 1; i >= 0; --i) - { - (states_to_update[i]->*accPtr).computed[Perspective] = true; + if constexpr (CurrentOnly) + for (StateInfo* st = pos.state(); st != computed; st = st->previous) + FeatureSet::append_changed_indices(ksq, st->dirtyPiece, removed, + added); + else + FeatureSet::append_changed_indices(ksq, computed->next->dirtyPiece, + removed, added); - const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; + StateInfo* next = CurrentOnly ? pos.state() : computed->next; + assert(!(next->*accPtr).computed[Perspective]); - for (StateInfo* st2 = states_to_update[i]; st2 != end_state; st2 = st2->previous) - FeatureSet::append_changed_indices(ksq, st2->dirtyPiece, removed[i], - added[i]); - } - - StateInfo* st = computed_st; - - // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. 
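As the comment at the top of this function notes, the fixed states_to_update[N] array is gone: the update now starts from the last computed accumulator and walks forward through StateInfo::next, computing one accumulator per call (or, with CurrentOnly, folding all pending dirty pieces into a single update of the current position). A stripped-down model of that control flow, with hypothetical types and no feature bookkeeping, looks like this:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct Node {                 // stand-in for StateInfo
        bool  computed = false;   // stand-in for (st->*accPtr).computed[Perspective]
        Node* previous = nullptr;
        Node* next     = nullptr;
    };

    // Computes computed->next from computed, then recurses toward target.
    void update_forward(Node* computed, Node* target) {
        assert(computed->computed && computed->next != nullptr);
        Node* next = computed->next;   // the real code applies next's dirty pieces here
        next->computed = true;
        if (next != target)
            update_forward(next, target);
    }

    int main() {
        std::vector<Node> chain(5);    // oldest state ... current position
        for (std::size_t i = 1; i < chain.size(); ++i) {
            chain[i].previous = &chain[i - 1];
            chain[i - 1].next = &chain[i];
        }
        chain[0].computed = true;      // only the oldest accumulator is usable

        Node* oldest = &chain.back();  // try_find_computed_accumulator: walk ->previous
        while (!oldest->computed)
            oldest = oldest->previous;

        update_forward(oldest, &chain.back());
        assert(chain.back().computed);
        return 0;
    }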
#ifdef VECTOR - - if (N == 1 && (removed[0].size() == 1 || removed[0].size() == 2) && added[0].size() == 1) + if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) { - assert(states_to_update[0]); - auto accIn = - reinterpret_cast(&(st->*accPtr).accumulation[Perspective][0]); - auto accOut = reinterpret_cast( - &(states_to_update[0]->*accPtr).accumulation[Perspective][0]); + reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); + auto accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); - const IndexType offsetR0 = HalfDimensions * removed[0][0]; + const IndexType offsetR0 = HalfDimensions * removed[0]; auto columnR0 = reinterpret_cast(&weights[offsetR0]); - const IndexType offsetA = HalfDimensions * added[0][0]; + const IndexType offsetA = HalfDimensions * added[0]; auto columnA = reinterpret_cast(&weights[offsetA]); - if (removed[0].size() == 1) + if (removed.size() == 1) { - for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); - ++k) - accOut[k] = vec_add_16(vec_sub_16(accIn[k], columnR0[k]), columnA[k]); + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA[i]); } else { - const IndexType offsetR1 = HalfDimensions * removed[0][1]; + const IndexType offsetR1 = HalfDimensions * removed[1]; auto columnR1 = reinterpret_cast(&weights[offsetR1]); - for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); - ++k) - accOut[k] = vec_sub_16(vec_add_16(accIn[k], columnA[k]), - vec_add_16(columnR0[k], columnR1[k])); + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]), + vec_add_16(columnR0[i], columnR1[i])); } - auto accPsqtIn = - reinterpret_cast(&(st->*accPtr).psqtAccumulation[Perspective][0]); - auto accPsqtOut = reinterpret_cast( - &(states_to_update[0]->*accPtr).psqtAccumulation[Perspective][0]); + auto accPsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][0]); + auto accPsqtOut = + reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); - const IndexType offsetPsqtR0 = PSQTBuckets * removed[0][0]; + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); - const IndexType offsetPsqtA = PSQTBuckets * added[0][0]; + const IndexType offsetPsqtA = PSQTBuckets * added[0]; auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); - if (removed[0].size() == 1) + if (removed.size() == 1) { - for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); - ++k) - accPsqtOut[k] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[k], columnPsqtR0[k]), - columnPsqtA[k]); + for (std::size_t i = 0; + i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) + accPsqtOut[i] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), + columnPsqtA[i]); } else { - const IndexType offsetPsqtR1 = PSQTBuckets * removed[0][1]; + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; auto columnPsqtR1 = reinterpret_cast(&psqtWeights[offsetPsqtR1]); - for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); - ++k) - accPsqtOut[k] = - vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[k], columnPsqtA[k]), - vec_add_psqt_32(columnPsqtR0[k], columnPsqtR1[k])); + for (std::size_t i = 0; + i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) + 
accPsqtOut[i] = + vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA[i]), + vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])); } } else { - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) + for (IndexType i = 0; i < HalfDimensions / TileHeight; ++i) { // Load accumulator auto accTileIn = reinterpret_cast( - &(st->*accPtr).accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_load(&accTileIn[k]); + &(computed->*accPtr).accumulation[Perspective][i * TileHeight]); + for (IndexType j = 0; j < NumRegs; ++j) + acc[j] = vec_load(&accTileIn[j]); - for (IndexType i = 0; i < N; ++i) + // Difference calculation for the deactivated features + for (const auto index : removed) { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) - { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } - - // Difference calculation for the activated features - for (const auto index : added[i]) - { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - // Store accumulator - auto accTileOut = reinterpret_cast( - &(states_to_update[i]->*accPtr).accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - vec_store(&accTileOut[k], acc[k]); + const IndexType offset = HalfDimensions * index + i * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumRegs; ++j) + acc[j] = vec_sub_16(acc[j], column[j]); } + + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index + i * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumRegs; ++j) + acc[j] = vec_add_16(acc[j], column[j]); + } + + // Store accumulator + auto accTileOut = reinterpret_cast( + &(next->*accPtr).accumulation[Perspective][i * TileHeight]); + for (IndexType j = 0; j < NumRegs; ++j) + vec_store(&accTileOut[j], acc[j]); } - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) + for (IndexType i = 0; i < PSQTBuckets / PsqtTileHeight; ++i) { // Load accumulator auto accTilePsqtIn = reinterpret_cast( - &(st->*accPtr).psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_load_psqt(&accTilePsqtIn[k]); + &(computed->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); + for (std::size_t j = 0; j < NumPsqtRegs; ++j) + psqt[j] = vec_load_psqt(&accTilePsqtIn[j]); - for (IndexType i = 0; i < N; ++i) + // Difference calculation for the deactivated features + for (const auto index : removed) { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) - { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); - } - - // Difference calculation for the activated features - for (const auto index : added[i]) - { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] 
= vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - // Store accumulator - auto accTilePsqtOut = reinterpret_cast( - &(states_to_update[i]->*accPtr) - .psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqtOut[k], psqt[k]); + const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < NumPsqtRegs; ++j) + psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]); } + + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < NumPsqtRegs; ++j) + psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]); + } + + // Store accumulator + auto accTilePsqtOut = reinterpret_cast( + &(next->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); + for (std::size_t j = 0; j < NumPsqtRegs; ++j) + vec_store_psqt(&accTilePsqtOut[j], psqt[j]); } } #else - for (IndexType i = 0; i < N; ++i) + std::memcpy((next->*accPtr).accumulation[Perspective], + (computed->*accPtr).accumulation[Perspective], + HalfDimensions * sizeof(BiasType)); + std::memcpy((next->*accPtr).psqtAccumulation[Perspective], + (computed->*accPtr).psqtAccumulation[Perspective], + PSQTBuckets * sizeof(PSQTWeightType)); + + // Difference calculation for the deactivated features + for (const auto index : removed) { - std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective], - (st->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType)); + const IndexType offset = HalfDimensions * index; + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] -= weights[offset + i]; - for (std::size_t k = 0; k < PSQTBuckets; ++k) - (states_to_update[i]->*accPtr).psqtAccumulation[Perspective][k] = - (st->*accPtr).psqtAccumulation[Perspective][k]; + for (std::size_t i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] -= + psqtWeights[index * PSQTBuckets + i]; + } - st = states_to_update[i]; + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index; + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] += weights[offset + i]; - // Difference calculation for the deactivated features - for (const auto index : removed[i]) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - (st->*accPtr).accumulation[Perspective][j] -= weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - (st->*accPtr).psqtAccumulation[Perspective][k] -= - psqtWeights[index * PSQTBuckets + k]; - } - - // Difference calculation for the activated features - for (const auto index : added[i]) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - (st->*accPtr).accumulation[Perspective][j] += weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - (st->*accPtr).psqtAccumulation[Perspective][k] += - psqtWeights[index * PSQTBuckets + k]; - } + for (std::size_t i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] += + psqtWeights[index * PSQTBuckets + i]; } #endif + + (next->*accPtr).computed[Perspective] = true; + + if (!CurrentOnly && next != pos.state()) + 
update_accumulator_incremental(pos, next); } template @@ -664,7 +707,10 @@ class FeatureTransformer { for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { + auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); auto entryTile = reinterpret_cast(&entry.accumulation[j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = entryTile[k]; @@ -679,7 +725,7 @@ class FeatureTransformer { auto columnA = reinterpret_cast(&weights[offsetA]); for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]); + acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); } for (; i < int(removed.size()); ++i) { @@ -702,12 +748,17 @@ class FeatureTransformer { for (IndexType k = 0; k < NumRegs; k++) vec_store(&entryTile[k], acc[k]); + for (IndexType k = 0; k < NumRegs; k++) + vec_store(&accTile[k], acc[k]); } for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { + auto accTilePsqt = reinterpret_cast( + &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); auto entryTilePsqt = reinterpret_cast(&entry.psqtAccumulation[j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = entryTilePsqt[k]; @@ -732,6 +783,8 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&entryTilePsqt[k], psqt[k]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqt[k], psqt[k]); } #else @@ -755,16 +808,15 @@ class FeatureTransformer { entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k]; } -#endif - // The accumulator of the refresh entry has been updated. - // Now copy its content to the actual accumulator we were refreshing + // Now copy its content to the actual accumulator we were refreshing. std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, sizeof(BiasType) * HalfDimensions); std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, sizeof(int32_t) * PSQTBuckets); +#endif for (Color c : {WHITE, BLACK}) entry.byColorBB[c] = pos.pieces(c); @@ -786,14 +838,10 @@ class FeatureTransformer { if ((pos.state()->*accPtr).computed[Perspective]) return; - auto [oldest_st, _] = try_find_computed_accumulator(pos); + StateInfo* oldest = try_find_computed_accumulator(pos); - if ((oldest_st->*accPtr).computed[Perspective]) - { - // Only update current position accumulator to minimize work. - StateInfo* states_to_update[1] = {pos.state()}; - update_accumulator_incremental(pos, oldest_st, states_to_update); - } + if ((oldest->*accPtr).computed[Perspective] && oldest != pos.state()) + update_accumulator_incremental(pos, oldest); else update_accumulator_refresh_cache(pos, cache); } @@ -802,31 +850,12 @@ class FeatureTransformer { void update_accumulator(const Position& pos, AccumulatorCaches::Cache* cache) const { - auto [oldest_st, next] = try_find_computed_accumulator(pos); + StateInfo* oldest = try_find_computed_accumulator(pos); - if ((oldest_st->*accPtr).computed[Perspective]) - { - if (next == nullptr) - return; - - // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. - // Currently we update 2 accumulators. - // 1. for the current position - // 2. the next accumulator after the computed one - // The heuristic may change in the future. 
- if (next == pos.state()) - { - StateInfo* states_to_update[1] = {next}; - - update_accumulator_incremental(pos, oldest_st, states_to_update); - } - else - { - StateInfo* states_to_update[2] = {next, pos.state()}; - - update_accumulator_incremental(pos, oldest_st, states_to_update); - } - } + if ((oldest->*accPtr).computed[Perspective] && oldest != pos.state()) + // Start from the oldest computed accumulator, update all the + // accumulators up to the current position. + update_accumulator_incremental(pos, oldest); else update_accumulator_refresh_cache(pos, cache); } diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index bf73a58b..a2bece21 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include "../evaluate.h" #include "../position.h" @@ -45,9 +46,7 @@ constexpr std::string_view PieceToChar(" PNBRQK pnbrqk"); void hint_common_parent_position(const Position& pos, const Networks& networks, AccumulatorCaches& caches) { - - int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move())); - if (simpleEvalAbs > Eval::SmallNetThreshold) + if (Eval::use_smallnet(pos)) networks.small.hint_common_access(pos, &caches.small); else networks.big.hint_common_access(pos, &caches.big); @@ -127,14 +126,15 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat board[y][x] = board[y][x + 8] = board[y + 3][x + 8] = board[y + 3][x] = '+'; if (pc != NO_PIECE) board[y + 1][x + 4] = PieceToChar[pc]; - if (value != VALUE_NONE) + if (is_valid(value)) format_cp_compact(value, &board[y + 2][x + 2], pos); }; // We estimate the value of each piece by doing a differential evaluation from // the current base eval, simulating the removal of the piece from its square. - Value base = networks.big.evaluate(pos, &caches.big); - base = pos.side_to_move() == WHITE ? base : -base; + auto [psqt, positional] = networks.big.evaluate(pos, &caches.big); + Value base = psqt + positional; + base = pos.side_to_move() == WHITE ? base : -base; for (File f = FILE_A; f <= FILE_H; ++f) for (Rank r = RANK_1; r <= RANK_8; ++r) @@ -150,9 +150,10 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat pos.remove_piece(sq); st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = false; - Value eval = networks.big.evaluate(pos, &caches.big); - eval = pos.side_to_move() == WHITE ? eval : -eval; - v = base - eval; + std::tie(psqt, positional) = networks.big.evaluate(pos, &caches.big); + Value eval = psqt + positional; + eval = pos.side_to_move() == WHITE ? 
eval : -eval; + v = base - eval; pos.put_piece(pc, sq); st->accumulatorBig.computed[WHITE] = st->accumulatorBig.computed[BLACK] = false; @@ -177,16 +178,16 @@ trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::Accumulat for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) { - ss << "| " << bucket << " "; - ss << " | "; + ss << "| " << bucket << " " // + << " | "; format_cp_aligned_dot(t.psqt[bucket], ss, pos); - ss << " " + ss << " " // << " | "; format_cp_aligned_dot(t.positional[bucket], ss, pos); - ss << " " + ss << " " // << " | "; format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos); - ss << " " + ss << " " // << " |"; if (bucket == t.correctBucket) ss << " <-- this bucket is used"; diff --git a/src/numa.h b/src/numa.h new file mode 100644 index 00000000..1063721e --- /dev/null +++ b/src/numa.h @@ -0,0 +1,1346 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef NUMA_H_INCLUDED +#define NUMA_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory.h" + +// We support linux very well, but we explicitly do NOT support Android, +// because there is no affected systems, not worth maintaining. +#if defined(__linux__) && !defined(__ANDROID__) + #if !defined(_GNU_SOURCE) + #define _GNU_SOURCE + #endif + #include +#elif defined(_WIN64) + + #if _WIN32_WINNT < 0x0601 + #undef _WIN32_WINNT + #define _WIN32_WINNT 0x0601 // Force to include needed API prototypes + #endif + +// On Windows each processor group can have up to 64 processors. +// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups +static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64; + + #if !defined(NOMINMAX) + #define NOMINMAX + #endif + #include + #if defined small + #undef small + #endif + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks +using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT); + +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks +using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT); + +#endif + +#include "misc.h" + +namespace Stockfish { + +using CpuIndex = size_t; +using NumaIndex = size_t; + +inline CpuIndex get_hardware_concurrency() { + CpuIndex concurrency = std::thread::hardware_concurrency(); + + // Get all processors across all processor groups on windows, since + // hardware_concurrency() only returns the number of processors in + // the first group, because only these are available to std::thread. 
+#ifdef _WIN64 + concurrency = std::max(concurrency, GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)); +#endif + + return concurrency; +} + +inline const CpuIndex SYSTEM_THREADS_NB = std::max(1, get_hardware_concurrency()); + +#if defined(_WIN64) + +struct WindowsAffinity { + std::optional> oldApi; + std::optional> newApi; + + // We also provide diagnostic for when the affinity is set to nullopt + // whether it was due to being indeterminate. If affinity is indeterminate + // it is best to assume it is not set at all, so consistent with the meaning + // of the nullopt affinity. + bool isNewDeterminate = true; + bool isOldDeterminate = true; + + std::optional> get_combined() const { + if (!oldApi.has_value()) + return newApi; + if (!newApi.has_value()) + return oldApi; + + std::set intersect; + std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(), + std::inserter(intersect, intersect.begin())); + return intersect; + } + + // Since Windows 11 and Windows Server 2022 thread affinities can span + // processor groups and can be set as such by a new WinAPI function. However, + // we may need to force using the old API if we detect that the process has + // affinity set by the old API already and we want to override that. Due to the + // limitations of the old API we cannot detect its use reliably. There will be + // cases where we detect not use but it has actually been used and vice versa. + + bool likely_used_old_api() const { return oldApi.has_value() || !isOldDeterminate; } +}; + +inline std::pair> get_process_group_affinity() { + + // GetProcessGroupAffinity requires the GroupArray argument to be + // aligned to 4 bytes instead of just 2. + static constexpr size_t GroupArrayMinimumAlignment = 4; + static_assert(GroupArrayMinimumAlignment >= alignof(USHORT)); + + // The function should succeed the second time, but it may fail if the group + // affinity has changed between GetProcessGroupAffinity calls. In such case + // we consider this a hard error, as we Cannot work with unstable affinities + // anyway. + static constexpr int MAX_TRIES = 2; + USHORT GroupCount = 1; + for (int i = 0; i < MAX_TRIES; ++i) + { + auto GroupArray = std::make_unique( + GroupCount + (GroupArrayMinimumAlignment / alignof(USHORT) - 1)); + + USHORT* GroupArrayAligned = align_ptr_up(GroupArray.get()); + + const BOOL status = + GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, GroupArrayAligned); + + if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + { + break; + } + + if (status != 0) + { + return std::make_pair(status, + std::vector(GroupArrayAligned, GroupArrayAligned + GroupCount)); + } + } + + return std::make_pair(0, std::vector()); +} + +// On Windows there are two ways to set affinity, and therefore 2 ways to get it. +// These are not consistent, so we have to check both. In some cases it is actually +// not possible to determine affinity. For example when two different threads have +// affinity on different processor groups, set using SetThreadAffinityMask, we cannot +// retrieve the actual affinities. +// From documentation on GetProcessAffinityMask: +// > If the calling process contains threads in multiple groups, +// > the function returns zero for both affinity masks. +// In such cases we just give up and assume we have affinity for all processors. 
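To make the semantics of get_combined() concrete: a nullopt mask means "no affinity was set", so it never restricts the other API's mask, while two real masks are intersected. The sketch below is illustrative only (a free function rather than the member above):

    #include <cassert>
    #include <cstddef>
    #include <optional>
    #include <set>

    using CpuIndex = std::size_t;
    using Mask     = std::optional<std::set<CpuIndex>>;

    Mask combine(const Mask& oldApi, const Mask& newApi) {
        if (!oldApi) return newApi;    // nullopt == unrestricted, defer to the other API
        if (!newApi) return oldApi;
        std::set<CpuIndex> both;       // otherwise take the intersection, as get_combined() does
        for (CpuIndex c : *oldApi)
            if (newApi->count(c))
                both.insert(c);
        return both;
    }

    int main() {
        assert(!combine({}, {}).has_value());                            // still unrestricted
        assert(combine(std::set<CpuIndex>{0, 1, 2}, {}) == std::set<CpuIndex>({0, 1, 2}));
        assert(combine(std::set<CpuIndex>{0, 1, 2}, std::set<CpuIndex>{1, 2, 3})
               == std::set<CpuIndex>({1, 2}));
        return 0;
    }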
+// nullopt means no affinity is set, that is, all processors are allowed +inline WindowsAffinity get_process_affinity() { + HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks")); + + BOOL status = 0; + + WindowsAffinity affinity; + + if (GetThreadSelectedCpuSetMasks_f != nullptr) + { + USHORT RequiredMaskCount; + status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount); + + // We expect ERROR_INSUFFICIENT_BUFFER from GetThreadSelectedCpuSetMasks, + // but other failure is an actual error. + if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) + { + affinity.isNewDeterminate = false; + } + else if (RequiredMaskCount > 0) + { + // If RequiredMaskCount then these affinities were never set, but it's + // not consistent so GetProcessAffinityMask may still return some affinity. + auto groupAffinities = std::make_unique(RequiredMaskCount); + + status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(), + RequiredMaskCount, &RequiredMaskCount); + + if (status == 0) + { + affinity.isNewDeterminate = false; + } + else + { + std::set cpus; + + for (USHORT i = 0; i < RequiredMaskCount; ++i) + { + const size_t procGroupIndex = groupAffinities[i].Group; + + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (groupAffinities[i].Mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + } + + affinity.newApi = std::move(cpus); + } + } + } + + // NOTE: There is no way to determine full affinity using the old API if + // individual threads set affinity on different processor groups. + + DWORD_PTR proc, sys; + status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys); + + // If proc == 0 then we cannot determine affinity because it spans processor groups. + // On Windows 11 and Server 2022 it will instead + // > If, however, hHandle specifies a handle to the current process, the function + // > always uses the calling thread's primary group (which by default is the same + // > as the process' primary group) in order to set the + // > lpProcessAffinityMask and lpSystemAffinityMask. + // So it will never be indeterminate here. We can only make assumptions later. + if (status == 0 || proc == 0) + { + affinity.isOldDeterminate = false; + return affinity; + } + + // If SetProcessAffinityMask was never called the affinity must span + // all processor groups, but if it was called it must only span one. + + std::vector groupAffinity; // We need to capture this later and capturing + // from structured bindings requires c++20. + + std::tie(status, groupAffinity) = get_process_group_affinity(); + if (status == 0) + { + affinity.isOldDeterminate = false; + return affinity; + } + + if (groupAffinity.size() == 1) + { + // We detect the case when affinity is set to all processors and correctly + // leave affinity.oldApi as nullopt. + if (GetActiveProcessorGroupCount() != 1 || proc != sys) + { + std::set cpus; + + const size_t procGroupIndex = groupAffinity[0]; + + const uint64_t mask = static_cast(proc); + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (mask & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + + affinity.oldApi = std::move(cpus); + } + } + else + { + // If we got here it means that either SetProcessAffinityMask was never set + // or we're on Windows 11/Server 2022. 
+ + // Since Windows 11 and Windows Server 2022 the behaviour of + // GetProcessAffinityMask changed: + // > If, however, hHandle specifies a handle to the current process, + // > the function always uses the calling thread's primary group + // > (which by default is the same as the process' primary group) + // > in order to set the lpProcessAffinityMask and lpSystemAffinityMask. + // In which case we can actually retrieve the full affinity. + + if (GetThreadSelectedCpuSetMasks_f != nullptr) + { + std::thread th([&]() { + std::set cpus; + bool isAffinityFull = true; + + for (auto procGroupIndex : groupAffinity) + { + const int numActiveProcessors = + GetActiveProcessorCount(static_cast(procGroupIndex)); + + // We have to schedule to two different processors + // and & the affinities we get. Otherwise our processor + // choice could influence the resulting affinity. + // We assume the processor IDs within the group are + // filled sequentially from 0. + uint64_t procCombined = std::numeric_limits::max(); + uint64_t sysCombined = std::numeric_limits::max(); + + for (int i = 0; i < std::min(numActiveProcessors, 2); ++i) + { + GROUP_AFFINITY GroupAffinity; + std::memset(&GroupAffinity, 0, sizeof(GROUP_AFFINITY)); + GroupAffinity.Group = static_cast(procGroupIndex); + + GroupAffinity.Mask = static_cast(1) << i; + + status = + SetThreadGroupAffinity(GetCurrentThread(), &GroupAffinity, nullptr); + if (status == 0) + { + affinity.isOldDeterminate = false; + return; + } + + SwitchToThread(); + + DWORD_PTR proc2, sys2; + status = GetProcessAffinityMask(GetCurrentProcess(), &proc2, &sys2); + if (status == 0) + { + affinity.isOldDeterminate = false; + return; + } + + procCombined &= static_cast(proc2); + sysCombined &= static_cast(sys2); + } + + if (procCombined != sysCombined) + isAffinityFull = false; + + for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j) + { + if (procCombined & (KAFFINITY(1) << j)) + cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j); + } + } + + // We have to detect the case where the affinity was not set, + // or is set to all processors so that we correctly produce as + // std::nullopt result. + if (!isAffinityFull) + { + affinity.oldApi = std::move(cpus); + } + }); + + th.join(); + } + } + + return affinity; +} + +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + +inline std::set get_process_affinity() { + + std::set cpus; + + // For unsupported systems, or in case of a soft error, we may assume + // all processors are available for use. + [[maybe_unused]] auto set_to_all_cpus = [&]() { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cpus.insert(c); + }; + + // cpu_set_t by default holds 1024 entries. This may not be enough soon, + // but there is no easy way to determine how many threads there actually + // is. In this case we just choose a reasonable upper bound. 
+ static constexpr CpuIndex MaxNumCpus = 1024 * 64; + + cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus); + + CPU_ZERO_S(masksize, mask); + + const int status = sched_getaffinity(0, masksize, mask); + + if (status != 0) + { + CPU_FREE(mask); + std::exit(EXIT_FAILURE); + } + + for (CpuIndex c = 0; c < MaxNumCpus; ++c) + if (CPU_ISSET_S(c, masksize, mask)) + cpus.insert(c); + + CPU_FREE(mask); + + return cpus; +} + +#endif + +#if defined(__linux__) && !defined(__ANDROID__) + +inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity(); + +#elif defined(_WIN64) + +inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity(); +inline static const auto STARTUP_USE_OLD_AFFINITY_API = + STARTUP_PROCESSOR_AFFINITY.likely_used_old_api(); + +#endif + +// We want to abstract the purpose of storing the numa node index somewhat. +// Whoever is using this does not need to know the specifics of the replication +// machinery to be able to access NUMA replicated memory. +class NumaReplicatedAccessToken { + public: + NumaReplicatedAccessToken() : + n(0) {} + + explicit NumaReplicatedAccessToken(NumaIndex idx) : + n(idx) {} + + NumaIndex get_numa_index() const { return n; } + + private: + NumaIndex n; +}; + +// Designed as immutable, because there is no good reason to alter an already +// existing config in a way that doesn't require recreating it completely, and +// it would be complex and expensive to maintain class invariants. +// The CPU (processor) numbers always correspond to the actual numbering used +// by the system. The NUMA node numbers MAY NOT correspond to the system's +// numbering of the NUMA nodes. In particular, empty nodes may be removed, or +// the user may create custom nodes. It is guaranteed that NUMA nodes are NOT +// empty: every node exposed by NumaConfig has at least one processor assigned. +// +// We use startup affinities so as not to modify its own behaviour in time. +// +// Since Stockfish doesn't support exceptions all places where an exception +// should be thrown are replaced by std::exit. +class NumaConfig { + public: + NumaConfig() : + highestCpuIndex(0), + customAffinity(false) { + const auto numCpus = SYSTEM_THREADS_NB; + add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1); + } + + // This function queries the system for the mapping of processors to NUMA nodes. + // On Linux we read from standardized kernel sysfs, with a fallback to single NUMA + // node. On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see + // comment for Windows implementation of get_process_affinity. + static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) { + NumaConfig cfg = empty(); + +#if defined(__linux__) && !defined(__ANDROID__) + + std::set allowedCpus; + + if (respectProcessAffinity) + allowedCpus = STARTUP_PROCESSOR_AFFINITY; + + auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) { + return !respectProcessAffinity || allowedCpus.count(c) == 1; + }; + + // On Linux things are straightforward, since there's no processor groups and + // any thread can be scheduled on all processors. 
+ // We try to gather this information from the sysfs first + // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node + + bool useFallback = false; + auto fallback = [&]() { + useFallback = true; + cfg = empty(); + }; + + // /sys/devices/system/node/online contains information about active NUMA nodes + auto nodeIdsStr = read_file_to_string("/sys/devices/system/node/online"); + if (!nodeIdsStr.has_value() || nodeIdsStr->empty()) + { + fallback(); + } + else + { + remove_whitespace(*nodeIdsStr); + for (size_t n : indices_from_shortened_string(*nodeIdsStr)) + { + // /sys/devices/system/node/node.../cpulist + std::string path = + std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist"; + auto cpuIdsStr = read_file_to_string(path); + // Now, we only bail if the file does not exist. Some nodes may be + // empty, that's fine. An empty node still has a file that appears + // to have some whitespace, so we need to handle that. + if (!cpuIdsStr.has_value()) + { + fallback(); + break; + } + else + { + remove_whitespace(*cpuIdsStr); + for (size_t c : indices_from_shortened_string(*cpuIdsStr)) + { + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(n, c); + } + } + } + } + + if (useFallback) + { + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + if (is_cpu_allowed(c)) + cfg.add_cpu_to_node(NumaIndex{0}, c); + } + +#elif defined(_WIN64) + + std::optional> allowedCpus; + + if (respectProcessAffinity) + allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined(); + + // The affinity cannot be determined in all cases on Windows, + // but we at least guarantee that the number of allowed processors + // is >= number of processors in the affinity mask. In case the user + // is not satisfied they must set the processor numbers explicitly. + auto is_cpu_allowed = [&allowedCpus](CpuIndex c) { + return !allowedCpus.has_value() || allowedCpus->count(c) == 1; + }; + + WORD numProcGroups = GetActiveProcessorGroupCount(); + for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup) + { + for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number) + { + PROCESSOR_NUMBER procnum; + procnum.Group = procGroup; + procnum.Number = number; + procnum.Reserved = 0; + USHORT nodeNumber; + + const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber); + const CpuIndex c = static_cast(procGroup) * WIN_PROCESSOR_GROUP_SIZE + + static_cast(number); + if (status != 0 && nodeNumber != std::numeric_limits::max() + && is_cpu_allowed(c)) + { + cfg.add_cpu_to_node(nodeNumber, c); + } + } + } + + // Split the NUMA nodes to be contained within a group if necessary. + // This is needed between Windows 10 Build 20348 and Windows 11, because + // the new NUMA allocation behaviour was introduced while there was + // still no way to set thread affinity spanning multiple processor groups. + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + // We also do this is if need to force old API for some reason. + // + // 2024-08-26: It appears that we need to actually always force this behaviour. + // While Windows allows this to work now, such assignments have bad interaction + // with the scheduler - in particular it still prefers scheduling on the thread's + // "primary" node, even if it means scheduling SMT processors first. 
+ // See https://github.com/official-stockfish/Stockfish/issues/5551 + // See https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups + // + // Each process is assigned a primary group at creation, and by default all + // of its threads' primary group is the same. Each thread's ideal processor + // is in the thread's primary group, so threads will preferentially be + // scheduled to processors on their primary group, but they are able to + // be scheduled to processors on any other group. + // + // used to be guarded by if (STARTUP_USE_OLD_AFFINITY_API) + { + NumaConfig splitCfg = empty(); + + NumaIndex splitNodeIndex = 0; + for (const auto& cpus : cfg.nodes) + { + if (cpus.empty()) + continue; + + size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE; + for (CpuIndex c : cpus) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + if (procGroupIndex != lastProcGroupIndex) + { + splitNodeIndex += 1; + lastProcGroupIndex = procGroupIndex; + } + splitCfg.add_cpu_to_node(splitNodeIndex, c); + } + splitNodeIndex += 1; + } + + cfg = std::move(splitCfg); + } + +#else + + // Fallback for unsupported systems. + for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) + cfg.add_cpu_to_node(NumaIndex{0}, c); + +#endif + + // We have to ensure no empty NUMA nodes persist. + cfg.remove_empty_numa_nodes(); + + // If the user explicitly opts out from respecting the current process affinity + // then it may be inconsistent with the current affinity (obviously), so we + // consider it custom. + if (!respectProcessAffinity) + cfg.customAffinity = true; + + return cfg; + } + + // ':'-separated numa nodes + // ','-separated cpu indices + // supports "first-last" range syntax for cpu indices + // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191" + static NumaConfig from_string(const std::string& s) { + NumaConfig cfg = empty(); + + NumaIndex n = 0; + for (auto&& nodeStr : split(s, ":")) + { + auto indices = indices_from_shortened_string(std::string(nodeStr)); + if (!indices.empty()) + { + for (auto idx : indices) + { + if (!cfg.add_cpu_to_node(n, CpuIndex(idx))) + std::exit(EXIT_FAILURE); + } + + n += 1; + } + } + + cfg.customAffinity = true; + + return cfg; + } + + NumaConfig(const NumaConfig&) = delete; + NumaConfig(NumaConfig&&) = default; + NumaConfig& operator=(const NumaConfig&) = delete; + NumaConfig& operator=(NumaConfig&&) = default; + + bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; } + + NumaIndex num_numa_nodes() const { return nodes.size(); } + + CpuIndex num_cpus_in_numa_node(NumaIndex n) const { + assert(n < nodes.size()); + return nodes[n].size(); + } + + CpuIndex num_cpus() const { return nodeByCpu.size(); } + + bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; } + + std::string to_string() const { + std::string str; + + bool isFirstNode = true; + for (auto&& cpus : nodes) + { + if (!isFirstNode) + str += ":"; + + bool isFirstSet = true; + auto rangeStart = cpus.begin(); + for (auto it = cpus.begin(); it != cpus.end(); ++it) + { + auto next = std::next(it); + if (next == cpus.end() || *next != *it + 1) + { + // cpus[i] is at the end of the range (may be of size 1) + if (!isFirstSet) + str += ","; + + const CpuIndex last = *it; + + if (it != rangeStart) + { + const CpuIndex first = *rangeStart; + + str += std::to_string(first); + str += "-"; + str += std::to_string(last); + } + else + str += std::to_string(last); + + rangeStart = next; + isFirstSet = false; + } + } + + 
isFirstNode = false; + } + + return str; + } + + bool suggests_binding_threads(CpuIndex numThreads) const { + // If we can reasonably determine that the threads cannot be contained + // by the OS within the first NUMA node then we advise distributing + // and binding threads. When the threads are not bound we can only use + // NUMA memory replicated objects from the first node, so when the OS + // has to schedule on other nodes we lose performance. We also suggest + // binding if there's enough threads to distribute among nodes with minimal + // disparity. We try to ignore small nodes, in particular the empty ones. + + // If the affinity set by the user does not match the affinity given by + // the OS then binding is necessary to ensure the threads are running on + // correct processors. + if (customAffinity) + return true; + + // We obviously cannot distribute a single thread, so a single thread + // should never be bound. + if (numThreads <= 1) + return false; + + size_t largestNodeSize = 0; + for (auto&& cpus : nodes) + if (cpus.size() > largestNodeSize) + largestNodeSize = cpus.size(); + + auto is_node_small = [largestNodeSize](const std::set& node) { + static constexpr double SmallNodeThreshold = 0.6; + return static_cast(node.size()) / static_cast(largestNodeSize) + <= SmallNodeThreshold; + }; + + size_t numNotSmallNodes = 0; + for (auto&& cpus : nodes) + if (!is_node_small(cpus)) + numNotSmallNodes += 1; + + return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4) + && nodes.size() > 1; + } + + std::vector distribute_threads_among_numa_nodes(CpuIndex numThreads) const { + std::vector ns; + + if (nodes.size() == 1) + { + // Special case for when there's no NUMA nodes. This doesn't buy us + // much, but let's keep the default path simple. + ns.resize(numThreads, NumaIndex{0}); + } + else + { + std::vector occupation(nodes.size(), 0); + for (CpuIndex c = 0; c < numThreads; ++c) + { + NumaIndex bestNode{0}; + float bestNodeFill = std::numeric_limits::max(); + for (NumaIndex n = 0; n < nodes.size(); ++n) + { + float fill = + static_cast(occupation[n] + 1) / static_cast(nodes[n].size()); + // NOTE: Do we want to perhaps fill the first available node + // up to 50% first before considering other nodes? + // Probably not, because it would interfere with running + // multiple instances. We basically shouldn't favor any + // particular node. + if (fill < bestNodeFill) + { + bestNode = n; + bestNodeFill = fill; + } + } + ns.emplace_back(bestNode); + occupation[bestNode] += 1; + } + } + + return ns; + } + + NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const { + if (n >= nodes.size() || nodes[n].size() == 0) + std::exit(EXIT_FAILURE); + +#if defined(__linux__) && !defined(__ANDROID__) + + cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1); + if (mask == nullptr) + std::exit(EXIT_FAILURE); + + const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1); + + CPU_ZERO_S(masksize, mask); + + for (CpuIndex c : nodes[n]) + CPU_SET_S(c, masksize, mask); + + const int status = sched_setaffinity(0, masksize, mask); + + CPU_FREE(mask); + + if (status != 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + sched_yield(); + +#elif defined(_WIN64) + + // Requires Windows 11. No good way to set thread affinity spanning + // processor groups before that. 
+ HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); + auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( + (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); + + // We ALWAYS set affinity with the new API if available, because + // there's no downsides, and we forcibly keep it consistent with + // the old API should we need to use it. I.e. we always keep this + // as a superset of what we set with SetThreadGroupAffinity. + if (SetThreadSelectedCpuSetMasks_f != nullptr) + { + // Only available on Windows 11 and Windows Server 2022 onwards + const USHORT numProcGroups = USHORT( + ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE); + auto groupAffinities = std::make_unique(numProcGroups); + std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups); + for (WORD i = 0; i < numProcGroups; ++i) + groupAffinities[i].Group = i; + + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = + SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. + // This is defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + + // Sometimes we need to force the old API, but do not use it unless necessary. + if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API) + { + // On earlier windows version (since windows 7) we cannot run a single thread + // on multiple processor groups, so we need to restrict the group. + // We assume the group of the first processor listed for this node. + // Processors from outside this group will not be assigned for this thread. + // Normally this won't be an issue because windows used to assign NUMA nodes + // such that they cannot span processor groups. However, since Windows 10 + // Build 20348 the behaviour changed, so there's a small window of versions + // between this and Windows 11 that might exhibit problems with not all + // processors being utilized. + // + // We handle this in NumaConfig::from_system by manually splitting the + // nodes when we detect that there is no function to set affinity spanning + // processor nodes. This is required because otherwise our thread distribution + // code may produce suboptimal results. + // + // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support + GROUP_AFFINITY affinity; + std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); + // We use an ordered set to be sure to get the smallest cpu number here. + const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; + affinity.Group = static_cast(forcedProcGroupIndex); + for (CpuIndex c : nodes[n]) + { + const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE; + const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE; + // We skip processors that are not in the same processor group. + // If everything was set up correctly this will never be an issue, + // but we have to account for bad NUMA node specification. 
+ if (procGroupIndex != forcedProcGroupIndex) + continue; + + affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup; + } + + HANDLE hThread = GetCurrentThread(); + + const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr); + if (status == 0) + std::exit(EXIT_FAILURE); + + // We yield this thread just to be sure it gets rescheduled. This is + // defensive, allowed because this code is not performance critical. + SwitchToThread(); + } + +#endif + + return NumaReplicatedAccessToken(n); + } + + template + void execute_on_numa_node(NumaIndex n, FuncT&& f) const { + std::thread th([this, &f, n]() { + bind_current_thread_to_numa_node(n); + std::forward(f)(); + }); + + th.join(); + } + + private: + std::vector> nodes; + std::map nodeByCpu; + CpuIndex highestCpuIndex; + + bool customAffinity; + + static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); } + + struct EmptyNodeTag {}; + + NumaConfig(EmptyNodeTag) : + highestCpuIndex(0), + customAffinity(false) {} + + void remove_empty_numa_nodes() { + std::vector> newNodes; + for (auto&& cpus : nodes) + if (!cpus.empty()) + newNodes.emplace_back(std::move(cpus)); + nodes = std::move(newNodes); + } + + // Returns true if successful + // Returns false if failed, i.e. when the cpu is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_to_node(NumaIndex n, CpuIndex c) { + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + nodes[n].insert(c); + nodeByCpu[c] = n; + + if (c > highestCpuIndex) + highestCpuIndex = c; + + return true; + } + + // Returns true if successful + // Returns false if failed, i.e. when any of the cpus is already present + // strong guarantee, the structure remains unmodified + bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) { + for (CpuIndex c = cfirst; c <= clast; ++c) + if (is_cpu_assigned(c)) + return false; + + while (nodes.size() <= n) + nodes.emplace_back(); + + for (CpuIndex c = cfirst; c <= clast; ++c) + { + nodes[n].insert(c); + nodeByCpu[c] = n; + } + + if (clast > highestCpuIndex) + highestCpuIndex = clast; + + return true; + } + + static std::vector indices_from_shortened_string(const std::string& s) { + std::vector indices; + + if (s.empty()) + return indices; + + for (const auto& ss : split(s, ",")) + { + if (ss.empty()) + continue; + + auto parts = split(ss, "-"); + if (parts.size() == 1) + { + const CpuIndex c = CpuIndex{str_to_size_t(std::string(parts[0]))}; + indices.emplace_back(c); + } + else if (parts.size() == 2) + { + const CpuIndex cfirst = CpuIndex{str_to_size_t(std::string(parts[0]))}; + const CpuIndex clast = CpuIndex{str_to_size_t(std::string(parts[1]))}; + for (size_t c = cfirst; c <= clast; ++c) + { + indices.emplace_back(c); + } + } + } + + return indices; + } +}; + +class NumaReplicationContext; + +// Instances of this class are tracked by the NumaReplicationContext instance. +// NumaReplicationContext informs all tracked instances when NUMA configuration changes. 
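Before moving on to the replication classes described by the comment above, here is a short usage sketch for the string syntax and the thread-distribution helper documented earlier in this header. The node layout is made up; only members declared above are used, and the values in the comments follow from the greedy fill-rate loop in distribute_threads_among_numa_nodes():

    #include <iostream>
    #include "numa.h"

    int main() {
        using namespace Stockfish;

        // Two custom nodes: cpus 0-7 on node 0, cpus 8-15 on node 1.
        NumaConfig cfg = NumaConfig::from_string("0-7:8-15");

        std::cout << cfg.num_numa_nodes() << " nodes, "        // prints "2 nodes, 16 cpus"
                  << cfg.num_cpus() << " cpus\n";
        std::cout << cfg.to_string() << "\n";                  // prints "0-7:8-15"

        // The greedy loop always picks the node with the lowest fill rate,
        // so 6 threads alternate between the two equally sized nodes.
        for (NumaIndex n : cfg.distribute_threads_among_numa_nodes(6))
            std::cout << n << ' ';                             // prints "0 1 0 1 0 1"
        std::cout << '\n';
        return 0;
    }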
+class NumaReplicatedBase {
+   public:
+    NumaReplicatedBase(NumaReplicationContext& ctx);
+
+    NumaReplicatedBase(const NumaReplicatedBase&) = delete;
+    NumaReplicatedBase(NumaReplicatedBase&& other) noexcept;
+
+    NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete;
+    NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept;
+
+    virtual void on_numa_config_changed() = 0;
+    virtual ~NumaReplicatedBase();
+
+    const NumaConfig& get_numa_config() const;
+
+   private:
+    NumaReplicationContext* context;
+};
+
+// We force boxing with a unique_ptr. If this becomes an issue due to added
+// indirection we may need to add an option for a custom boxing type. When the
+// NUMA config changes the value stored at the index 0 is replicated to other nodes.
+template<typename T>
+class NumaReplicated: public NumaReplicatedBase {
+   public:
+    using ReplicatorFuncType = std::function<T(const T&)>;
+
+    NumaReplicated(NumaReplicationContext& ctx) :
+        NumaReplicatedBase(ctx) {
+        replicate_from(T{});
+    }
+
+    NumaReplicated(NumaReplicationContext& ctx, T&& source) :
+        NumaReplicatedBase(ctx) {
+        replicate_from(std::move(source));
+    }
+
+    NumaReplicated(const NumaReplicated&) = delete;
+    NumaReplicated(NumaReplicated&& other) noexcept :
+        NumaReplicatedBase(std::move(other)),
+        instances(std::exchange(other.instances, {})) {}
+
+    NumaReplicated& operator=(const NumaReplicated&) = delete;
+    NumaReplicated& operator=(NumaReplicated&& other) noexcept {
+        NumaReplicatedBase::operator=(std::move(other));
+        instances = std::exchange(other.instances, {});
+
+        return *this;
+    }
+
+    NumaReplicated& operator=(T&& source) {
+        replicate_from(std::move(source));
+
+        return *this;
+    }
+
+    ~NumaReplicated() override = default;
+
+    const T& operator[](NumaReplicatedAccessToken token) const {
+        assert(token.get_numa_index() < instances.size());
+        return *(instances[token.get_numa_index()]);
+    }
+
+    const T& operator*() const { return *(instances[0]); }
+
+    const T* operator->() const { return instances[0].get(); }
+
+    template<typename FuncT>
+    void modify_and_replicate(FuncT&& f) {
+        auto source = std::move(instances[0]);
+        std::forward<FuncT>(f)(*source);
+        replicate_from(std::move(*source));
+    }
+
+    void on_numa_config_changed() override {
+        // Use the first one as the source. It doesn't matter which one we use,
+        // because they all must be identical, but the first one is guaranteed to exist.
+        auto source = std::move(instances[0]);
+        replicate_from(std::move(*source));
+    }
+
+   private:
+    std::vector<std::unique_ptr<T>> instances;
+
+    void replicate_from(T&& source) {
+        instances.clear();
+
+        const NumaConfig& cfg = get_numa_config();
+        if (cfg.requires_memory_replication())
+        {
+            for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n)
+            {
+                cfg.execute_on_numa_node(
+                  n, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
+            }
+        }
+        else
+        {
+            assert(cfg.num_numa_nodes() == 1);
+            // We take advantage of the fact that replication is not required
+            // and reuse the source value, avoiding one copy operation.
+            instances.emplace_back(std::make_unique<T>(std::move(source)));
+        }
+    }
+};
+
+// We force boxing with a unique_ptr. If this becomes an issue due to added
+// indirection we may need to add an option for a custom boxing type.
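(Illustrative aside, not part of the patch: a minimal usage sketch of the eager NumaReplicated wrapper defined above; the lazily replicating variant follows below. The function name and the vector payload are hypothetical, and NumaConfig::from_system() is assumed to be the factory mentioned in the comments earlier in this header.)

// Hypothetical sketch only; assumes numa.h is included and that the process
// topology can be detected via NumaConfig::from_system().
void replication_sketch() {
    using namespace Stockfish;

    NumaReplicationContext ctx(NumaConfig::from_system());

    // One logical value, boxed and copied once per NUMA node.
    NumaReplicated<std::vector<int>> table(ctx, std::vector<int>(1024, 0));

    // A worker thread binds itself to a node and receives an access token...
    NumaReplicatedAccessToken token = ctx.get_numa_config().bind_current_thread_to_numa_node(0);

    // ...then reads the replica that lives on its own node.
    const std::vector<int>& local = table[token];
    (void) local;

    // Writes go through modify_and_replicate(), which mutates the node-0 copy
    // and then re-replicates it to every node.
    table.modify_and_replicate([](std::vector<int>& v) { v[0] = 42; });
}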
+template<typename T>
+class LazyNumaReplicated: public NumaReplicatedBase {
+   public:
+    using ReplicatorFuncType = std::function<T(const T&)>;
+
+    LazyNumaReplicated(NumaReplicationContext& ctx) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(T{});
+    }
+
+    LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(std::move(source));
+    }
+
+    LazyNumaReplicated(const LazyNumaReplicated&) = delete;
+    LazyNumaReplicated(LazyNumaReplicated&& other) noexcept :
+        NumaReplicatedBase(std::move(other)),
+        instances(std::exchange(other.instances, {})) {}
+
+    LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete;
+    LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept {
+        NumaReplicatedBase::operator=(std::move(other));
+        instances = std::exchange(other.instances, {});
+
+        return *this;
+    }
+
+    LazyNumaReplicated& operator=(T&& source) {
+        prepare_replicate_from(std::move(source));
+
+        return *this;
+    }
+
+    ~LazyNumaReplicated() override = default;
+
+    const T& operator[](NumaReplicatedAccessToken token) const {
+        assert(token.get_numa_index() < instances.size());
+        ensure_present(token.get_numa_index());
+        return *(instances[token.get_numa_index()]);
+    }
+
+    const T& operator*() const { return *(instances[0]); }
+
+    const T* operator->() const { return instances[0].get(); }
+
+    template<typename FuncT>
+    void modify_and_replicate(FuncT&& f) {
+        auto source = std::move(instances[0]);
+        std::forward<FuncT>(f)(*source);
+        prepare_replicate_from(std::move(*source));
+    }
+
+    void on_numa_config_changed() override {
+        // Use the first one as the source. It doesn't matter which one we use,
+        // because they all must be identical, but the first one is guaranteed to exist.
+        auto source = std::move(instances[0]);
+        prepare_replicate_from(std::move(*source));
+    }
+
+   private:
+    mutable std::vector<std::unique_ptr<T>> instances;
+    mutable std::mutex                      mutex;
+
+    void ensure_present(NumaIndex idx) const {
+        assert(idx < instances.size());
+
+        if (instances[idx] != nullptr)
+            return;
+
+        assert(idx != 0);
+
+        std::unique_lock<std::mutex> lock(mutex);
+        // Check again for races.
+        if (instances[idx] != nullptr)
+            return;
+
+        const NumaConfig& cfg = get_numa_config();
+        cfg.execute_on_numa_node(
+          idx, [this, idx]() { instances[idx] = std::make_unique<T>(*instances[0]); });
+    }
+
+    void prepare_replicate_from(T&& source) {
+        instances.clear();
+
+        const NumaConfig& cfg = get_numa_config();
+        if (cfg.requires_memory_replication())
+        {
+            assert(cfg.num_numa_nodes() > 0);
+
+            // We just need to make sure the first instance is there.
+            // Note that we cannot move here as we need to reallocate the data
+            // on the correct NUMA node.
+            cfg.execute_on_numa_node(
+              0, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
+
+            // Prepare others for lazy init.
+            instances.resize(cfg.num_numa_nodes());
+        }
+        else
+        {
+            assert(cfg.num_numa_nodes() == 1);
+            // We take advantage of the fact that replication is not required
+            // and reuse the source value, avoiding one copy operation.
+            instances.emplace_back(std::make_unique<T>(std::move(source)));
+        }
+    }
+};
+
+class NumaReplicationContext {
+   public:
+    NumaReplicationContext(NumaConfig&& cfg) :
+        config(std::move(cfg)) {}
+
+    NumaReplicationContext(const NumaReplicationContext&) = delete;
+    NumaReplicationContext(NumaReplicationContext&&)      = delete;
+
+    NumaReplicationContext& operator=(const NumaReplicationContext&) = delete;
+    NumaReplicationContext& operator=(NumaReplicationContext&&)      = delete;
+
+    ~NumaReplicationContext() {
+        // The context must outlive replicated objects
+        if (!trackedReplicatedObjects.empty())
+            std::exit(EXIT_FAILURE);
+    }
+
+    void attach(NumaReplicatedBase* obj) {
+        assert(trackedReplicatedObjects.count(obj) == 0);
+        trackedReplicatedObjects.insert(obj);
+    }
+
+    void detach(NumaReplicatedBase* obj) {
+        assert(trackedReplicatedObjects.count(obj) == 1);
+        trackedReplicatedObjects.erase(obj);
+    }
+
+    // oldObj may be invalid at this point
+    void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) {
+        assert(trackedReplicatedObjects.count(oldObj) == 1);
+        assert(trackedReplicatedObjects.count(newObj) == 0);
+        trackedReplicatedObjects.erase(oldObj);
+        trackedReplicatedObjects.insert(newObj);
+    }
+
+    void set_numa_config(NumaConfig&& cfg) {
+        config = std::move(cfg);
+        for (auto&& obj : trackedReplicatedObjects)
+            obj->on_numa_config_changed();
+    }
+
+    const NumaConfig& get_numa_config() const { return config; }
+
+   private:
+    NumaConfig config;
+
+    // std::set uses std::less by default, which is required for pointer comparison
+    std::set<NumaReplicatedBase*> trackedReplicatedObjects;
+};
+
+inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) :
+    context(&ctx) {
+    context->attach(this);
+}
+
+inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept :
+    context(std::exchange(other.context, nullptr)) {
+    context->move_attached(&other, this);
+}
+
+inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept {
+    context = std::exchange(other.context, nullptr);
+
+    context->move_attached(&other, this);
+
+    return *this;
+}
+
+inline NumaReplicatedBase::~NumaReplicatedBase() {
+    if (context != nullptr)
+        context->detach(this);
+}
+
+inline const NumaConfig& NumaReplicatedBase::get_numa_config() const {
+    return context->get_numa_config();
+}
+
+}  // namespace Stockfish
+
+
+#endif  // #ifndef NUMA_H_INCLUDED
diff --git a/src/position.cpp b/src/position.cpp
index b46ba029..1b1c0269 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -334,8 +334,10 @@ void Position::set_check_info() const {
 // The function is only used when a new position is set up
 void Position::set_state() const {
 
-    st->key = st->materialKey = 0;
-    st->pawnKey               = Zobrist::noPawns;
+    st->key = st->materialKey = 0;
+    st->majorPieceKey = st->minorPieceKey = 0;
+    st->nonPawnKey[WHITE] = st->nonPawnKey[BLACK] = 0;
+    st->pawnKey                                   = Zobrist::noPawns;
     st->nonPawnMaterial[WHITE] = st->nonPawnMaterial[BLACK] = VALUE_ZERO;
     st->checkersBB = attackers_to(square<KING>(sideToMove)) & pieces(~sideToMove);
 
@@ -350,8 +352,27 @@ void Position::set_state() const {
 
         if (type_of(pc) == PAWN)
             st->pawnKey ^= Zobrist::psq[pc][s];
 
-        else if (type_of(pc) != KING)
-            st->nonPawnMaterial[color_of(pc)] += PieceValue[pc];
+        else
+        {
+            st->nonPawnKey[color_of(pc)] ^= Zobrist::psq[pc][s];
+
+            if (type_of(pc) != KING)
+            {
+                st->nonPawnMaterial[color_of(pc)] += PieceValue[pc];
+
+                if (type_of(pc) >= ROOK)
+                    st->majorPieceKey ^= Zobrist::psq[pc][s];
+
+                else
+                    st->minorPieceKey ^=
Zobrist::psq[pc][s]; + } + + else + { + st->majorPieceKey ^= Zobrist::psq[pc][s]; + st->minorPieceKey ^= Zobrist::psq[pc][s]; + } + } } if (st->epSquare != SQ_NONE) @@ -671,6 +692,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { // our state pointer to point to the new (ready to be updated) state. std::memcpy(&newSt, st, offsetof(StateInfo, key)); newSt.previous = st; + st->next = &newSt; st = &newSt; // Increment ply counters. In particular, rule50 will be reset to zero later on @@ -706,6 +728,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { do_castling(us, from, to, rfrom, rto); k ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto]; + st->majorPieceKey ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto]; + st->nonPawnKey[us] ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto]; captured = NO_PIECE; } @@ -731,7 +755,16 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { st->pawnKey ^= Zobrist::psq[captured][capsq]; } else + { st->nonPawnMaterial[them] -= PieceValue[captured]; + st->nonPawnKey[them] ^= Zobrist::psq[captured][capsq]; + + if (type_of(captured) >= ROOK) + st->majorPieceKey ^= Zobrist::psq[captured][capsq]; + + else + st->minorPieceKey ^= Zobrist::psq[captured][capsq]; + } dp.dirty_num = 2; // 1 piece moved, 1 piece captured dp.piece[1] = captured; @@ -789,7 +822,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { else if (m.type_of() == PROMOTION) { - Piece promotion = make_piece(us, m.promotion_type()); + Piece promotion = make_piece(us, m.promotion_type()); + PieceType promotionType = type_of(promotion); assert(relative_rank(us, to) == RANK_8); assert(type_of(promotion) >= KNIGHT && type_of(promotion) <= QUEEN); @@ -810,6 +844,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { st->materialKey ^= Zobrist::psq[promotion][pieceCount[promotion] - 1] ^ Zobrist::psq[pc][pieceCount[pc]]; + if (promotionType >= ROOK) + st->majorPieceKey ^= Zobrist::psq[promotion][to]; + + else + st->minorPieceKey ^= Zobrist::psq[promotion][to]; + // Update material st->nonPawnMaterial[us] += PieceValue[promotion]; } @@ -821,6 +861,23 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { st->rule50 = 0; } + else + { + st->nonPawnKey[us] ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to]; + + if (type_of(pc) == KING) + { + st->majorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to]; + st->minorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to]; + } + + else if (type_of(pc) >= ROOK) + st->majorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to]; + + else + st->minorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to]; + } + // Set capture piece st->capturedPiece = captured; @@ -963,6 +1020,7 @@ void Position::do_null_move(StateInfo& newSt, TranspositionTable& tt) { std::memcpy(&newSt, st, offsetof(StateInfo, accumulatorBig)); newSt.previous = st; + st->next = &newSt; st = &newSt; st->dirtyPiece.dirty_num = 0; @@ -1156,9 +1214,9 @@ bool Position::has_repeated() const { } -// Tests if the position has a move which draws by repetition, -// or an earlier position has a move that directly reaches the current position. -bool Position::has_game_cycle(int ply) const { +// Tests if the position has a move which draws by repetition. +// This function accurately matches the outcome of is_draw() over all legal moves. 
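(Illustrative aside, not part of the patch: the implementation below hinges on the pre-existing cuckoo move-hash tables in position.cpp. The key it probes is originalKey ^ stp->key, the Zobrist difference between the current position and one reached earlier in the game, which equals the hash of a single reversible move whenever such a move exists; the table lookup itself amounts to the following sketch, where H1, H2 and cuckoo are the existing hash functions and array.)

// Sketch of the cuckoo lookup used by upcoming_repetition() below.
inline bool move_hash_found(Key moveKey) {
    int j;
    return (j = H1(moveKey), cuckoo[j] == moveKey)
        || (j = H2(moveKey), cuckoo[j] == moveKey);
}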
+bool Position::upcoming_repetition(int ply) const { int j; @@ -1169,10 +1227,16 @@ bool Position::has_game_cycle(int ply) const { Key originalKey = st->key; StateInfo* stp = st->previous; + Key other = originalKey ^ stp->key ^ Zobrist::side; for (int i = 3; i <= end; i += 2) { - stp = stp->previous->previous; + stp = stp->previous; + other ^= stp->key ^ stp->previous->key ^ Zobrist::side; + stp = stp->previous; + + if (other != 0) + continue; Key moveKey = originalKey ^ stp->key; if ((j = H1(moveKey), cuckoo[j] == moveKey) || (j = H2(moveKey), cuckoo[j] == moveKey)) @@ -1188,12 +1252,6 @@ bool Position::has_game_cycle(int ply) const { // For nodes before or at the root, check that the move is a // repetition rather than a move to the current position. - // In the cuckoo table, both moves Rc1c5 and Rc5c1 are stored in - // the same location, so we have to select which square to check. - if (color_of(piece_on(empty(s1) ? s2 : s1)) != side_to_move()) - continue; - - // For repetitions before or at the root, require one more if (stp->repetition) return true; } diff --git a/src/position.h b/src/position.h index 154ed652..888612da 100644 --- a/src/position.h +++ b/src/position.h @@ -43,6 +43,9 @@ struct StateInfo { // Copied when making a move Key materialKey; Key pawnKey; + Key majorPieceKey; + Key minorPieceKey; + Key nonPawnKey[COLOR_NB]; Value nonPawnMaterial[COLOR_NB]; int castlingRights; int rule50; @@ -53,6 +56,7 @@ struct StateInfo { Key key; Bitboard checkersBB; StateInfo* previous; + StateInfo* next; Bitboard blockersForKing[COLOR_NB]; Bitboard pinners[COLOR_NB]; Bitboard checkSquares[PIECE_TYPE_NB]; @@ -150,13 +154,16 @@ class Position { Key key_after(Move m) const; Key material_key() const; Key pawn_key() const; + Key major_piece_key() const; + Key minor_piece_key() const; + Key non_pawn_key(Color c) const; // Other properties of the position Color side_to_move() const; int game_ply() const; bool is_chess960() const; bool is_draw(int ply) const; - bool has_game_cycle(int ply) const; + bool upcoming_repetition(int ply) const; bool has_repeated() const; int rule50_count() const; Value non_pawn_material(Color c) const; @@ -297,6 +304,12 @@ inline Key Position::pawn_key() const { return st->pawnKey; } inline Key Position::material_key() const { return st->materialKey; } +inline Key Position::major_piece_key() const { return st->majorPieceKey; } + +inline Key Position::minor_piece_key() const { return st->minorPieceKey; } + +inline Key Position::non_pawn_key(Color c) const { return st->nonPawnKey[c]; } + inline Value Position::non_pawn_material(Color c) const { return st->nonPawnMaterial[c]; } inline Value Position::non_pawn_material() const { @@ -315,8 +328,8 @@ inline bool Position::capture(Move m) const { } // Returns true if a move is generated from the capture stage, having also -// queen promotions covered, i.e. consistency with the capture stage move generation -// is needed to avoid the generation of duplicate moves. +// queen promotions covered, i.e. consistency with the capture stage move +// generation is needed to avoid the generation of duplicate moves. 
inline bool Position::capture_stage(Move m) const { assert(m.is_ok()); return capture(m) || m.promotion_type() == QUEEN; diff --git a/src/score.cpp b/src/score.cpp index 292f5340..179796d2 100644 --- a/src/score.cpp +++ b/src/score.cpp @@ -29,7 +29,7 @@ namespace Stockfish { Score::Score(Value v, const Position& pos) { assert(-VALUE_INFINITE < v && v < VALUE_INFINITE); - if (std::abs(v) < VALUE_TB_WIN_IN_MAX_PLY) + if (!is_decisive(v)) { score = InternalUnits{UCIEngine::to_cp(v, pos)}; } diff --git a/src/search.cpp b/src/search.cpp index 6830e4b1..e352c96e 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -22,14 +22,19 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include #include #include "evaluate.h" +#include "history.h" #include "misc.h" #include "movegen.h" #include "movepick.h" @@ -49,105 +54,98 @@ namespace Stockfish { namespace TB = Tablebases; +void syzygy_extend_pv(const OptionsMap& options, + const Search::LimitsType& limits, + Stockfish::Position& pos, + Stockfish::Search::RootMove& rootMove, + Value& v); + using Eval::evaluate; using namespace Search; namespace { -static constexpr double EvalLevel[10] = {0.981, 0.956, 0.895, 0.949, 0.913, - 0.942, 0.933, 0.890, 0.984, 0.941}; - // Futility margin Value futility_margin(Depth d, bool noTtCutNode, bool improving, bool oppWorsening) { - Value futilityMult = 126 - 46 * noTtCutNode; - Value improvingDeduction = 58 * improving * futilityMult / 32; - Value worseningDeduction = (323 + 52 * improving) * oppWorsening * futilityMult / 1024; + Value futilityMult = 109 - 27 * noTtCutNode; + Value improvingDeduction = improving * futilityMult * 2; + Value worseningDeduction = oppWorsening * futilityMult / 3; return futilityMult * d - improvingDeduction - worseningDeduction; } constexpr int futility_move_count(bool improving, Depth depth) { - return improving ? (3 + depth * depth) : (3 + depth * depth) / 2; + return (3 + depth * depth) / (2 - improving); } -// Add correctionHistory value to raw staticEval and guarantee evaluation does not hit the tablebase range -Value to_corrected_static_eval(Value v, const Worker& w, const Position& pos) { - auto cv = w.correctionHistory[pos.side_to_move()][pawn_structure_index(pos)]; - v += cv * std::abs(cv) / 7350; +// Add correctionHistory value to raw staticEval and guarantee evaluation +// does not hit the tablebase range. 
+Value to_corrected_static_eval(Value v, const Worker& w, const Position& pos, Stack* ss) {
+    const Color us    = pos.side_to_move();
+    const auto  m     = (ss - 1)->currentMove;
+    const auto  pcv   = w.pawnCorrectionHistory[us][pawn_structure_index(pos)];
+    const auto  macv  = w.majorPieceCorrectionHistory[us][major_piece_index(pos)];
+    const auto  micv  = w.minorPieceCorrectionHistory[us][minor_piece_index(pos)];
+    const auto  wnpcv = w.nonPawnCorrectionHistory[WHITE][us][non_pawn_index<WHITE>(pos)];
+    const auto  bnpcv = w.nonPawnCorrectionHistory[BLACK][us][non_pawn_index<BLACK>(pos)];
+    int         cntcv = 1;
+
+    if (m.is_ok())
+        cntcv = int((*(ss - 2)->continuationCorrectionHistory)[pos.piece_on(m.to_sq())][m.to_sq()]);
+
+    const auto cv =
+      (6384 * pcv + 3583 * macv + 6492 * micv + 6725 * (wnpcv + bnpcv) + cntcv * 5880) / 131072;
+    v += cv;
     return std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
 }
 
 // History and stats update bonus, based on depth
-int stat_bonus(Depth d) { return std::clamp(208 * d - 297, 16, 1406); }
+int stat_bonus(Depth d) { return std::min(168 * d - 100, 1718); }
 
 // History and stats update malus, based on depth
-int stat_malus(Depth d) { return (d < 4 ? 520 * d - 312 : 1479); }
+int stat_malus(Depth d) { return std::min(768 * d - 257, 2351); }
 
 // Add a small random component to draw evaluations to avoid 3-fold blindness
 Value value_draw(size_t nodes) { return VALUE_DRAW - 1 + Value(nodes & 0x2); }
-
-// Skill structure is used to implement strength limit. If we have a UCI_Elo,
-// we convert it to an appropriate skill level, anchored to the Stash engine.
-// This method is based on a fit of the Elo results for games played between
-// Stockfish at various skill levels and various versions of the Stash engine.
-// Skill 0 .. 19 now covers CCRL Blitz Elo from 1320 to 3190, approximately
-// Reference: https://github.com/vondele/Stockfish/commit/a08b8d4e9711c2
-struct Skill {
-    Skill(int skill_level, int uci_elo) {
-        if (uci_elo)
-        {
-            double e = double(uci_elo - 1320) / (3190 - 1320);
-            level = std::clamp((((37.2473 * e - 40.8525) * e + 22.2943) * e - 0.311438), 0.0, 19.0);
-        }
-        else
-            level = double(skill_level);
-    }
-    bool enabled() const { return level < 20.0; }
-    bool time_to_pick(Depth depth) const { return depth == 1 + int(level); }
-    Move pick_best(const RootMoves&, size_t multiPV);
-
-    double level;
-    Move   best = Move::none();
-};
-
 Value value_to_tt(Value v, int ply);
 Value value_from_tt(Value v, int ply, int r50c);
 void  update_pv(Move* pv, Move move, const Move* childPv);
 void  update_continuation_histories(Stack* ss, Piece pc, Square to, int bonus);
-void  update_refutations(const Position& pos, Stack* ss, Search::Worker& workerThread, Move move);
 void  update_quiet_histories(
   const Position& pos, Stack* ss, Search::Worker& workerThread, Move move, int bonus);
-void update_quiet_stats(
-  const Position& pos, Stack* ss, Search::Worker& workerThread, Move move, int bonus);
-void update_all_stats(const Position& pos,
-                      Stack*          ss,
-                      Search::Worker& workerThread,
-                      Move            bestMove,
-                      Value           bestValue,
-                      Value           beta,
-                      Square          prevSq,
-                      Move*           quietsSearched,
-                      int             quietCount,
-                      Move*           capturesSearched,
-                      int             captureCount,
-                      Depth           depth);
+void update_all_stats(const Position&      pos,
+                      Stack*               ss,
+                      Search::Worker&      workerThread,
+                      Move                 bestMove,
+                      Square               prevSq,
+                      ValueList<Move, 32>& quietsSearched,
+                      ValueList<Move, 32>& capturesSearched,
+                      Depth                depth);
 
 }  // namespace
 
 Search::Worker::Worker(SharedState& sharedState, std::unique_ptr<ISearchManager> sm,
-                       size_t thread_id) :
+                       size_t threadId,
+
NumaReplicatedAccessToken token) : // Unpack the SharedState struct into member variables - thread_idx(thread_id), + threadIdx(threadId), + numaAccessToken(token), manager(std::move(sm)), options(sharedState.options), threads(sharedState.threads), tt(sharedState.tt), networks(sharedState.networks), - refreshTable(networks) { + refreshTable(networks[token]) { clear(); } +void Search::Worker::ensure_network_replicated() { + // Access once to force lazy initialization. + // We do this because we want to avoid initialization during search. + (void) (networks[numaAccessToken]); +} + void Search::Worker::start_searching() { // Non-main threads go directly to iterative_deepening() @@ -157,7 +155,8 @@ void Search::Worker::start_searching() { return; } - main_manager()->tm.init(limits, rootPos.side_to_move(), rootPos.game_ply(), options); + main_manager()->tm.init(limits, rootPos.side_to_move(), rootPos.game_ply(), options, + main_manager()->originalTimeAdjust); tt.new_search(); if (rootMoves.empty()) @@ -181,7 +180,7 @@ void Search::Worker::start_searching() { {} // Busy wait for a stop or a ponder reset // Stop the threads if not already stopped (also raise the stop if - // "ponderhit" just reset threads.ponder). + // "ponderhit" just reset threads.ponder) threads.stop = true; // Wait until all threads have finished @@ -190,8 +189,8 @@ void Search::Worker::start_searching() { // When playing in 'nodes as time' mode, subtract the searched nodes from // the available ones before exiting. if (limits.npmsec) - main_manager()->tm.advance_nodes_time(limits.inc[rootPos.side_to_move()] - - threads.nodes_searched()); + main_manager()->tm.advance_nodes_time(threads.nodes_searched() + - limits.inc[rootPos.side_to_move()]); Worker* bestThread = this; Skill skill = @@ -239,7 +238,7 @@ void Search::Worker::iterative_deepening() { // Allocate stack with extra size to allow access from (ss - 7) to (ss + 2): // (ss - 7) is needed for update_continuation_histories(ss - 1) which accesses (ss - 6), - // (ss + 2) is needed for initialization of cutOffCnt and killers. + // (ss + 2) is needed for initialization of cutOffCnt. Stack stack[MAX_PLY + 10] = {}; Stack* ss = stack + 7; @@ -247,7 +246,8 @@ void Search::Worker::iterative_deepening() { { (ss - i)->continuationHistory = &this->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel - (ss - i)->staticEval = VALUE_NONE; + (ss - i)->continuationCorrectionHistory = &this->continuationCorrectionHistory[NO_PIECE][0]; + (ss - i)->staticEval = VALUE_NONE; } for (int i = 0; i <= MAX_PLY + 2; ++i) @@ -275,6 +275,8 @@ void Search::Worker::iterative_deepening() { int searchAgainCounter = 0; + lowPlyHistory.fill(0); + // Iterative deepening loop until requested to stop or the target depth is reached while (++rootDepth < MAX_PLY && !threads.stop && !(limits.depth && mainThread && rootDepth > limits.depth)) @@ -295,7 +297,7 @@ void Search::Worker::iterative_deepening() { searchAgainCounter++; // MultiPV loop. 
We perform a full root search for each PV line - for (pvIdx = 0; pvIdx < multiPV && !threads.stop; ++pvIdx) + for (pvIdx = 0; pvIdx < multiPV; ++pvIdx) { if (pvIdx == pvLast) { @@ -309,13 +311,13 @@ void Search::Worker::iterative_deepening() { selDepth = 0; // Reset aspiration window starting size + delta = 5 + std::abs(rootMoves[pvIdx].meanSquaredScore) / 13461; Value avg = rootMoves[pvIdx].averageScore; - delta = 10 + avg * avg / 9530; alpha = std::max(avg - delta, -VALUE_INFINITE); beta = std::min(avg + delta, VALUE_INFINITE); // Adjust optimism based on root move's averageScore (~4 Elo) - optimism[us] = 119 * avg / (std::abs(avg) + 88); + optimism[us] = 150 * avg / (std::abs(avg) + 85); optimism[~us] = -optimism[us]; // Start with a small aspiration window and, in the case of a fail @@ -324,10 +326,11 @@ void Search::Worker::iterative_deepening() { int failedHighCnt = 0; while (true) { - // Adjust the effective depth searched, but ensure at least one effective increment - // for every four searchAgain steps (see issue #2717). + // Adjust the effective depth searched, but ensure at least one + // effective increment for every four searchAgain steps (see issue #2717). Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - 3 * (searchAgainCounter + 1) / 4); + rootDelta = beta - alpha; bestValue = search(rootPos, ss, alpha, beta, adjustedDepth, false); // Bring the best move to the front. It is critical that sorting @@ -344,14 +347,15 @@ void Search::Worker::iterative_deepening() { if (threads.stop) break; - // When failing high/low give some update (without cluttering - // the UI) before a re-search. + // When failing high/low give some update before a re-search. To avoid + // excessive output that could hang GUIs like Fritz 19, only start + // at nodes > 10M (rather than depth N, which can be reached quickly) if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta) - && elapsed() > 3000) + && nodes > 10000000) main_manager()->pv(*this, threads, tt, rootDepth); - // In case of failing low/high increase aspiration window and - // re-search, otherwise exit the loop. + // In case of failing low/high increase aspiration window and re-search, + // otherwise exit the loop. if (bestValue <= alpha) { beta = (alpha + beta) / 2; @@ -378,13 +382,17 @@ void Search::Worker::iterative_deepening() { std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1); if (mainThread - && (threads.stop || pvIdx + 1 == multiPV || elapsed() > 3000) - // A thread that aborted search can have mated-in/TB-loss PV and score - // that cannot be trusted, i.e. it can be delayed or refuted if we would have - // had time to fully search other root-moves. Thus we suppress this output and - // below pick a proven score/PV for this thread (from the previous iteration). - && !(threads.abortedSearch && rootMoves[0].uciScore <= VALUE_TB_LOSS_IN_MAX_PLY)) + && (threads.stop || pvIdx + 1 == multiPV || nodes > 10000000) + // A thread that aborted search can have mated-in/TB-loss PV and + // score that cannot be trusted, i.e. it can be delayed or refuted + // if we would have had time to fully search other root-moves. Thus + // we suppress this output and below pick a proven score/PV for this + // thread (from the previous iteration). 
+ && !(threads.abortedSearch && is_loss(rootMoves[0].uciScore))) main_manager()->pv(*this, threads, tt, rootDepth); + + if (threads.stop) + break; } if (!threads.stop) @@ -393,7 +401,7 @@ void Search::Worker::iterative_deepening() { // We make sure not to pick an unproven mated-in score, // in case this thread prematurely stopped search (aborted-search). if (threads.abortedSearch && rootMoves[0].score != -VALUE_INFINITE - && rootMoves[0].score <= VALUE_TB_LOSS_IN_MAX_PLY) + && is_loss(rootMoves[0].score)) { // Bring the last best move to the front for best thread selection. Utility::move_to_front(rootMoves, [&lastBestPV = std::as_const(lastBestPV)]( @@ -425,7 +433,7 @@ void Search::Worker::iterative_deepening() { skill.pick_best(rootMoves, multiPV); // Use part of the gained time from a previous stable move for the current move - for (Thread* th : threads) + for (auto&& th : threads) { totBestMoveChanges += th->worker->bestMoveChanges; th->worker->bestMoveChanges = 0; @@ -436,20 +444,19 @@ void Search::Worker::iterative_deepening() { { int nodesEffort = rootMoves[0].effort * 100 / std::max(size_t(1), size_t(nodes)); - double fallingEval = (1067 + 223 * (mainThread->bestPreviousAverageScore - bestValue) - + 97 * (mainThread->iterValue[iterIdx] - bestValue)) - / 10000.0; + double fallingEval = (11 + 2 * (mainThread->bestPreviousAverageScore - bestValue) + + (mainThread->iterValue[iterIdx] - bestValue)) + / 100.0; fallingEval = std::clamp(fallingEval, 0.580, 1.667); // If the bestMove is stable over several iterations, reduce time accordingly timeReduction = lastBestMoveDepth + 8 < completedDepth ? 1.495 : 0.687; double reduction = (1.48 + mainThread->previousTimeReduction) / (2.17 * timeReduction); double bestMoveInstability = 1 + 1.88 * totBestMoveChanges / threads.size(); - int el = std::clamp((bestValue + 750) / 150, 0, 9); double recapture = limits.capSq == rootMoves[0].pv[0].to_sq() ? 0.955 : 1.005; - double totalTime = mainThread->tm.optimum() * fallingEval * reduction - * bestMoveInstability * EvalLevel[el] * recapture; + double totalTime = + mainThread->tm.optimum() * fallingEval * reduction * bestMoveInstability * recapture; // Cap used time in case of a single legal move for a better viewer experience if (rootMoves.size() == 1) @@ -491,41 +498,53 @@ void Search::Worker::iterative_deepening() { skill.best ? skill.best : skill.pick_best(rootMoves, multiPV))); } +// Reset histories, usually before a new game void Search::Worker::clear() { - counterMoves.fill(Move::none()); mainHistory.fill(0); - captureHistory.fill(0); - pawnHistory.fill(0); - correctionHistory.fill(0); + lowPlyHistory.fill(0); + captureHistory.fill(-758); + pawnHistory.fill(-1158); + pawnCorrectionHistory.fill(0); + majorPieceCorrectionHistory.fill(0); + minorPieceCorrectionHistory.fill(0); + nonPawnCorrectionHistory[WHITE].fill(0); + nonPawnCorrectionHistory[BLACK].fill(0); + + for (auto& to : continuationCorrectionHistory) + for (auto& h : to) + h->fill(0); for (bool inCheck : {false, true}) for (StatsType c : {NoCaptures, Captures}) for (auto& to : continuationHistory[inCheck][c]) for (auto& h : to) - h->fill(-60); + h->fill(-645); for (size_t i = 1; i < reductions.size(); ++i) - reductions[i] = int((18.93 + std::log(size_t(options["Threads"])) / 2) * std::log(i)); + reductions[i] = int((19.43 + std::log(size_t(options["Threads"])) / 2) * std::log(i)); - refreshTable.clear(networks); + refreshTable.clear(networks[numaAccessToken]); } -// Main search function for both PV and non-PV nodes. 
+// Main search function for both PV and non-PV nodes
 template<NodeType nodeType>
 Value Search::Worker::search(
   Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode) {
 
     constexpr bool PvNode   = nodeType != NonPV;
     constexpr bool rootNode = nodeType == Root;
+    const bool     allNode  = !(PvNode || cutNode);
 
     // Dive into quiescence search when the depth reaches zero
     if (depth <= 0)
         return qsearch < PvNode ? PV : NonPV > (pos, ss, alpha, beta);
 
-    // Check if we have an upcoming move that draws by repetition, or
-    // if the opponent had an alternative move earlier to this position.
-    if (!rootNode && alpha < VALUE_DRAW && pos.has_game_cycle(ss->ply))
+    // Limit the depth if extensions made it too large
+    depth = std::min(depth, MAX_PLY - 1);
+
+    // Check if we have an upcoming move that draws by repetition
+    if (!rootNode && alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply))
     {
         alpha = value_draw(this->nodes);
         if (alpha >= beta)
@@ -537,28 +556,29 @@ Value Search::Worker::search(
     assert(0 < depth && depth < MAX_PLY);
     assert(!(PvNode && cutNode));
 
-    Move      pv[MAX_PLY + 1], capturesSearched[32], quietsSearched[32];
+    Move      pv[MAX_PLY + 1];
     StateInfo st;
     ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize);
 
-    TTEntry* tte;
-    Key      posKey;
-    Move     ttMove, move, excludedMove, bestMove;
-    Depth    extension, newDepth;
-    Value    bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool     givesCheck, improving, priorCapture, opponentWorsening;
-    bool     capture, moveCountPruning, ttCapture;
-    Piece    movedPiece;
-    int      moveCount, captureCount, quietCount;
+    Key   posKey;
+    Move  move, excludedMove, bestMove;
+    Depth extension, newDepth;
+    Value bestValue, value, eval, maxValue, probCutBeta;
+    bool  givesCheck, improving, priorCapture, opponentWorsening;
+    bool  capture, ttCapture;
+    Piece movedPiece;
+
+    ValueList<Move, 32> capturesSearched;
+    ValueList<Move, 32> quietsSearched;
 
     // Step 1. Initialize node
     Worker* thisThread = this;
     ss->inCheck        = pos.checkers();
     priorCapture       = pos.captured_piece();
     Color us           = pos.side_to_move();
-    moveCount = captureCount = quietCount = ss->moveCount = 0;
-    bestValue = -VALUE_INFINITE;
-    maxValue  = VALUE_INFINITE;
+    ss->moveCount      = 0;
+    bestValue          = -VALUE_INFINITE;
+    maxValue           = VALUE_INFINITE;
 
     // Check for the available remaining time
     if (is_mainthread())
@@ -574,7 +594,8 @@ Value Search::Worker::search(
         if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply)
             || ss->ply >= MAX_PLY)
             return (ss->ply >= MAX_PLY && !ss->inCheck)
-                   ? evaluate(networks, pos, refreshTable, thisThread->optimism[us])
+                   ? evaluate(networks[numaAccessToken], pos, refreshTable,
+                              thisThread->optimism[us])
                    : value_draw(thisThread->nodes);
 
         // Step 3. Mate distance pruning. Even if we mate at the next move our score
@@ -588,43 +609,42 @@ Value Search::Worker::search(
         if (alpha >= beta)
             return alpha;
     }
-    else
-        thisThread->rootDelta = beta - alpha;
 
     assert(0 <= ss->ply && ss->ply < MAX_PLY);
 
-    bestMove             = Move::none();
-    (ss + 2)->killers[0] = (ss + 2)->killers[1] = Move::none();
-    (ss + 2)->cutoffCnt  = 0;
+    bestMove            = Move::none();
+    (ss + 2)->cutoffCnt = 0;
     Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE;
     ss->statScore = 0;
 
-    // Step 4. Transposition table lookup.
-    excludedMove = ss->excludedMove;
-    posKey       = pos.key();
-    tte          = tt.probe(posKey, ss->ttHit);
-    ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
-    ttMove  = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
-            : ss->ttHit ?
tte->move() - : Move::none(); - ttCapture = ttMove && pos.capture_stage(ttMove); + // Step 4. Transposition table lookup + excludedMove = ss->excludedMove; + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0] + : ttHit ? ttData.move + : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + ss->ttPv = excludedMove ? ss->ttPv : PvNode || (ttHit && ttData.is_pv); + ttCapture = ttData.move && pos.capture_stage(ttData.move); // At this point, if excluded, skip straight to step 6, static eval. However, // to save indentation, we list the condition in all code between here and there. - if (!excludedMove) - ss->ttPv = PvNode || (ss->ttHit && tte->is_pv()); // At non-PV nodes we check for an early TT cutoff - if (!PvNode && !excludedMove && tte->depth() > depth - && ttValue != VALUE_NONE // Possible in case of TT access race or if !ttHit - && (tte->bound() & (ttValue >= beta ? BOUND_LOWER : BOUND_UPPER))) + if (!PvNode && !excludedMove && ttData.depth > depth - (ttData.value <= beta) + && is_valid(ttData.value) // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER)) + && (cutNode == (ttData.value >= beta) || depth > 8)) { // If ttMove is quiet, update move sorting heuristics on TT hit (~2 Elo) - if (ttMove && ttValue >= beta) + if (ttData.move && ttData.value >= beta) { // Bonus for a quiet ttMove that fails high (~2 Elo) if (!ttCapture) - update_quiet_stats(pos, ss, *this, ttMove, stat_bonus(depth)); + update_quiet_histories(pos, ss, *this, ttData.move, stat_bonus(depth)); // Extra penalty for early quiet moves of // the previous ply (~1 Elo on STC, ~2 Elo on LTC) @@ -636,9 +656,7 @@ Value Search::Worker::search( // Partial workaround for the graph history interaction problem // For high rule50 counts don't produce transposition table cutoffs. if (pos.rule50_count() < 90) - return ttValue >= beta && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY - ? (ttValue * 3 + beta) / 4 - : ttValue; + return ttData.value; } // Step 5. Tablebases probe @@ -665,7 +683,7 @@ Value Search::Worker::search( Value tbValue = VALUE_TB - ss->ply; - // use the range VALUE_TB to VALUE_TB_WIN_IN_MAX_PLY to score + // Use the range VALUE_TB to VALUE_TB_WIN_IN_MAX_PLY to score value = wdl < -drawScore ? -tbValue : wdl > drawScore ? tbValue : VALUE_DRAW + 2 * wdl * drawScore; @@ -676,9 +694,9 @@ Value Search::Worker::search( if (b == BOUND_EXACT || (b == BOUND_LOWER ? value >= beta : value <= alpha)) { - tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b, - std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE, - tt.generation()); + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, b, + std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE, + tt.generation()); return value; } @@ -699,7 +717,7 @@ Value Search::Worker::search( if (ss->inCheck) { // Skip early pruning when in check - ss->staticEval = eval = VALUE_NONE; + ss->staticEval = eval = (ss - 2)->staticEval; improving = false; goto moves_loop; } @@ -707,98 +725,100 @@ Value Search::Worker::search( { // Providing the hint that this node's accumulator will be used often // brings significant Elo gain (~13 Elo). 
- Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); unadjustedStaticEval = eval = ss->staticEval; } else if (ss->ttHit) { // Never assume anything about values stored in TT - unadjustedStaticEval = tte->eval(); - if (unadjustedStaticEval == VALUE_NONE) - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + unadjustedStaticEval = ttData.eval; + if (!is_valid(unadjustedStaticEval)) + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); else if (PvNode) - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); - ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); + ss->staticEval = eval = + to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos, ss); // ttValue can be used as a better position evaluation (~7 Elo) - if (ttValue != VALUE_NONE && (tte->bound() & (ttValue > eval ? BOUND_LOWER : BOUND_UPPER))) - eval = ttValue; + if (is_valid(ttData.value) + && (ttData.bound & (ttData.value > eval ? BOUND_LOWER : BOUND_UPPER))) + eval = ttData.value; } else { - unadjustedStaticEval = evaluate(networks, pos, refreshTable, thisThread->optimism[us]); - ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); + unadjustedStaticEval = + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); + ss->staticEval = eval = + to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos, ss); // Static evaluation is saved as it was before adjustment by correction history - tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, Move::none(), - unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_UNSEARCHED, Move::none(), + unadjustedStaticEval, tt.generation()); } // Use static evaluation difference to improve quiet move ordering (~9 Elo) if (((ss - 1)->currentMove).is_ok() && !(ss - 1)->inCheck && !priorCapture) { - int bonus = std::clamp(-13 * int((ss - 1)->staticEval + ss->staticEval), -1796, 1526); - bonus = bonus > 0 ? 2 * bonus : bonus / 2; + int bonus = std::clamp(-10 * int((ss - 1)->staticEval + ss->staticEval), -1831, 1428) + 623; thisThread->mainHistory[~us][((ss - 1)->currentMove).from_to()] << bonus; if (type_of(pos.piece_on(prevSq)) != PAWN && ((ss - 1)->currentMove).type_of() != PROMOTION) thisThread->pawnHistory[pawn_structure_index(pos)][pos.piece_on(prevSq)][prevSq] - << bonus / 2; + << bonus; } // Set up the improving flag, which is true if current static evaluation is // bigger than the previous static evaluation at our turn (if we were in - // check at our previous move we look at static evaluation at move prior to it - // and if we were in check at move prior to it flag is set to true) and is + // check at our previous move we go back until we weren't in check) and is // false otherwise. The improving flag is used in various pruning heuristics. - improving = (ss - 2)->staticEval != VALUE_NONE - ? ss->staticEval > (ss - 2)->staticEval - : (ss - 4)->staticEval != VALUE_NONE && ss->staticEval > (ss - 4)->staticEval; + improving = ss->staticEval > (ss - 2)->staticEval; opponentWorsening = ss->staticEval + (ss - 1)->staticEval > 2; // Step 7. 
Razoring (~1 Elo) - // If eval is really low check with qsearch if it can exceed alpha, if it can't, - // return a fail low. - // Adjust razor margin according to cutoffCnt. (~1 Elo) - if (eval < alpha - 433 - (302 - 141 * ((ss + 1)->cutoffCnt > 3)) * depth * depth) + // If eval is really low, check with qsearch if we can exceed alpha. If the + // search suggests we cannot exceed alpha, return a speculative fail low. + if (eval < alpha - 469 - 307 * depth * depth) { value = qsearch(pos, ss, alpha - 1, alpha); - if (value < alpha) + if (value < alpha && !is_decisive(value)) return value; } // Step 8. Futility pruning: child node (~40 Elo) // The depth condition is important for mate finding. - if (!ss->ttPv && depth < 11 + if (!ss->ttPv && depth < 14 && eval - futility_margin(depth, cutNode && !ss->ttHit, improving, opponentWorsening) - - (ss - 1)->statScore / 254 + - (ss - 1)->statScore / 290 >= beta - && eval >= beta && eval < VALUE_TB_WIN_IN_MAX_PLY && (!ttMove || ttCapture)) - return beta > VALUE_TB_LOSS_IN_MAX_PLY ? (eval + beta) / 2 : eval; + && eval >= beta && (!ttData.move || ttCapture) && !is_loss(beta) && !is_win(eval)) + return beta + (eval - beta) / 3; + + improving |= ss->staticEval >= beta + 100; // Step 9. Null move search with verification search (~35 Elo) - if (!PvNode && (ss - 1)->currentMove != Move::null() && (ss - 1)->statScore < 16993 - && eval >= beta && ss->staticEval >= beta - 19 * depth + 326 && !excludedMove - && pos.non_pawn_material(us) && ss->ply >= thisThread->nmpMinPly - && beta > VALUE_TB_LOSS_IN_MAX_PLY) + if (cutNode && (ss - 1)->currentMove != Move::null() && eval >= beta + && ss->staticEval >= beta - 21 * depth + 421 && !excludedMove && pos.non_pawn_material(us) + && ss->ply >= thisThread->nmpMinPly && !is_loss(beta)) { assert(eval - beta >= 0); // Null move dynamic reduction based on depth and eval - Depth R = std::min(int(eval - beta) / 134, 6) + depth / 3 + 4; + Depth R = std::min(int(eval - beta) / 235, 7) + depth / 3 + 5; - ss->currentMove = Move::null(); - ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; + ss->currentMove = Move::null(); + ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; + ss->continuationCorrectionHistory = &thisThread->continuationCorrectionHistory[NO_PIECE][0]; pos.do_null_move(st, tt); - Value nullValue = -search(pos, ss + 1, -beta, -beta + 1, depth - R, !cutNode); + Value nullValue = -search(pos, ss + 1, -beta, -beta + 1, depth - R, false); pos.undo_null_move(); // Do not return unproven mate or TB scores - if (nullValue >= beta && nullValue < VALUE_TB_WIN_IN_MAX_PLY) + if (nullValue >= beta && !is_win(nullValue)) { if (thisThread->nmpMinPly || depth < 16) return nullValue; @@ -819,81 +839,94 @@ Value Search::Worker::search( } // Step 10. Internal iterative reductions (~9 Elo) - // For PV nodes without a ttMove, we decrease depth by 3. - if (PvNode && !ttMove) + // For PV nodes without a ttMove, we decrease depth. + if (PvNode && !ttData.move) depth -= 3; - // Use qsearch if depth <= 0. + // Use qsearch if depth <= 0 if (depth <= 0) return qsearch(pos, ss, alpha, beta); - // For cutNodes without a ttMove, we decrease depth by 2 if depth is high enough. - if (cutNode && depth >= 8 && !ttMove) - depth -= 2; + // For cutNodes, if depth is high enough, decrease depth by 2 if there is no ttMove, + // or by 1 if there is a ttMove with an upper bound. + if (cutNode && depth >= 7 && (!ttData.move || ttData.bound == BOUND_UPPER)) + depth -= 1 + !ttData.move; // Step 11. 
ProbCut (~10 Elo) - // If we have a good enough capture (or queen promotion) and a reduced search returns a value - // much above beta, we can (almost) safely prune the previous move. - probCutBeta = beta + 159 - 66 * improving; - if ( - !PvNode && depth > 3 - && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY - // If value from transposition table is lower than probCutBeta, don't attempt probCut - // there and in further interactions with transposition table cutoff depth is set to depth - 3 - // because probCut search has depth set to depth - 4 but we also do a move before it - // So effective depth is equal to depth - 3 - && !(tte->depth() >= depth - 3 && ttValue != VALUE_NONE && ttValue < probCutBeta)) + // If we have a good enough capture (or queen promotion) and a reduced search + // returns a value much above beta, we can (almost) safely prune the previous move. + probCutBeta = beta + 187 - 56 * improving; + if (!PvNode && depth > 3 + && !is_decisive(beta) + // If value from transposition table is lower than probCutBeta, don't attempt + // probCut there and in further interactions with transposition table cutoff + // depth is set to depth - 3 because probCut search has depth set to depth - 4 + // but we also do a move before it. So effective depth is equal to depth - 3. + && !(ttData.depth >= depth - 3 && is_valid(ttData.value) && ttData.value < probCutBeta)) { assert(probCutBeta < VALUE_INFINITE && probCutBeta > beta); - MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &thisThread->captureHistory); + MovePicker mp(pos, ttData.move, probCutBeta - ss->staticEval, &thisThread->captureHistory); + Piece captured; while ((move = mp.next_move()) != Move::none()) - if (move != excludedMove && pos.legal(move)) + { + assert(move.is_ok()); + + if (move == excludedMove) + continue; + + if (!pos.legal(move)) + continue; + + assert(pos.capture_stage(move)); + + movedPiece = pos.moved_piece(move); + captured = pos.piece_on(move.to_sq()); + + + // Prefetch the TT entry for the resulting position + prefetch(tt.first_entry(pos.key_after(move))); + + ss->currentMove = move; + ss->continuationHistory = + &this->continuationHistory[ss->inCheck][true][pos.moved_piece(move)][move.to_sq()]; + ss->continuationCorrectionHistory = + &this->continuationCorrectionHistory[pos.moved_piece(move)][move.to_sq()]; + + thisThread->nodes.fetch_add(1, std::memory_order_relaxed); + pos.do_move(move, st); + + // Perform a preliminary qsearch to verify that the move holds + value = -qsearch(pos, ss + 1, -probCutBeta, -probCutBeta + 1); + + // If the qsearch held, perform the regular search + if (value >= probCutBeta) + value = + -search(pos, ss + 1, -probCutBeta, -probCutBeta + 1, depth - 4, !cutNode); + + pos.undo_move(move); + + if (value >= probCutBeta) { - assert(pos.capture_stage(move)); + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(captured)] << 1300; - // Prefetch the TT entry for the resulting position - prefetch(tt.first_entry(pos.key_after(move))); - - ss->currentMove = move; - ss->continuationHistory = - &this - ->continuationHistory[ss->inCheck][true][pos.moved_piece(move)][move.to_sq()]; - - thisThread->nodes.fetch_add(1, std::memory_order_relaxed); - pos.do_move(move, st); - - // Perform a preliminary qsearch to verify that the move holds - value = -qsearch(pos, ss + 1, -probCutBeta, -probCutBeta + 1); - - // If the qsearch held, perform the regular search - if (value >= probCutBeta) - value = -search(pos, ss + 1, -probCutBeta, -probCutBeta + 1, depth - 4, - !cutNode); - - 
pos.undo_move(move); - - if (value >= probCutBeta) - { - // Save ProbCut data into transposition table - tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, depth - 3, - move, unadjustedStaticEval, tt.generation()); - return std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY ? value - (probCutBeta - beta) - : value; - } + // Save ProbCut data into transposition table + ttWriter.write(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, + depth - 3, move, unadjustedStaticEval, tt.generation()); + return is_decisive(value) ? value : value - (probCutBeta - beta); } + } - Eval::NNUE::hint_common_parent_position(pos, networks, refreshTable); + Eval::NNUE::hint_common_parent_position(pos, networks[numaAccessToken], refreshTable); } moves_loop: // When in check, search starts here - // Step 12. A small Probcut idea, when we are in check (~4 Elo) - probCutBeta = beta + 420; - if (ss->inCheck && !PvNode && ttCapture && (tte->bound() & BOUND_LOWER) - && tte->depth() >= depth - 4 && ttValue >= probCutBeta - && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY) + // Step 12. A small Probcut idea (~4 Elo) + probCutBeta = beta + 417; + if ((ttData.bound & BOUND_LOWER) && ttData.depth >= depth - 4 && ttData.value >= probCutBeta + && !is_decisive(beta) && is_valid(ttData.value) && !is_decisive(ttData.value)) return probCutBeta; const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory, @@ -903,18 +936,17 @@ moves_loop: // When in check, search starts here nullptr, (ss - 6)->continuationHistory}; - Move countermove = - prevSq != SQ_NONE ? thisThread->counterMoves[pos.piece_on(prevSq)][prevSq] : Move::none(); - MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory, &thisThread->captureHistory, - contHist, &thisThread->pawnHistory, countermove, ss->killers); + MovePicker mp(pos, ttData.move, depth, &thisThread->mainHistory, &thisThread->lowPlyHistory, + &thisThread->captureHistory, contHist, &thisThread->pawnHistory, ss->ply); - value = bestValue; - moveCountPruning = false; + value = bestValue; + + int moveCount = 0; // Step 13. Loop through all pseudo-legal moves until no moves remain // or a beta cutoff occurs. - while ((move = mp.next_move(moveCountPruning)) != Move::none()) + while ((move = mp.next_move()) != Move::none()) { assert(move.is_ok()); @@ -935,7 +967,7 @@ moves_loop: // When in check, search starts here ss->moveCount = ++moveCount; - if (rootNode && is_mainthread() && elapsed() > 3000) + if (rootNode && is_mainthread() && nodes > 10000000) { main_manager()->updates.onIter( {depth, UCIEngine::move(move, pos.is_chess960()), moveCount + thisThread->pvIdx}); @@ -957,14 +989,14 @@ moves_loop: // When in check, search starts here // Step 14. Pruning at shallow depth (~120 Elo). // Depth conditions are important for mate finding. 
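(Illustrative aside, not part of the patch: with the reformulated threshold from earlier in this diff, futility_move_count(improving, depth) = (3 + depth * depth) / (2 - improving), a depth-3 node allows 3 + 9 = 12 quiet moves when improving and 12 / 2 = 6 when not, the same pair of values the old two-branch formula produced.)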
- if (!rootNode && pos.non_pawn_material(us) && bestValue > VALUE_TB_LOSS_IN_MAX_PLY) + if (!rootNode && pos.non_pawn_material(us) && !is_loss(bestValue)) { // Skip quiet moves if movecount exceeds our FutilityMoveCount threshold (~8 Elo) - if (!moveCountPruning) - moveCountPruning = moveCount >= futility_move_count(improving, depth); + if (moveCount >= futility_move_count(improving, depth)) + mp.skip_quiet_moves(); // Reduced depth of the next LMR search - int lmrDepth = newDepth - r; + int lmrDepth = newDepth - r / 1024; if (capture || givesCheck) { @@ -975,15 +1007,15 @@ moves_loop: // When in check, search starts here // Futility pruning for captures (~2 Elo) if (!givesCheck && lmrDepth < 7 && !ss->inCheck) { - Value futilityValue = ss->staticEval + 295 + 280 * lmrDepth + Value futilityValue = ss->staticEval + 287 + 253 * lmrDepth + PieceValue[capturedPiece] + captHist / 7; if (futilityValue <= alpha) continue; } // SEE based pruning for captures and checks (~11 Elo) - int seeHist = std::clamp(captHist / 32, -197 * depth, 196 * depth); - if (!pos.see_ge(move, -186 * depth - seeHist)) + int seeHist = std::clamp(captHist / 33, -161 * depth, 156 * depth); + if (!pos.see_ge(move, -162 * depth - seeHist)) continue; } else @@ -991,33 +1023,32 @@ moves_loop: // When in check, search starts here int history = (*contHist[0])[movedPiece][move.to_sq()] + (*contHist[1])[movedPiece][move.to_sq()] - + (*contHist[3])[movedPiece][move.to_sq()] / 2 + thisThread->pawnHistory[pawn_structure_index(pos)][movedPiece][move.to_sq()]; // Continuation history based pruning (~2 Elo) - if (lmrDepth < 6 && history < -4081 * depth) + if (history < -3884 * depth) continue; history += 2 * thisThread->mainHistory[us][move.from_to()]; - lmrDepth += history / 4768; + lmrDepth += history / 3609; Value futilityValue = - ss->staticEval + (bestValue < ss->staticEval - 52 ? 134 : 54) + 142 * lmrDepth; + ss->staticEval + (bestValue < ss->staticEval - 45 ? 140 : 43) + 141 * lmrDepth; // Futility pruning: parent node (~13 Elo) - if (!ss->inCheck && lmrDepth < 13 && futilityValue <= alpha) + if (!ss->inCheck && lmrDepth < 12 && futilityValue <= alpha) { - if (bestValue <= futilityValue && std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY - && futilityValue < VALUE_TB_WIN_IN_MAX_PLY) - bestValue = (bestValue + futilityValue * 3) / 4; + if (bestValue <= futilityValue && !is_decisive(bestValue) + && !is_win(futilityValue)) + bestValue = futilityValue; continue; } lmrDepth = std::max(lmrDepth, 0); // Prune moves with negative SEE (~4 Elo) - if (!pos.see_ge(move, -28 * lmrDepth * lmrDepth)) + if (!pos.see_ge(move, -25 * lmrDepth * lmrDepth)) continue; } } @@ -1026,22 +1057,25 @@ moves_loop: // When in check, search starts here // We take care to not overdo to avoid search getting stuck. if (ss->ply < thisThread->rootDepth * 2) { - // Singular extension search (~94 Elo). If all moves but one fail low on a - // search of (alpha-s, beta-s), and just one fails high on (alpha, beta), - // then that move is singular and should be extended. To verify this we do - // a reduced search on the position excluding the ttMove and if the result - // is lower than ttValue minus a margin, then we will extend the ttMove. + // Singular extension search (~76 Elo, ~170 nElo). If all moves but one + // fail low on a search of (alpha-s, beta-s), and just one fails high on + // (alpha, beta), then that move is singular and should be extended. 
To + // verify this we do a reduced search on the position excluding the ttMove + // and if the result is lower than ttValue minus a margin, then we will + // extend the ttMove. Recursive singular search is avoided. - // Note: the depth margin and singularBeta margin are known for having non-linear - // scaling. Their values are optimized to time controls of 180+1.8 and longer - // so changing them requires tests at these types of time controls. - // Recursive singular search is avoided. - if (!rootNode && move == ttMove && !excludedMove - && depth >= 4 - (thisThread->completedDepth > 32) + ss->ttPv - && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && (tte->bound() & BOUND_LOWER) - && tte->depth() >= depth - 3) + // Note: the depth margin and singularBeta margin are known for having + // non-linear scaling. Their values are optimized to time controls of + // 180+1.8 and longer so changing them requires tests at these types of + // time controls. Generally, higher singularBeta (i.e closer to ttValue) + // and lower extension margins scale well. + + if (!rootNode && move == ttData.move && !excludedMove + && depth >= 4 - (thisThread->completedDepth > 33) + ss->ttPv + && is_valid(ttData.value) && !is_decisive(ttData.value) + && (ttData.bound & BOUND_LOWER) && ttData.depth >= depth - 3) { - Value singularBeta = ttValue - (65 + 52 * (ss->ttPv && !PvNode)) * depth / 63; + Value singularBeta = ttData.value - (56 + 79 * (ss->ttPv && !PvNode)) * depth / 64; Depth singularDepth = newDepth / 2; ss->excludedMove = move; @@ -1051,55 +1085,46 @@ moves_loop: // When in check, search starts here if (value < singularBeta) { - int doubleMargin = 251 * PvNode - 241 * !ttCapture; - int tripleMargin = - 135 + 234 * PvNode - 248 * !ttCapture + 124 * (ss->ttPv || !ttCapture); - int quadMargin = 447 + 354 * PvNode - 300 * !ttCapture + 206 * ss->ttPv; + int doubleMargin = 249 * PvNode - 194 * !ttCapture; + int tripleMargin = 94 + 287 * PvNode - 249 * !ttCapture + 99 * ss->ttPv; extension = 1 + (value < singularBeta - doubleMargin) - + (value < singularBeta - tripleMargin) - + (value < singularBeta - quadMargin); + + (value < singularBeta - tripleMargin); depth += ((!PvNode) && (depth < 14)); } // Multi-cut pruning // Our ttMove is assumed to fail high based on the bound of the TT entry, - // and if after excluding the ttMove with a reduced search we fail high over the original beta, - // we assume this expected cut-node is not singular (multiple moves fail high), - // and we can prune the whole subtree by returning a softbound. - else if (singularBeta >= beta) - { - if (!ttCapture) - update_quiet_histories(pos, ss, *this, ttMove, -stat_malus(depth)); - - return singularBeta; - } + // and if after excluding the ttMove with a reduced search we fail high + // over the original beta, we assume this expected cut-node is not + // singular (multiple moves fail high), and we can prune the whole + // subtree by returning a softbound. 
+ else if (value >= beta && !is_decisive(value)) + return value; // Negative extensions - // If other moves failed high over (ttValue - margin) without the ttMove on a reduced search, - // but we cannot do multi-cut because (ttValue - margin) is lower than the original beta, - // we do not know if the ttMove is singular or can do a multi-cut, - // so we reduce the ttMove in favor of other moves based on some conditions: + // If other moves failed high over (ttValue - margin) without the + // ttMove on a reduced search, but we cannot do multi-cut because + // (ttValue - margin) is lower than the original beta, we do not know + // if the ttMove is singular or can do a multi-cut, so we reduce the + // ttMove in favor of other moves based on some conditions: // If the ttMove is assumed to fail high over current beta (~7 Elo) - else if (ttValue >= beta) + else if (ttData.value >= beta) extension = -3; - // If we are on a cutNode but the ttMove is not assumed to fail high over current beta (~1 Elo) + // If we are on a cutNode but the ttMove is not assumed to fail high + // over current beta (~1 Elo) else if (cutNode) extension = -2; - - // If the ttMove is assumed to fail low over the value of the reduced search (~1 Elo) - else if (ttValue <= value) - extension = -1; } - // Extension for capturing the previous moved piece (~0 Elo on STC, ~1 Elo on LTC) - else if (PvNode && move == ttMove && move.to_sq() == prevSq + // Extension for capturing the previous moved piece (~1 Elo at LTC) + else if (PvNode && move.to_sq() == prevSq && thisThread->captureHistory[movedPiece][move.to_sq()] [type_of(pos.piece_on(move.to_sq()))] - > 4016) + > 4321) extension = 1; } @@ -1113,68 +1138,77 @@ moves_loop: // When in check, search starts here ss->currentMove = move; ss->continuationHistory = &thisThread->continuationHistory[ss->inCheck][capture][movedPiece][move.to_sq()]; - + ss->continuationCorrectionHistory = + &thisThread->continuationCorrectionHistory[movedPiece][move.to_sq()]; uint64_t nodeCount = rootNode ? uint64_t(nodes) : 0; // Step 16. Make the move thisThread->nodes.fetch_add(1, std::memory_order_relaxed); pos.do_move(move, st, givesCheck); + // These reduction adjustments have proven non-linear scaling. + // They are optimized to time controls of 180 + 1.8 and longer, + // so changing them or adding conditions that are similar requires + // tests at these types of time controls. 
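[Editor's note] Throughout this hunk the reduction r is kept in 1024ths of a ply, so small history- and node-type-based nudges can accumulate before the searched depth changes by a whole ply. A minimal sketch of that fixed-point arithmetic, with an ad-hoc helper name:

    #include <algorithm>

    // r is a reduction expressed in 1024ths of a ply, as in the hunk above.
    int reduced_depth(int newDepth, int r) {
        // Only the integer part of the reduction is applied to the depth.
        return std::max(1, newDepth - r / 1024);
    }
    // Example with constants from this hunk: +2518 (cut node) combined with
    // -1879 (ttMove) nets 639, i.e. still no full extra ply of reduction.
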
+ // Decrease reduction if position is or has been on the PV (~7 Elo) if (ss->ttPv) - r -= 1 + (ttValue > alpha) + (tte->depth() >= depth); - - else if (cutNode && move != ttMove && move != ss->killers[0]) - r++; - - // Increase reduction for cut nodes (~4 Elo) - if (cutNode) - r += 2 - (tte->depth() >= depth && ss->ttPv); - - // Increase reduction if ttMove is a capture (~3 Elo) - if (ttCapture) - r++; + r -= 1024 + (ttData.value > alpha) * 1024 + (ttData.depth >= depth) * 1024; // Decrease reduction for PvNodes (~0 Elo on STC, ~2 Elo on LTC) if (PvNode) - r--; + r -= 1024; + + // These reduction adjustments have no proven non-linear scaling + + // Increase reduction for cut nodes (~4 Elo) + if (cutNode) + r += 2518 - (ttData.depth >= depth && ss->ttPv) * 991; + + // Increase reduction if ttMove is a capture but the current move is not a capture (~3 Elo) + if (ttCapture && !capture) + r += 1043 + (depth < 8) * 999; // Increase reduction if next ply has a lot of fail high (~5 Elo) if ((ss + 1)->cutoffCnt > 3) - r++; + r += 938 + allNode * 960; - // Set reduction to 0 for first picked move (ttMove) (~2 Elo) - // Nullifies all previous reduction adjustments to ttMove and leaves only history to do them - else if (move == ttMove) - r = 0; + // For first picked move (ttMove) reduce reduction (~3 Elo) + else if (move == ttData.move) + r -= 1879; - ss->statScore = 2 * thisThread->mainHistory[us][move.from_to()] - + (*contHist[0])[movedPiece][move.to_sq()] - + (*contHist[1])[movedPiece][move.to_sq()] - + (*contHist[3])[movedPiece][move.to_sq()] - 5078; + if (capture) + ss->statScore = + 7 * int(PieceValue[pos.captured_piece()]) + + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(pos.captured_piece())] + - 5000; + else + ss->statScore = 2 * thisThread->mainHistory[us][move.from_to()] + + (*contHist[0])[movedPiece][move.to_sq()] + + (*contHist[1])[movedPiece][move.to_sq()] - 3996; // Decrease/increase reduction for moves with a good/bad history (~8 Elo) - r -= ss->statScore / 12076; + r -= ss->statScore * 1287 / 16384; // Step 17. Late moves reduction / extension (LMR, ~117 Elo) - if (depth >= 2 && moveCount > 1 + rootNode) + if (depth >= 2 && moveCount > 1) { // In general we want to cap the LMR depth search at newDepth, but when // reduction is negative, we allow this move a limited search extension // beyond the first move depth. // To prevent problems when the max value is less than the min value, // std::clamp has been replaced by a more robust implementation. - Depth d = std::max(1, std::min(newDepth - r, newDepth + 1)); + Depth d = std::max(1, std::min(newDepth - r / 1024, newDepth + !allNode)); value = -search(pos, ss + 1, -(alpha + 1), -alpha, d, true); // Do a full-depth search when reduced LMR search fails high if (value > alpha && d < newDepth) { - // Adjust full-depth search based on LMR results - if the result - // was good enough search deeper, if it was bad enough search shallower. - const bool doDeeperSearch = value > (bestValue + 40 + 2 * newDepth); // (~1 Elo) - const bool doShallowerSearch = value < bestValue + newDepth; // (~2 Elo) + // Adjust full-depth search based on LMR results - if the result was + // good enough search deeper, if it was bad enough search shallower. 
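[Editor's sketch] The post-LMR verification step described above can be summarized as follows: after the reduced search fails high, the full-depth re-search is nudged deeper or shallower depending on how far the reduced result landed from the current best value. The constants are the ones in the lines that follow; the helper itself is illustrative only.

    int verified_depth(int newDepth, int value, int bestValue) {
        const bool doDeeperSearch    = value > bestValue + 42 + 2 * newDepth;  // (~1 Elo)
        const bool doShallowerSearch = value < bestValue + 10;                 // (~2 Elo)
        return newDepth + doDeeperSearch - doShallowerSearch;
    }
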
+ const bool doDeeperSearch = value > (bestValue + 42 + 2 * newDepth); // (~1 Elo) + const bool doShallowerSearch = value < bestValue + 10; // (~2 Elo) newDepth += doDeeperSearch - doShallowerSearch; @@ -1182,10 +1216,7 @@ moves_loop: // When in check, search starts here value = -search(pos, ss + 1, -(alpha + 1), -alpha, newDepth, !cutNode); // Post LMR continuation history updates (~1 Elo) - int bonus = value <= alpha ? -stat_malus(newDepth) - : value >= beta ? stat_bonus(newDepth) - : 0; - + int bonus = 2 * (value >= beta) * stat_bonus(newDepth); update_continuation_histories(ss, movedPiece, move.to_sq(), bonus); } } @@ -1194,11 +1225,12 @@ moves_loop: // When in check, search starts here else if (!PvNode || moveCount > 1) { // Increase reduction if ttMove is not present (~6 Elo) - if (!ttMove) - r += 2; + if (!ttData.move) + r += 2037; // Note that if expected reduction is high, we reduce search depth by 1 here (~9 Elo) - value = -search(pos, ss + 1, -(alpha + 1), -alpha, newDepth - (r > 3), !cutNode); + value = + -search(pos, ss + 1, -(alpha + 1), -alpha, newDepth - (r > 2983), !cutNode); } // For PV nodes only, do a full PV search on the first move or after a fail high, @@ -1208,6 +1240,10 @@ moves_loop: // When in check, search starts here (ss + 1)->pv = pv; (ss + 1)->pv[0] = Move::none(); + // Extend move from transposition table if we are about to dive into qsearch. + if (move == ttData.move && ss->ply <= thisThread->rootDepth * 2) + newDepth = std::max(newDepth, 1); + value = -search(pos, ss + 1, -beta, -alpha, newDepth, false); } @@ -1218,8 +1254,8 @@ moves_loop: // When in check, search starts here // Step 20. Check for a new best move // Finished searching the move. If a stop occurred, the return value of - // the search cannot be trusted, and we return immediately without - // updating best move, PV and TT. + // the search cannot be trusted, and we return immediately without updating + // best move, principal variation nor transposition table. if (threads.stop.load(std::memory_order_relaxed)) return VALUE_ZERO; @@ -1231,7 +1267,11 @@ moves_loop: // When in check, search starts here rm.effort += nodes - nodeCount; rm.averageScore = - rm.averageScore != -VALUE_INFINITE ? (2 * value + rm.averageScore) / 3 : value; + rm.averageScore != -VALUE_INFINITE ? (value + rm.averageScore) / 2 : value; + + rm.meanSquaredScore = rm.meanSquaredScore != -VALUE_INFINITE * VALUE_INFINITE + ? (value * std::abs(value) + rm.meanSquaredScore) / 2 + : value * std::abs(value); // PV move or new best move? if (moveCount == 1 || value > alpha) @@ -1271,11 +1311,16 @@ moves_loop: // When in check, search starts here rm.score = -VALUE_INFINITE; } - if (value > bestValue) + // In case we have an alternative move equal in eval to the current bestmove, + // promote it to bestmove by pretending it just exceeds alpha (but not beta). 
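[Editor's sketch] The root tie-break introduced below works like this: roughly one node in sixteen, and only close to the root horizon, an equal-scoring move is allowed to displace the current best move by pretending it scores one point higher. is_win_sketch() and its threshold are assumptions standing in for the engine's is_win(); the rest of the condition mirrors the patch.

    #include <cstdlib>

    constexpr int kWinThreshold = 31000;  // assumed TB-win bound, illustrative only
    inline bool is_win_sketch(int v) { return v >= kWinThreshold; }

    inline bool promote_equal_move(int value, int bestValue, int ply,
                                   int rootDepth, unsigned long long nodes) {
        int inc = value == bestValue
               && ply + 2 >= rootDepth
               && (nodes & 15) == 0
               && !is_win_sketch(std::abs(value) + 1);
        return value + inc > bestValue;  // mirrors "value + inc > bestValue" in the patch
    }
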
+ int inc = (value == bestValue && ss->ply + 2 >= thisThread->rootDepth + && (int(nodes) & 15) == 0 && !is_win(std::abs(value) + 1)); + + if (value + inc > bestValue) { bestValue = value; - if (value > alpha) + if (value + inc > alpha) { bestMove = move; @@ -1284,14 +1329,14 @@ moves_loop: // When in check, search starts here if (value >= beta) { - ss->cutoffCnt += 1 + !ttMove; + ss->cutoffCnt += !ttData.move + (extension < 2); assert(value >= beta); // Fail high break; } else { // Reduce other moves if we have found at least one score improvement (~2 Elo) - if (depth > 2 && depth < 13 && beta < 15868 && value > -14630) + if (depth > 2 && depth < 14 && !is_decisive(value)) depth -= 2; assert(depth > 0); @@ -1305,9 +1350,9 @@ moves_loop: // When in check, search starts here if (move != bestMove && moveCount <= 32) { if (capture) - capturesSearched[captureCount++] = move; + capturesSearched.push_back(move); else - quietsSearched[quietCount++] = move; + quietsSearched.push_back(move); } } @@ -1319,30 +1364,54 @@ moves_loop: // When in check, search starts here assert(moveCount || !ss->inCheck || excludedMove || !MoveList(pos).size()); // Adjust best value for fail high cases at non-pv nodes - if (!PvNode && bestValue >= beta && std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY - && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY && std::abs(alpha) < VALUE_TB_WIN_IN_MAX_PLY) + if (!PvNode && bestValue >= beta && !is_decisive(bestValue) && !is_decisive(beta) + && !is_decisive(alpha)) bestValue = (bestValue * depth + beta) / (depth + 1); if (!moveCount) bestValue = excludedMove ? alpha : ss->inCheck ? mated_in(ss->ply) : VALUE_DRAW; - // If there is a move that produces search value greater than alpha we update the stats of searched moves + // If there is a move that produces search value greater than alpha, + // we update the stats of searched moves. 
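[Editor's sketch] The hunk that follows awards a weighted bonus to the previous move (the prior countermove) when this node fails low. The weights and the statScore-proportional term are taken from the patch; the boolean parameters are simplified stand-ins for the stack conditions spelled out there.

    #include <algorithm>

    int prior_countermove_bonus(int depth, bool allNode, int prevMoveCount,
                                bool evalDropHere, bool evalDropPrev,
                                int prevStatScore) {
        int bonus = 117 * (depth > 5)
                  + 39  * !allNode
                  + 168 * (prevMoveCount > 8)
                  + 115 * evalDropHere    // bestValue <= ss->staticEval - 108, not in check
                  + 119 * evalDropPrev;   // bestValue <= -(ss-1)->staticEval - 83, opponent not in check

        // Proportional to "how much damage we have to undo"
        bonus += std::min(-prevStatScore / 113, 300);

        return std::max(bonus, 0);
    }
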
else if (bestMove) - update_all_stats(pos, ss, *this, bestMove, bestValue, beta, prevSq, quietsSearched, - quietCount, capturesSearched, captureCount, depth); + update_all_stats(pos, ss, *this, bestMove, prevSq, quietsSearched, capturesSearched, depth); // Bonus for prior countermove that caused the fail low else if (!priorCapture && prevSq != SQ_NONE) { - int bonus = (depth > 5) + (PvNode || cutNode) + ((ss - 1)->statScore < -14455) - + ((ss - 1)->moveCount > 10) + (!ss->inCheck && bestValue <= ss->staticEval - 130) - + (!(ss - 1)->inCheck && bestValue <= -(ss - 1)->staticEval - 77); + int bonus = (117 * (depth > 5) + 39 * !allNode + 168 * ((ss - 1)->moveCount > 8) + + 115 * (!ss->inCheck && bestValue <= ss->staticEval - 108) + + 119 * (!(ss - 1)->inCheck && bestValue <= -(ss - 1)->staticEval - 83)); + + // Proportional to "how much damage we have to undo" + bonus += std::min(-(ss - 1)->statScore / 113, 300); + + bonus = std::max(bonus, 0); + update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, - stat_bonus(depth) * bonus); + stat_bonus(depth) * bonus / 93); thisThread->mainHistory[~us][((ss - 1)->currentMove).from_to()] - << stat_bonus(depth) * bonus / 2; + << stat_bonus(depth) * bonus / 179; + + + if (type_of(pos.piece_on(prevSq)) != PAWN && ((ss - 1)->currentMove).type_of() != PROMOTION) + thisThread->pawnHistory[pawn_structure_index(pos)][pos.piece_on(prevSq)][prevSq] + << stat_bonus(depth) * bonus / 24; } + else if (priorCapture && prevSq != SQ_NONE) + { + // bonus for prior countermoves that caused the fail low + Piece capturedPiece = pos.captured_piece(); + assert(capturedPiece != NO_PIECE); + thisThread->captureHistory[pos.piece_on(prevSq)][prevSq][type_of(capturedPiece)] + << stat_bonus(depth) * 2; + } + + // Bonus when search fails low and there is a TT move + else if (ttData.move && !allNode) + thisThread->mainHistory[us][ttData.move.from_to()] << stat_bonus(depth) * 23 / 100; + if (PvNode) bestValue = std::min(bestValue, maxValue); @@ -1351,23 +1420,36 @@ moves_loop: // When in check, search starts here if (bestValue <= alpha) ss->ttPv = ss->ttPv || ((ss - 1)->ttPv && depth > 3); - // Write gathered information in transposition table - // Static evaluation is saved as it was before correction history + // Write gathered information in transposition table. Note that the + // static evaluation is saved as it was before correction history. if (!excludedMove && !(rootNode && thisThread->pvIdx)) - tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, - bestValue >= beta ? BOUND_LOWER - : PvNode && bestMove ? BOUND_EXACT - : BOUND_UPPER, - depth, bestMove, unadjustedStaticEval, tt.generation()); + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, + bestValue >= beta ? BOUND_LOWER + : PvNode && bestMove ? 
BOUND_EXACT + : BOUND_UPPER, + depth, bestMove, unadjustedStaticEval, tt.generation()); // Adjust correction history - if (!ss->inCheck && (!bestMove || !pos.capture(bestMove)) - && !(bestValue >= beta && bestValue <= ss->staticEval) - && !(!bestMove && bestValue >= ss->staticEval)) + if (!ss->inCheck && !(bestMove && pos.capture(bestMove)) + && ((bestValue < ss->staticEval && bestValue < beta) // negative correction & no fail high + || (bestValue > ss->staticEval && bestMove))) // positive correction & no fail low { + const auto m = (ss - 1)->currentMove; + static const int nonPawnWeight = 154; + auto bonus = std::clamp(int(bestValue - ss->staticEval) * depth / 8, -CORRECTION_HISTORY_LIMIT / 4, CORRECTION_HISTORY_LIMIT / 4); - thisThread->correctionHistory[us][pawn_structure_index(pos)] << bonus; + thisThread->pawnCorrectionHistory[us][pawn_structure_index(pos)] + << bonus * 107 / 128; + thisThread->majorPieceCorrectionHistory[us][major_piece_index(pos)] << bonus * 162 / 128; + thisThread->minorPieceCorrectionHistory[us][minor_piece_index(pos)] << bonus * 148 / 128; + thisThread->nonPawnCorrectionHistory[WHITE][us][non_pawn_index(pos)] + << bonus * nonPawnWeight / 128; + thisThread->nonPawnCorrectionHistory[BLACK][us][non_pawn_index(pos)] + << bonus * nonPawnWeight / 128; + + if (m.is_ok()) + (*(ss - 2)->continuationCorrectionHistory)[pos.piece_on(m.to_sq())][m.to_sq()] << bonus; } assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE); @@ -1376,22 +1458,23 @@ moves_loop: // When in check, search starts here } -// Quiescence search function, which is called by the main search -// function with zero depth, or recursively with further decreasing depth per call. -// (~155 Elo) +// Quiescence search function, which is called by the main search function with +// depth zero, or recursively with further decreasing depth. With depth <= 0, we +// "should" be using static eval only, but tactical moves may confuse the static eval. +// To fight this horizon effect, we implement this qsearch of tactical moves (~155 Elo). +// See https://www.chessprogramming.org/Horizon_Effect +// and https://www.chessprogramming.org/Quiescence_Search template -Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth) { +Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta) { static_assert(nodeType != Root); constexpr bool PvNode = nodeType == PV; assert(alpha >= -VALUE_INFINITE && alpha < beta && beta <= VALUE_INFINITE); assert(PvNode || (alpha == beta - 1)); - assert(depth <= 0); - // Check if we have an upcoming move that draws by repetition, or if - // the opponent had an alternative move earlier to this position. (~1 Elo) - if (alpha < VALUE_DRAW && pos.has_game_cycle(ss->ply)) + // Check if we have an upcoming move that draws by repetition (~1 Elo) + if (alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply)) { alpha = value_draw(this->nodes); if (alpha >= beta) @@ -1402,14 +1485,12 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); - TTEntry* tte; - Key posKey; - Move ttMove, move, bestMove; - Depth ttDepth; - Value bestValue, value, ttValue, futilityBase; - bool pvHit, givesCheck, capture; - int moveCount; - Color us = pos.side_to_move(); + Key posKey; + Move move, bestMove; + Value bestValue, value, futilityBase; + bool pvHit, givesCheck, capture; + int moveCount; + Color us = pos.side_to_move(); // Step 1. 
Initialize node if (PvNode) @@ -1430,26 +1511,25 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, // Step 2. Check for an immediate draw or maximum ply reached if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY) return (ss->ply >= MAX_PLY && !ss->inCheck) - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) + ? evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) : VALUE_DRAW; assert(0 <= ss->ply && ss->ply < MAX_PLY); - // Decide the replacement and cutoff priority of the qsearch TT entries - ttDepth = ss->inCheck || depth >= DEPTH_QS_CHECKS ? DEPTH_QS_CHECKS : DEPTH_QS_NO_CHECKS; - // Step 3. Transposition table lookup - posKey = pos.key(); - tte = tt.probe(posKey, ss->ttHit); - ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; - ttMove = ss->ttHit ? tte->move() : Move::none(); - pvHit = ss->ttHit && tte->is_pv(); + posKey = pos.key(); + auto [ttHit, ttData, ttWriter] = tt.probe(posKey); + // Need further processing of the saved data + ss->ttHit = ttHit; + ttData.move = ttHit ? ttData.move : Move::none(); + ttData.value = ttHit ? value_from_tt(ttData.value, ss->ply, pos.rule50_count()) : VALUE_NONE; + pvHit = ttHit && ttData.is_pv; // At non-PV nodes we check for an early TT cutoff - if (!PvNode && tte->depth() >= ttDepth - && ttValue != VALUE_NONE // Only in case of TT access race or if !ttHit - && (tte->bound() & (ttValue >= beta ? BOUND_LOWER : BOUND_UPPER))) - return ttValue; + if (!PvNode && ttData.depth >= DEPTH_QS + && is_valid(ttData.value) // Can happen when !ttHit or when access race in probe() + && (ttData.bound & (ttData.value >= beta ? BOUND_LOWER : BOUND_UPPER))) + return ttData.value; // Step 4. Static evaluation of the position Value unadjustedStaticEval = VALUE_NONE; @@ -1460,64 +1540,64 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, if (ss->ttHit) { // Never assume anything about values stored in TT - unadjustedStaticEval = tte->eval(); - if (unadjustedStaticEval == VALUE_NONE) + unadjustedStaticEval = ttData.eval; + if (!is_valid(unadjustedStaticEval)) unadjustedStaticEval = - evaluate(networks, pos, refreshTable, thisThread->optimism[us]); + evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]); ss->staticEval = bestValue = - to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); + to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos, ss); // ttValue can be used as a better position evaluation (~13 Elo) - if (ttValue != VALUE_NONE - && (tte->bound() & (ttValue > bestValue ? BOUND_LOWER : BOUND_UPPER))) - bestValue = ttValue; + if (is_valid(ttData.value) && !is_decisive(ttData.value) + && (ttData.bound & (ttData.value > bestValue ? BOUND_LOWER : BOUND_UPPER))) + bestValue = ttData.value; } else { - // In case of null move search, use previous static eval with a different sign - unadjustedStaticEval = (ss - 1)->currentMove != Move::null() - ? evaluate(networks, pos, refreshTable, thisThread->optimism[us]) - : -(ss - 1)->staticEval; - ss->staticEval = bestValue = - to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos); + // In case of null move search, use previous static eval with opposite sign + unadjustedStaticEval = + (ss - 1)->currentMove != Move::null() + ? 
evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) + : -(ss - 1)->staticEval; + ss->staticEval = bestValue = + to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos, ss); } // Stand pat. Return immediately if static value is at least beta if (bestValue >= beta) { + if (!is_decisive(bestValue)) + bestValue = (bestValue + beta) / 2; if (!ss->ttHit) - tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, DEPTH_NONE, - Move::none(), unadjustedStaticEval, tt.generation()); - + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, + DEPTH_UNSEARCHED, Move::none(), unadjustedStaticEval, + tt.generation()); return bestValue; } if (bestValue > alpha) alpha = bestValue; - futilityBase = ss->staticEval + 270; + futilityBase = ss->staticEval + 306; } const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory, (ss - 2)->continuationHistory}; - // Initialize a MovePicker object for the current position, and prepare - // to search the moves. Because the depth is <= 0 here, only captures, - // queen promotions, and other checks (only if depth >= DEPTH_QS_CHECKS) - // will be generated. - Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; - MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory, &thisThread->captureHistory, - contHist, &thisThread->pawnHistory); + Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; - int quietCheckEvasions = 0; + // Initialize a MovePicker object for the current position, and prepare to search + // the moves. We presently use two stages of move generator in quiescence search: + // captures, or evasions only when in check. + MovePicker mp(pos, ttData.move, DEPTH_QS, &thisThread->mainHistory, &thisThread->lowPlyHistory, + &thisThread->captureHistory, contHist, &thisThread->pawnHistory, ss->ply); - // Step 5. Loop through all pseudo-legal moves until no moves remain - // or a beta cutoff occurs. + // Step 5. Loop through all pseudo-legal moves until no moves remain or a beta + // cutoff occurs. while ((move = mp.next_move()) != Move::none()) { assert(move.is_ok()); - // Check for legality if (!pos.legal(move)) continue; @@ -1527,10 +1607,10 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, moveCount++; // Step 6. Pruning - if (bestValue > VALUE_TB_LOSS_IN_MAX_PLY && pos.non_pawn_material(us)) + if (!is_loss(bestValue) && pos.non_pawn_material(us)) { // Futility pruning and moveCount pruning (~10 Elo) - if (!givesCheck && move.to_sq() != prevSq && futilityBase > VALUE_TB_LOSS_IN_MAX_PLY + if (!givesCheck && move.to_sq() != prevSq && !is_loss(futilityBase) && move.type_of() != PROMOTION) { if (moveCount > 2) @@ -1538,44 +1618,34 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Value futilityValue = futilityBase + PieceValue[pos.piece_on(move.to_sq())]; - // If static eval + value of piece we are going to capture is much lower - // than alpha we can prune this move. (~2 Elo) + // If static eval + value of piece we are going to capture is + // much lower than alpha, we can prune this move. (~2 Elo) if (futilityValue <= alpha) { bestValue = std::max(bestValue, futilityValue); continue; } - // If static eval is much lower than alpha and move is not winning material + // If static exchange evaluation is low enough // we can prune this move. 
(~2 Elo) - if (futilityBase <= alpha && !pos.see_ge(move, 1)) + if (!pos.see_ge(move, alpha - futilityBase)) { - bestValue = std::max(bestValue, futilityBase); - continue; - } - - // If static exchange evaluation is much worse than what is needed to not - // fall below alpha we can prune this move. - if (futilityBase > alpha && !pos.see_ge(move, (alpha - futilityBase) * 4)) - { - bestValue = alpha; + bestValue = std::min(alpha, futilityBase); continue; } } - // We prune after the second quiet check evasion move, where being 'in check' is - // implicitly checked through the counter, and being a 'quiet move' apart from - // being a tt move is assumed after an increment because captures are pushed ahead. - if (quietCheckEvasions > 1) - break; - // Continuation history based pruning (~3 Elo) - if (!capture && (*contHist[0])[pos.moved_piece(move)][move.to_sq()] < 0 - && (*contHist[1])[pos.moved_piece(move)][move.to_sq()] < 0) + if (!capture + && (*contHist[0])[pos.moved_piece(move)][move.to_sq()] + + (*contHist[1])[pos.moved_piece(move)][move.to_sq()] + + thisThread->pawnHistory[pawn_structure_index(pos)][pos.moved_piece(move)] + [move.to_sq()] + <= 5095) continue; // Do not search moves with bad enough SEE values (~5 Elo) - if (!pos.see_ge(move, -69)) + if (!pos.see_ge(move, -83)) continue; } @@ -1587,13 +1657,13 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, ss->continuationHistory = &thisThread ->continuationHistory[ss->inCheck][capture][pos.moved_piece(move)][move.to_sq()]; - - quietCheckEvasions += !capture && ss->inCheck; + ss->continuationCorrectionHistory = + &thisThread->continuationCorrectionHistory[pos.moved_piece(move)][move.to_sq()]; // Step 7. Make and search the move thisThread->nodes.fetch_add(1, std::memory_order_relaxed); pos.do_move(move, st, givesCheck); - value = -qsearch(pos, ss + 1, -beta, -alpha, depth - 1); + value = -qsearch(pos, ss + 1, -beta, -alpha); pos.undo_move(move); assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); @@ -1619,61 +1689,67 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, } // Step 9. Check for mate - // All legal moves have been searched. A special case: if we're in check - // and no legal moves were found, it is checkmate. + // All legal moves have been searched. A special case: if we are + // in check and no legal moves were found, it is checkmate. if (ss->inCheck && bestValue == -VALUE_INFINITE) { assert(!MoveList(pos).size()); return mated_in(ss->ply); // Plies to mate from the root } - if (std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY && bestValue >= beta) + if (!is_decisive(bestValue) && bestValue >= beta) bestValue = (3 * bestValue + beta) / 4; - // Save gathered info in transposition table - // Static evaluation is saved as it was before adjustment by correction history - tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit, - bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove, - unadjustedStaticEval, tt.generation()); + // Save gathered info in transposition table. The static evaluation + // is saved as it was before adjustment by correction history. + ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), pvHit, + bestValue >= beta ? 
BOUND_LOWER : BOUND_UPPER, DEPTH_QS, bestMove, + unadjustedStaticEval, tt.generation()); assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE); return bestValue; } -Depth Search::Worker::reduction(bool i, Depth d, int mn, int delta) { +Depth Search::Worker::reduction(bool i, Depth d, int mn, int delta) const { int reductionScale = reductions[d] * reductions[mn]; - return (reductionScale + 1318 - delta * 760 / rootDelta) / 1024 + (!i && reductionScale > 1066); + return (reductionScale + 1304 - delta * 814 / rootDelta) + (!i && reductionScale > 1423) * 1135; } +// elapsed() returns the time elapsed since the search started. If the +// 'nodestime' option is enabled, it will return the count of nodes searched +// instead. This function is called to check whether the search should be +// stopped based on predefined thresholds like time limits or nodes searched. +// +// elapsed_time() returns the actual time elapsed since the start of the search. +// This function is intended for use only when printing PV outputs, and not used +// for making decisions within the search algorithm itself. TimePoint Search::Worker::elapsed() const { return main_manager()->tm.elapsed([this]() { return threads.nodes_searched(); }); } +TimePoint Search::Worker::elapsed_time() const { return main_manager()->tm.elapsed_time(); } + namespace { -// Adjusts a mate or TB score from "plies to mate from the root" -// to "plies to mate from the current position". Standard scores are unchanged. +// Adjusts a mate or TB score from "plies to mate from the root" to +// "plies to mate from the current position". Standard scores are unchanged. // The function is called before storing a value in the transposition table. -Value value_to_tt(Value v, int ply) { - - assert(v != VALUE_NONE); - return v >= VALUE_TB_WIN_IN_MAX_PLY ? v + ply : v <= VALUE_TB_LOSS_IN_MAX_PLY ? v - ply : v; -} +Value value_to_tt(Value v, int ply) { return is_win(v) ? v + ply : is_loss(v) ? v - ply : v; } -// Inverse of value_to_tt(): it adjusts a mate or TB score -// from the transposition table (which refers to the plies to mate/be mated from -// current position) to "plies to mate/be mated (TB win/loss) from the root". -// However, to avoid potentially false mate or TB scores related to the 50 moves rule -// and the graph history interaction, we return the highest non-TB score instead. +// Inverse of value_to_tt(): it adjusts a mate or TB score from the transposition +// table (which refers to the plies to mate/be mated from current position) to +// "plies to mate/be mated (TB win/loss) from the root". However, to avoid +// potentially false mate or TB scores related to the 50 moves rule and the +// graph history interaction, we return the highest non-TB score instead. Value value_from_tt(Value v, int ply, int r50c) { - if (v == VALUE_NONE) + if (!is_valid(v)) return VALUE_NONE; // handle TB win or better - if (v >= VALUE_TB_WIN_IN_MAX_PLY) + if (is_win(v)) { // Downgrade a potentially false mate score if (v >= VALUE_MATE_IN_MAX_PLY && VALUE_MATE - v > 100 - r50c) @@ -1687,7 +1763,7 @@ Value value_from_tt(Value v, int ply, int r50c) { } // handle TB loss or worse - if (v <= VALUE_TB_LOSS_IN_MAX_PLY) + if (is_loss(v)) { // Downgrade a potentially false mate score. 
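[Editor's sketch] The store/load ply shift implemented by value_to_tt() and value_from_tt() here: mate and TB scores are saved relative to the node that writes them and re-based relative to the probing node. The thresholds below are assumed placeholder values, and the 50-move-rule downgrade handled in the surrounding branches is deliberately omitted.

    constexpr int kMateSketch = 32000;              // assumed mate score, illustrative
    constexpr int kDecisive   = kMateSketch - 512;  // assumed TB-win bound, illustrative

    inline bool win_s(int v)  { return v >=  kDecisive; }
    inline bool loss_s(int v) { return v <= -kDecisive; }

    inline int to_tt_s(int v, int ply)   { return win_s(v) ? v + ply : loss_s(v) ? v - ply : v; }
    inline int from_tt_s(int v, int ply) { return win_s(v) ? v - ply : loss_s(v) ? v + ply : v; }

    // Round trip: from_tt_s(to_tt_s(v, ply), ply) == v for any ply.
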
if (v <= VALUE_MATED_IN_MAX_PLY && VALUE_MATE + v > 100 - r50c) @@ -1714,118 +1790,88 @@ void update_pv(Move* pv, Move move, const Move* childPv) { // Updates stats at the end of search() when a bestMove is found -void update_all_stats(const Position& pos, - Stack* ss, - Search::Worker& workerThread, - Move bestMove, - Value bestValue, - Value beta, - Square prevSq, - Move* quietsSearched, - int quietCount, - Move* capturesSearched, - int captureCount, - Depth depth) { +void update_all_stats(const Position& pos, + Stack* ss, + Search::Worker& workerThread, + Move bestMove, + Square prevSq, + ValueList& quietsSearched, + ValueList& capturesSearched, + Depth depth) { CapturePieceToHistory& captureHistory = workerThread.captureHistory; Piece moved_piece = pos.moved_piece(bestMove); PieceType captured; - int quietMoveBonus = stat_bonus(depth + 1); - int quietMoveMalus = stat_malus(depth); + int bonus = stat_bonus(depth); + int malus = stat_malus(depth); if (!pos.capture_stage(bestMove)) { - int bestMoveBonus = bestValue > beta + 165 ? quietMoveBonus // larger bonus - : stat_bonus(depth); // smaller bonus - - update_quiet_stats(pos, ss, workerThread, bestMove, bestMoveBonus); + update_quiet_histories(pos, ss, workerThread, bestMove, bonus); // Decrease stats for all non-best quiet moves - for (int i = 0; i < quietCount; ++i) - update_quiet_histories(pos, ss, workerThread, quietsSearched[i], -quietMoveMalus); + for (Move move : quietsSearched) + update_quiet_histories(pos, ss, workerThread, move, -malus); } else { // Increase stats for the best move in case it was a capture move captured = type_of(pos.piece_on(bestMove.to_sq())); - captureHistory[moved_piece][bestMove.to_sq()][captured] << quietMoveBonus; + captureHistory[moved_piece][bestMove.to_sq()][captured] << bonus; } - // Extra penalty for a quiet early move that was not a TT move or - // main killer move in previous ply when it gets refuted. - if (prevSq != SQ_NONE - && ((ss - 1)->moveCount == 1 + (ss - 1)->ttHit - || ((ss - 1)->currentMove == (ss - 1)->killers[0])) - && !pos.captured_piece()) - update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, -quietMoveMalus); + // Extra penalty for a quiet early move that was not a TT move in + // previous ply when it gets refuted. + if (prevSq != SQ_NONE && ((ss - 1)->moveCount == 1 + (ss - 1)->ttHit) && !pos.captured_piece()) + update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq, -malus); // Decrease stats for all non-best capture moves - for (int i = 0; i < captureCount; ++i) + for (Move move : capturesSearched) { - moved_piece = pos.moved_piece(capturesSearched[i]); - captured = type_of(pos.piece_on(capturesSearched[i].to_sq())); - captureHistory[moved_piece][capturesSearched[i].to_sq()][captured] << -quietMoveMalus; + moved_piece = pos.moved_piece(move); + captured = type_of(pos.piece_on(move.to_sq())); + captureHistory[moved_piece][move.to_sq()][captured] << -malus; } } -// Updates histories of the move pairs formed -// by moves at ply -1, -2, -3, -4, and -6 with current move. +// Updates histories of the move pairs formed by moves +// at ply -1, -2, -3, -4, and -6 with current move. 
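[Editor's sketch] The continuation-history fan-out performed by the function that follows: one bonus, pre-scaled by 50/64, is shared with the moves played 1, 2, 3, 4 and 6 plies earlier, the ply-3 slot receives only half of it, and while in check only the two nearest plies are touched. The table type and the plain "+=" are simplifications of the engine's gated history update.

    #include <array>
    #include <initializer_list>

    using ContTable = std::array<std::array<int, 64>, 16>;  // [piece][to-square], illustrative

    void update_cont_hist_sketch(ContTable* histByPly[7], bool inCheck,
                                 int piece, int to, int bonus) {
        bonus = bonus * 50 / 64;
        for (int i : {1, 2, 3, 4, 6}) {
            if (inCheck && i > 2)
                break;                              // only the nearest plies while in check
            if (histByPly[i])                       // stands in for currentMove.is_ok()
                (*histByPly[i])[piece][to] += bonus / (1 + (i == 3));
        }
    }
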
void update_continuation_histories(Stack* ss, Piece pc, Square to, int bonus) { + bonus = bonus * 50 / 64; + for (int i : {1, 2, 3, 4, 6}) { // Only update the first 2 continuation histories if we are in check if (ss->inCheck && i > 2) break; if (((ss - i)->currentMove).is_ok()) - (*(ss - i)->continuationHistory)[pc][to] << bonus / (1 + 3 * (i == 3)); + (*(ss - i)->continuationHistory)[pc][to] << bonus / (1 + (i == 3)); } } // Updates move sorting heuristics -void update_refutations(const Position& pos, Stack* ss, Search::Worker& workerThread, Move move) { - - // Update killers - if (ss->killers[0] != move) - { - ss->killers[1] = ss->killers[0]; - ss->killers[0] = move; - } - - // Update countermove history - if (((ss - 1)->currentMove).is_ok()) - { - Square prevSq = ((ss - 1)->currentMove).to_sq(); - workerThread.counterMoves[pos.piece_on(prevSq)][prevSq] = move; - } -} void update_quiet_histories( const Position& pos, Stack* ss, Search::Worker& workerThread, Move move, int bonus) { Color us = pos.side_to_move(); workerThread.mainHistory[us][move.from_to()] << bonus; + if (ss->ply < LOW_PLY_HISTORY_SIZE) + workerThread.lowPlyHistory[ss->ply][move.from_to()] << bonus; update_continuation_histories(ss, pos.moved_piece(move), move.to_sq(), bonus); int pIndex = pawn_structure_index(pos); - workerThread.pawnHistory[pIndex][pos.moved_piece(move)][move.to_sq()] << bonus; -} - -// Updates move sorting heuristics -void update_quiet_stats( - const Position& pos, Stack* ss, Search::Worker& workerThread, Move move, int bonus) { - - update_refutations(pos, ss, workerThread, move); - update_quiet_histories(pos, ss, workerThread, move, bonus); + workerThread.pawnHistory[pIndex][pos.moved_piece(move)][move.to_sq()] << bonus / 2; } } -// When playing with strength handicap, choose the best move among a set of RootMoves -// using a statistical rule dependent on 'level'. Idea by Heinz van Saanen. +// When playing with strength handicap, choose the best move among a set of +// RootMoves using a statistical rule dependent on 'level'. Idea by Heinz van Saanen. Move Skill::pick_best(const RootMoves& rootMoves, size_t multiPV) { static PRNG rng(now()); // PRNG sequence should be non-deterministic @@ -1856,8 +1902,8 @@ Move Skill::pick_best(const RootMoves& rootMoves, size_t multiPV) { } -// Used to print debug info and, more importantly, -// to detect when we are out of available time and thus stop the search. +// Used to print debug info and, more importantly, to detect +// when we are out of available time and thus stop the search. void SearchManager::check_time(Search::Worker& worker) { if (--callsCnt > 0) return; @@ -1890,18 +1936,152 @@ void SearchManager::check_time(Search::Worker& worker) { worker.threads.stop = worker.threads.abortedSearch = true; } -void SearchManager::pv(const Search::Worker& worker, +// Used to correct and extend PVs for moves that have a TB (but not a mate) score. +// Keeps the search based PV for as long as it is verified to maintain the game +// outcome, truncates afterwards. Finally, extends to mate the PV, providing a +// possible continuation (but not a proven mating line). 
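[Editor's sketch] The time guard used by syzygy_extend_pv() below: PV verification and extension are abandoned once they have consumed more than half of the configured Move Overhead, but only when time management is active. The factory function and its parameters are illustrative; the chrono arithmetic mirrors the patch.

    #include <chrono>

    auto make_time_abort(int moveOverheadMs, bool useTimeManagement) {
        auto t_start = std::chrono::steady_clock::now();
        return [=]() -> bool {
            auto t_end = std::chrono::steady_clock::now();
            return useTimeManagement
                && 2 * std::chrono::duration<double, std::milli>(t_end - t_start).count()
                       > moveOverheadMs;
        };
    }
    // Usage: auto time_abort = make_time_abort(30, true); stop extending once time_abort() is true.
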
+void syzygy_extend_pv(const OptionsMap& options, + const Search::LimitsType& limits, + Position& pos, + RootMove& rootMove, + Value& v) { + + auto t_start = std::chrono::steady_clock::now(); + int moveOverhead = int(options["Move Overhead"]); + + // Do not use more than moveOverhead / 2 time, if time management is active + auto time_abort = [&t_start, &moveOverhead, &limits]() -> bool { + auto t_end = std::chrono::steady_clock::now(); + return limits.use_time_management() + && 2 * std::chrono::duration(t_end - t_start).count() + > moveOverhead; + }; + + std::list sts; + + // Step 0, do the rootMove, no correction allowed, as needed for MultiPV in TB. + auto& stRoot = sts.emplace_back(); + pos.do_move(rootMove.pv[0], stRoot); + int ply = 1; + + // Step 1, walk the PV to the last position in TB with correct decisive score + while (size_t(ply) < rootMove.pv.size()) + { + Move& pvMove = rootMove.pv[ply]; + + RootMoves legalMoves; + for (const auto& m : MoveList(pos)) + legalMoves.emplace_back(m); + + Tablebases::Config config = Tablebases::rank_root_moves(options, pos, legalMoves); + RootMove& rm = *std::find(legalMoves.begin(), legalMoves.end(), pvMove); + + if (legalMoves[0].tbRank != rm.tbRank) + break; + + ply++; + + auto& st = sts.emplace_back(); + pos.do_move(pvMove, st); + + // Do not allow for repetitions or drawing moves along the PV in TB regime + if (config.rootInTB && pos.is_draw(ply)) + { + pos.undo_move(pvMove); + ply--; + break; + } + + // Full PV shown will thus be validated and end in TB. + // If we cannot validate the full PV in time, we do not show it. + if (config.rootInTB && time_abort()) + break; + } + + // Resize the PV to the correct part + rootMove.pv.resize(ply); + + // Step 2, now extend the PV to mate, as if the user explored syzygy-tables.info + // using top ranked moves (minimal DTZ), which gives optimal mates only for simple + // endgames e.g. KRvK. + while (!pos.is_draw(0)) + { + if (time_abort()) + break; + + RootMoves legalMoves; + for (const auto& m : MoveList(pos)) + { + auto& rm = legalMoves.emplace_back(m); + StateInfo tmpSI; + pos.do_move(m, tmpSI); + // Give a score of each move to break DTZ ties restricting opponent mobility, + // but not giving the opponent a capture. + for (const auto& mOpp : MoveList(pos)) + rm.tbRank -= pos.capture(mOpp) ? 100 : 1; + pos.undo_move(m); + } + + // Mate found + if (legalMoves.size() == 0) + break; + + // Sort moves according to their above assigned rank. + // This will break ties for moves with equal DTZ in rank_root_moves. + std::stable_sort( + legalMoves.begin(), legalMoves.end(), + [](const Search::RootMove& a, const Search::RootMove& b) { return a.tbRank > b.tbRank; }); + + // The winning side tries to minimize DTZ, the losing side maximizes it + Tablebases::Config config = Tablebases::rank_root_moves(options, pos, legalMoves, true); + + // If DTZ is not available we might not find a mate, so we bail out + if (!config.rootInTB || config.cardinality > 0) + break; + + ply++; + + Move& pvMove = legalMoves[0].pv[0]; + rootMove.pv.push_back(pvMove); + auto& st = sts.emplace_back(); + pos.do_move(pvMove, st); + } + + // Finding a draw in this function is an exceptional case, that cannot happen + // during engine game play, since we have a winning score, and play correctly + // with TB support. However, it can be that a position is draw due to the 50 move + // rule if it has been been reached on the board with a non-optimal 50 move counter + // (e.g. 
8/8/6k1/3B4/3K4/4N3/8/8 w - - 54 106 ) which TB with dtz counter rounding + // cannot always correctly rank. See also + // https://github.com/official-stockfish/Stockfish/issues/5175#issuecomment-2058893495 + // We adjust the score to match the found PV. Note that a TB loss score can be + // displayed if the engine did not find a drawing move yet, but eventually search + // will figure it out (e.g. 1kq5/q2r4/5K2/8/8/8/8/7Q w - - 96 1 ) + if (pos.is_draw(0)) + v = VALUE_DRAW; + + // Undo the PV moves + for (auto it = rootMove.pv.rbegin(); it != rootMove.pv.rend(); ++it) + pos.undo_move(*it); + + // Inform if we couldn't get a full extension in time + if (time_abort()) + sync_cout + << "info string Syzygy based PV extension requires more time, increase Move Overhead as needed." + << sync_endl; +} + +void SearchManager::pv(Search::Worker& worker, const ThreadPool& threads, const TranspositionTable& tt, - Depth depth) const { + Depth depth) { - const auto nodes = threads.nodes_searched(); - const auto& rootMoves = worker.rootMoves; - const auto& pos = worker.rootPos; - size_t pvIdx = worker.pvIdx; - TimePoint time = tm.elapsed([nodes]() { return nodes; }) + 1; - size_t multiPV = std::min(size_t(worker.options["MultiPV"]), rootMoves.size()); - uint64_t tbHits = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0); + const auto nodes = threads.nodes_searched(); + auto& rootMoves = worker.rootMoves; + auto& pos = worker.rootPos; + size_t pvIdx = worker.pvIdx; + size_t multiPV = std::min(size_t(worker.options["MultiPV"]), rootMoves.size()); + uint64_t tbHits = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0); for (size_t i = 0; i < multiPV; ++i) { @@ -1919,11 +2099,18 @@ void SearchManager::pv(const Search::Worker& worker, bool tb = worker.tbConfig.rootInTB && std::abs(v) <= VALUE_TB; v = tb ? 
rootMoves[i].tbScore : v; + bool isExact = i != pvIdx || tb || !updated; // tablebase- and previous-scores are exact + + // Potentially correct and extend the PV, and in exceptional cases v + if (is_decisive(v) && std::abs(v) < VALUE_MATE_IN_MAX_PLY + && ((!rootMoves[i].scoreLowerbound && !rootMoves[i].scoreUpperbound) || isExact)) + syzygy_extend_pv(worker.options, worker.limits, pos, rootMoves[i], v); + std::string pv; for (Move m : rootMoves[i].pv) pv += UCIEngine::move(m, pos.is_chess960()) + " "; - // remove last whitespace + // Remove last whitespace if (!pv.empty()) pv.pop_back(); @@ -1940,15 +2127,16 @@ void SearchManager::pv(const Search::Worker& worker, info.score = {v, pos}; info.wdl = wdl; - if (i == pvIdx && !tb && updated) // tablebase- and previous-scores are exact + if (!isExact) info.bound = bound; - info.timeMs = time; - info.nodes = nodes; - info.nps = nodes * 1000 / time; - info.tbHits = tbHits; - info.pv = pv; - info.hashfull = tt.hashfull(); + TimePoint time = tm.elapsed_time() + 1; + info.timeMs = time; + info.nodes = nodes; + info.nps = nodes * 1000 / time; + info.tbHits = tbHits; + info.pv = pv; + info.hashfull = tt.hashfull(); updates.onUpdateFull(info); } @@ -1963,20 +2151,17 @@ bool RootMove::extract_ponder_from_tt(const TranspositionTable& tt, Position& po StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); - bool ttHit; - assert(pv.size() == 1); if (pv[0] == Move::none()) return false; pos.do_move(pv[0], st); - TTEntry* tte = tt.probe(pos.key(), ttHit); + auto [ttHit, ttData, ttWriter] = tt.probe(pos.key()); if (ttHit) { - Move m = tte->move(); // Local copy to be SMP safe - if (MoveList(pos).contains(m)) - pv.push_back(m); + if (MoveList(pos).contains(ttData.move)) + pv.push_back(ttData.move); } pos.undo_move(pv[0]); diff --git a/src/search.h b/src/search.h index cb73a5af..b618855b 100644 --- a/src/search.h +++ b/src/search.h @@ -19,6 +19,7 @@ #ifndef SEARCH_H_INCLUDED #define SEARCH_H_INCLUDED +#include #include #include #include @@ -30,21 +31,19 @@ #include #include +#include "history.h" #include "misc.h" -#include "movepick.h" +#include "nnue/network.h" +#include "nnue/nnue_accumulator.h" +#include "numa.h" #include "position.h" #include "score.h" #include "syzygy/tbprobe.h" #include "timeman.h" #include "types.h" -#include "nnue/nnue_accumulator.h" namespace Stockfish { -namespace Eval::NNUE { -struct Networks; -} - // Different node types, used as a template parameter enum NodeType { NonPV, @@ -62,19 +61,19 @@ namespace Search { // shallower and deeper in the tree during the search. Each search thread has // its own array of Stack objects, indexed by the current ply. struct Stack { - Move* pv; - PieceToHistory* continuationHistory; - int ply; - Move currentMove; - Move excludedMove; - Move killers[2]; - Value staticEval; - int statScore; - int moveCount; - bool inCheck; - bool ttPv; - bool ttHit; - int cutoffCnt; + Move* pv; + PieceToHistory* continuationHistory; + CorrectionHistory* continuationCorrectionHistory; + int ply; + Move currentMove; + Move excludedMove; + Value staticEval; + int statScore; + int moveCount; + bool inCheck; + bool ttPv; + bool ttHit; + int cutoffCnt; }; @@ -92,15 +91,16 @@ struct RootMove { return m.score != score ? 
m.score < score : m.previousScore < previousScore; } - uint64_t effort = 0; - Value score = -VALUE_INFINITE; - Value previousScore = -VALUE_INFINITE; - Value averageScore = -VALUE_INFINITE; - Value uciScore = -VALUE_INFINITE; - bool scoreLowerbound = false; - bool scoreUpperbound = false; - int selDepth = 0; - int tbRank = 0; + uint64_t effort = 0; + Value score = -VALUE_INFINITE; + Value previousScore = -VALUE_INFINITE; + Value averageScore = -VALUE_INFINITE; + Value meanSquaredScore = -VALUE_INFINITE * VALUE_INFINITE; + Value uciScore = -VALUE_INFINITE; + bool scoreLowerbound = false; + bool scoreUpperbound = false; + int selDepth = 0; + int tbRank = 0; Value tbScore; std::vector pv; }; @@ -133,19 +133,19 @@ struct LimitsType { // The UCI stores the uci options, thread pool, and transposition table. // This struct is used to easily forward data to the Search::Worker class. struct SharedState { - SharedState(const OptionsMap& optionsMap, - ThreadPool& threadPool, - TranspositionTable& transpositionTable, - const Eval::NNUE::Networks& nets) : + SharedState(const OptionsMap& optionsMap, + ThreadPool& threadPool, + TranspositionTable& transpositionTable, + const LazyNumaReplicated& nets) : options(optionsMap), threads(threadPool), tt(transpositionTable), networks(nets) {} - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const Eval::NNUE::Networks& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const LazyNumaReplicated& networks; }; class Worker; @@ -182,6 +182,34 @@ struct InfoIteration { size_t currmovenumber; }; +// Skill structure is used to implement strength limit. If we have a UCI_Elo, +// we convert it to an appropriate skill level, anchored to the Stash engine. +// This method is based on a fit of the Elo results for games played between +// Stockfish at various skill levels and various versions of the Stash engine. +// Skill 0 .. 19 now covers CCRL Blitz Elo from 1320 to 3190, approximately +// Reference: https://github.com/vondele/Stockfish/commit/a08b8d4e9711c2 +struct Skill { + // Lowest and highest Elo ratings used in the skill level calculation + constexpr static int LowestElo = 1320; + constexpr static int HighestElo = 3190; + + Skill(int skill_level, int uci_elo) { + if (uci_elo) + { + double e = double(uci_elo - LowestElo) / (HighestElo - LowestElo); + level = std::clamp((((37.2473 * e - 40.8525) * e + 22.2943) * e - 0.311438), 0.0, 19.0); + } + else + level = double(skill_level); + } + bool enabled() const { return level < 20.0; } + bool time_to_pick(Depth depth) const { return depth == 1 + int(level); } + Move pick_best(const RootMoves&, size_t multiPV); + + double level; + Move best = Move::none(); +}; + // SearchManager manages the search from the main thread. It is responsible for // keeping track of the time, and storing data strictly related to the main thread. class SearchManager: public ISearchManager { @@ -204,12 +232,13 @@ class SearchManager: public ISearchManager { void check_time(Search::Worker& worker) override; - void pv(const Search::Worker& worker, + void pv(Search::Worker& worker, const ThreadPool& threads, const TranspositionTable& tt, - Depth depth) const; + Depth depth); Stockfish::TimeManagement tm; + double originalTimeAdjust; int callsCnt; std::atomic_bool ponder; @@ -235,47 +264,55 @@ class NullSearchManager: public ISearchManager { // of the search history, and storing data required for the search. 
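[Editor's sketch] A standalone, runnable sketch of the UCI_Elo to skill-level mapping defined in the Skill struct above, together with a small usage example. The polynomial and the Elo anchors are the ones in the header; the helper name and the printed table are illustrative.

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>

    double skill_level_from_elo(int uciElo) {
        constexpr int LowestElo  = 1320;
        constexpr int HighestElo = 3190;
        double e = double(uciElo - LowestElo) / (HighestElo - LowestElo);
        // Cubic fit against the Stash engine, clamped to the 0..19 level range.
        return std::clamp((((37.2473 * e - 40.8525) * e + 22.2943) * e - 0.311438), 0.0, 19.0);
    }

    int main() {
        for (int elo : {1320, 2000, 2850, 3190})
            std::printf("UCI_Elo %d -> level %.2f\n", elo, skill_level_from_elo(elo));
    }
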
class Worker { public: - Worker(SharedState&, std::unique_ptr, size_t); + Worker(SharedState&, std::unique_ptr, size_t, NumaReplicatedAccessToken); - // Called at instantiation to initialize Reductions tables - // Reset histories, usually before a new game + // Called at instantiation to initialize reductions tables. + // Reset histories, usually before a new game. void clear(); // Called when the program receives the UCI 'go' command. // It searches from the root position and outputs the "bestmove". void start_searching(); - bool is_mainthread() const { return thread_idx == 0; } + bool is_mainthread() const { return threadIdx == 0; } + + void ensure_network_replicated(); // Public because they need to be updatable by the stats - CounterMoveHistory counterMoves; - ButterflyHistory mainHistory; + ButterflyHistory mainHistory; + LowPlyHistory lowPlyHistory; + CapturePieceToHistory captureHistory; ContinuationHistory continuationHistory[2][2]; PawnHistory pawnHistory; - CorrectionHistory correctionHistory; + + CorrectionHistory pawnCorrectionHistory; + CorrectionHistory majorPieceCorrectionHistory; + CorrectionHistory minorPieceCorrectionHistory; + CorrectionHistory nonPawnCorrectionHistory[COLOR_NB]; + CorrectionHistory continuationCorrectionHistory; private: void iterative_deepening(); - // Main search function for both PV and non-PV nodes + // This is the main search function, for both PV and non-PV nodes template Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode); // Quiescence search function, which is called by the main search template - Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth = 0); + Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta); - Depth reduction(bool i, Depth d, int mn, int delta); + Depth reduction(bool i, Depth d, int mn, int delta) const; - // Get a pointer to the search manager, only allowed to be called by the - // main thread. + // Pointer to the search manager, only allowed to be called by the main thread SearchManager* main_manager() const { - assert(thread_idx == 0); + assert(threadIdx == 0); return static_cast(manager.get()); } TimePoint elapsed() const; + TimePoint elapsed_time() const; LimitsType limits; @@ -291,7 +328,8 @@ class Worker { Depth rootDepth, completedDepth; Value rootDelta; - size_t thread_idx; + size_t threadIdx; + NumaReplicatedAccessToken numaAccessToken; // Reductions lookup table initialized at startup std::array reductions; // [depth or moveNumber] @@ -301,10 +339,10 @@ class Worker { Tablebases::Config tbConfig; - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const Eval::NNUE::Networks& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const LazyNumaReplicated& networks; // Used by NNUE Eval::NNUE::AccumulatorCaches refreshTable; diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp index 722dc9d3..9b24e700 100644 --- a/src/syzygy/tbprobe.cpp +++ b/src/syzygy/tbprobe.cpp @@ -66,7 +66,7 @@ namespace { constexpr int TBPIECES = 7; // Max number of supported pieces constexpr int MAX_DTZ = - 1 << 18; // Max DTZ supported, large enough to deal with the syzygy TB limit. + 1 << 18; // Max DTZ supported times 2, large enough to deal with the syzygy TB limit. 
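[Editor's sketch] The DTZ-to-rank mapping used further down in root_probe(), with the MAX_DTZ / 2 rescaling and the optional rankDTZ tie-break this patch introduces. Parameter names are stand-ins for the probed values; the branch structure matches the patched expression.

    constexpr int MAX_DTZ_SKETCH = 1 << 18;

    int dtz_rank(int dtz, int cnt50, bool repeated, bool rankDTZ) {
        if (dtz > 0)   // winning side: prefer smaller DTZ
            return (dtz + cnt50 <= 99 && !repeated)
                     ? MAX_DTZ_SKETCH - (rankDTZ ? dtz : 0)        // certain win
                     : MAX_DTZ_SKETCH / 2 - (dtz + cnt50);          // win threatened by the 50-move rule
        if (dtz < 0)   // losing side
            return (-dtz * 2 + cnt50 < 100)
                     ? -MAX_DTZ_SKETCH - (rankDTZ ? dtz : 0)        // certain loss
                     : -MAX_DTZ_SKETCH / 2 + (-dtz + cnt50);        // a 50-move draw is in sight
        return 0;                                                   // drawn
    }
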
enum { BigEndian, @@ -443,6 +443,8 @@ class TBTables { std::deque> wdlTable; std::deque> dtzTable; + size_t foundDTZFiles = 0; + size_t foundWDLFiles = 0; void insert(Key key, TBTable* wdl, TBTable* dtz) { uint32_t homeBucket = uint32_t(key) & (Size - 1); @@ -486,9 +488,16 @@ class TBTables { memset(hashTable, 0, sizeof(hashTable)); wdlTable.clear(); dtzTable.clear(); + foundDTZFiles = 0; + foundWDLFiles = 0; } - size_t size() const { return wdlTable.size(); } - void add(const std::vector& pieces); + + void info() const { + sync_cout << "info string Found " << foundWDLFiles << " WDL and " << foundDTZFiles + << " DTZ tablebase files (up to " << MaxCardinality << "-man)." << sync_endl; + } + + void add(const std::vector& pieces); }; TBTables TBTables; @@ -501,13 +510,22 @@ void TBTables::add(const std::vector& pieces) { for (PieceType pt : pieces) code += PieceToChar[pt]; + code.insert(code.find('K', 1), "v"); - TBFile file(code.insert(code.find('K', 1), "v") + ".rtbw"); // KRK -> KRvK + TBFile file_dtz(code + ".rtbz"); // KRK -> KRvK + if (file_dtz.is_open()) + { + file_dtz.close(); + foundDTZFiles++; + } + + TBFile file(code + ".rtbw"); // KRK -> KRvK if (!file.is_open()) // Only WDL file is checked return; file.close(); + foundWDLFiles++; MaxCardinality = std::max(int(pieces.size()), MaxCardinality); @@ -1326,7 +1344,7 @@ void Tablebases::init(const std::string& paths) { MaxCardinality = 0; TBFile::Paths = paths; - if (paths.empty() || paths == "") + if (paths.empty()) return; // MapB1H1H7[] encodes a square below a1-h8 diagonal to 0..27 @@ -1466,7 +1484,7 @@ void Tablebases::init(const std::string& paths) { } } - sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl; + TBTables.info(); } // Probe the WDL table for a particular position. @@ -1574,7 +1592,10 @@ int Tablebases::probe_dtz(Position& pos, ProbeState* result) { // Use the DTZ tables to rank root moves. // // A return value false indicates that not all probes were successful. -bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves, bool rule50) { +bool Tablebases::root_probe(Position& pos, + Search::RootMoves& rootMoves, + bool rule50, + bool rankDTZ) { ProbeState result = OK; StateInfo st; @@ -1585,7 +1606,7 @@ bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves, bool ru // Check whether a position was repeated since the last zeroing move. bool rep = pos.has_repeated(); - int dtz, bound = rule50 ? (MAX_DTZ - 100) : 1; + int dtz, bound = rule50 ? (MAX_DTZ / 2 - 100) : 1; // Probe and rank each move for (auto& m : rootMoves) @@ -1624,8 +1645,10 @@ bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves, bool ru // Better moves are ranked higher. Certain wins are ranked equally. // Losing moves are ranked equally unless a 50-move draw is in sight. - int r = dtz > 0 ? (dtz + cnt50 <= 99 && !rep ? MAX_DTZ : MAX_DTZ - (dtz + cnt50)) - : dtz < 0 ? (-dtz * 2 + cnt50 < 100 ? -MAX_DTZ : -MAX_DTZ + (-dtz + cnt50)) + int r = dtz > 0 ? (dtz + cnt50 <= 99 && !rep ? MAX_DTZ - (rankDTZ ? dtz : 0) + : MAX_DTZ / 2 - (dtz + cnt50)) + : dtz < 0 ? (-dtz * 2 + cnt50 < 100 ? -MAX_DTZ - (rankDTZ ? dtz : 0) + : -MAX_DTZ / 2 + (-dtz + cnt50)) : 0; m.tbRank = r; @@ -1633,10 +1656,11 @@ bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves, bool ru // 1 cp to cursed wins and let it grow to 49 cp as the positions gets // closer to a real win. m.tbScore = r >= bound ? VALUE_MATE - MAX_PLY - 1 - : r > 0 ? 
Value((std::max(3, r - (MAX_DTZ - 200)) * int(PawnValue)) / 200) - : r == 0 ? VALUE_DRAW - : r > -bound ? Value((std::min(-3, r + (MAX_DTZ - 200)) * int(PawnValue)) / 200) - : -VALUE_MATE + MAX_PLY + 1; + : r > 0 ? Value((std::max(3, r - (MAX_DTZ / 2 - 200)) * int(PawnValue)) / 200) + : r == 0 ? VALUE_DRAW + : r > -bound + ? Value((std::min(-3, r + (MAX_DTZ / 2 - 200)) * int(PawnValue)) / 200) + : -VALUE_MATE + MAX_PLY + 1; } return true; @@ -1683,7 +1707,8 @@ bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves, boo Config Tablebases::rank_root_moves(const OptionsMap& options, Position& pos, - Search::RootMoves& rootMoves) { + Search::RootMoves& rootMoves, + bool rankDTZ) { Config config; if (rootMoves.empty()) @@ -1707,7 +1732,7 @@ Config Tablebases::rank_root_moves(const OptionsMap& options, if (config.cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) { // Rank moves using DTZ tables - config.rootInTB = root_probe(pos, rootMoves, options["Syzygy50MoveRule"]); + config.rootInTB = root_probe(pos, rootMoves, options["Syzygy50MoveRule"], rankDTZ); if (!config.rootInTB) { diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h index e10950f4..75a18585 100644 --- a/src/syzygy/tbprobe.h +++ b/src/syzygy/tbprobe.h @@ -66,9 +66,12 @@ extern int MaxCardinality; void init(const std::string& paths); WDLScore probe_wdl(Position& pos, ProbeState* result); int probe_dtz(Position& pos, ProbeState* result); -bool root_probe(Position& pos, Search::RootMoves& rootMoves, bool rule50); +bool root_probe(Position& pos, Search::RootMoves& rootMoves, bool rule50, bool rankDTZ); bool root_probe_wdl(Position& pos, Search::RootMoves& rootMoves, bool rule50); -Config rank_root_moves(const OptionsMap& options, Position& pos, Search::RootMoves& rootMoves); +Config rank_root_moves(const OptionsMap& options, + Position& pos, + Search::RootMoves& rootMoves, + bool rankDTZ = false); } // namespace Stockfish::Tablebases diff --git a/src/thread.cpp b/src/thread.cpp index 9052654b..5f73771e 100644 --- a/src/thread.cpp +++ b/src/thread.cpp @@ -22,19 +22,17 @@ #include #include #include +#include #include #include -#include -#include "misc.h" #include "movegen.h" #include "search.h" #include "syzygy/tbprobe.h" #include "timeman.h" -#include "tt.h" #include "types.h" -#include "ucioption.h" #include "uci.h" +#include "ucioption.h" namespace Stockfish { @@ -42,13 +40,24 @@ namespace Stockfish { // in idle_loop(). Note that 'searching' and 'exit' should be already set. Thread::Thread(Search::SharedState& sharedState, std::unique_ptr sm, - size_t n) : - worker(std::make_unique(sharedState, std::move(sm), n)), + size_t n, + OptionalThreadToNumaNodeBinder binder) : idx(n), nthreads(sharedState.options["Threads"]), stdThread(&Thread::idle_loop, this) { wait_for_search_finished(); + + run_custom_job([this, &binder, &sharedState, &sm, n]() { + // Use the binder to [maybe] bind the threads to a NUMA node before doing + // the Worker allocation. Ideally we would also allocate the SearchManager + // here, but that's minor. 
+ this->numaAccessToken = binder(); + this->worker = + std::make_unique(sharedState, std::move(sm), n, this->numaAccessToken); + }); + + wait_for_search_finished(); } @@ -63,38 +72,42 @@ Thread::~Thread() { stdThread.join(); } - // Wakes up the thread that will start the search void Thread::start_searching() { - mutex.lock(); - searching = true; - mutex.unlock(); // Unlock before notifying saves a few CPU-cycles - cv.notify_one(); // Wake up the thread in idle_loop() + assert(worker != nullptr); + run_custom_job([this]() { worker->start_searching(); }); } +// Clears the histories for the thread worker (usually before a new game) +void Thread::clear_worker() { + assert(worker != nullptr); + run_custom_job([this]() { worker->clear(); }); +} -// Blocks on the condition variable -// until the thread has finished searching. +// Blocks on the condition variable until the thread has finished searching void Thread::wait_for_search_finished() { std::unique_lock lk(mutex); cv.wait(lk, [&] { return !searching; }); } +// Launching a function in the thread +void Thread::run_custom_job(std::function f) { + { + std::unique_lock lk(mutex); + cv.wait(lk, [&] { return !searching; }); + jobFunc = std::move(f); + searching = true; + } + cv.notify_one(); +} -// Thread gets parked here, blocked on the -// condition variable, when it has no work to do. +void Thread::ensure_network_replicated() { worker->ensure_network_replicated(); } + +// Thread gets parked here, blocked on the condition variable +// when the thread has no work to do. void Thread::idle_loop() { - - // If OS already scheduled us on a different group than 0 then don't overwrite - // the choice, eventually we are one of many one-threaded processes running on - // some Windows NUMA hardware, for instance in fishtest. To make it simple, - // just check if running threads are below a threshold, in this case, all this - // NUMA machinery is not needed. - if (nthreads > 8) - WinProcGroup::bind_this_thread(idx); - while (true) { std::unique_lock lk(mutex); @@ -105,15 +118,17 @@ void Thread::idle_loop() { if (exit) return; + std::function job = std::move(jobFunc); + jobFunc = nullptr; + lk.unlock(); - worker->start_searching(); + if (job) + job(); } } -Search::SearchManager* ThreadPool::main_manager() { - return static_cast(main_thread()->worker.get()->manager.get()); -} +Search::SearchManager* ThreadPool::main_manager() { return main_thread()->worker->main_manager(); } uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); } uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); } @@ -121,59 +136,107 @@ uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits // Creates/destroys threads to match the requested number. // Created and launched threads will immediately go to sleep in idle_loop. // Upon resizing, threads are recreated to allow for binding if necessary. 
-void ThreadPool::set(Search::SharedState sharedState, +void ThreadPool::set(const NumaConfig& numaConfig, + Search::SharedState sharedState, const Search::SearchManager::UpdateContext& updateContext) { if (threads.size() > 0) // destroy any existing thread(s) { main_thread()->wait_for_search_finished(); - while (threads.size() > 0) - delete threads.back(), threads.pop_back(); + threads.clear(); + + boundThreadToNumaNode.clear(); } const size_t requested = sharedState.options["Threads"]; if (requested > 0) // create new thread(s) { - auto manager = std::make_unique(updateContext); - threads.push_back(new Thread(sharedState, std::move(manager), 0)); + // Binding threads may be problematic when there's multiple NUMA nodes and + // multiple Stockfish instances running. In particular, if each instance + // runs a single thread then they would all be mapped to the first NUMA node. + // This is undesirable, and so the default behaviour (i.e. when the user does not + // change the NumaConfig UCI setting) is to not bind the threads to processors + // unless we know for sure that we span NUMA nodes and replication is required. + const std::string numaPolicy(sharedState.options["NumaPolicy"]); + const bool doBindThreads = [&]() { + if (numaPolicy == "none") + return false; + + if (numaPolicy == "auto") + return numaConfig.suggests_binding_threads(requested); + + // numaPolicy == "system", or explicitly set by the user + return true; + }(); + + boundThreadToNumaNode = doBindThreads + ? numaConfig.distribute_threads_among_numa_nodes(requested) + : std::vector{}; while (threads.size() < requested) { - auto null_manager = std::make_unique(); - threads.push_back(new Thread(sharedState, std::move(null_manager), threads.size())); + const size_t threadId = threads.size(); + const NumaIndex numaId = doBindThreads ? boundThreadToNumaNode[threadId] : 0; + auto manager = threadId == 0 ? std::unique_ptr( + std::make_unique(updateContext)) + : std::make_unique(); + + // When not binding threads we want to force all access to happen + // from the same NUMA node, because in case of NUMA replicated memory + // accesses we don't want to trash cache in case the threads get scheduled + // on the same NUMA node. + auto binder = doBindThreads ? 
OptionalThreadToNumaNodeBinder(numaConfig, numaId) + : OptionalThreadToNumaNodeBinder(numaId); + + threads.emplace_back( + std::make_unique(sharedState, std::move(manager), threadId, binder)); } clear(); main_thread()->wait_for_search_finished(); - - // Reallocate the hash with the new threadpool size - sharedState.tt.resize(sharedState.options["Hash"], requested); } } // Sets threadPool data to initial values void ThreadPool::clear() { - - for (Thread* th : threads) - th->worker->clear(); - if (threads.size() == 0) return; - main_manager()->callsCnt = 0; - main_manager()->bestPreviousScore = VALUE_INFINITE; + for (auto&& th : threads) + th->clear_worker(); + + for (auto&& th : threads) + th->wait_for_search_finished(); + + // These two affect the time taken on the first move of a game: main_manager()->bestPreviousAverageScore = VALUE_INFINITE; - main_manager()->previousTimeReduction = 1.0; + main_manager()->previousTimeReduction = 0.85; + + main_manager()->callsCnt = 0; + main_manager()->bestPreviousScore = VALUE_INFINITE; + main_manager()->originalTimeAdjust = -1; main_manager()->tm.clear(); } +void ThreadPool::run_on_thread(size_t threadId, std::function f) { + assert(threads.size() > threadId); + threads[threadId]->run_custom_job(std::move(f)); +} -// Wakes up main thread waiting in idle_loop() and -// returns immediately. Main thread will wake up other threads and start the search. +void ThreadPool::wait_on_thread(size_t threadId) { + assert(threads.size() > threadId); + threads[threadId]->wait_for_search_finished(); +} + +size_t ThreadPool::num_threads() const { return threads.size(); } + + +// Wakes up main thread waiting in idle_loop() and returns immediately. +// Main thread will wake up other threads and start the search. void ThreadPool::start_thinking(const OptionsMap& options, Position& pos, StateListPtr& states, @@ -213,33 +276,38 @@ void ThreadPool::start_thinking(const OptionsMap& options, // We use Position::set() to set root position across threads. But there are // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot // be deduced from a fen string, so set() clears them and they are set from - // setupStates->back() later. The rootState is per thread, earlier states are shared - // since they are read-only. - for (Thread* th : threads) + // setupStates->back() later. The rootState is per thread, earlier states are + // shared since they are read-only. 
+ for (auto&& th : threads) { - th->worker->limits = limits; - th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly = - th->worker->bestMoveChanges = 0; - th->worker->rootDepth = th->worker->completedDepth = 0; - th->worker->rootMoves = rootMoves; - th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState); - th->worker->rootState = setupStates->back(); - th->worker->tbConfig = tbConfig; + th->run_custom_job([&]() { + th->worker->limits = limits; + th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly = + th->worker->bestMoveChanges = 0; + th->worker->rootDepth = th->worker->completedDepth = 0; + th->worker->rootMoves = rootMoves; + th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState); + th->worker->rootState = setupStates->back(); + th->worker->tbConfig = tbConfig; + }); } + for (auto&& th : threads) + th->wait_for_search_finished(); + main_thread()->start_searching(); } Thread* ThreadPool::get_best_thread() const { - Thread* bestThread = threads.front(); + Thread* bestThread = threads.front().get(); Value minScore = VALUE_NONE; std::unordered_map votes( 2 * std::min(size(), bestThread->worker->rootMoves.size())); // Find the minimum score of all threads - for (Thread* th : threads) + for (auto&& th : threads) minScore = std::min(minScore, th->worker->rootMoves[0].score); // Vote according to score and depth, and select the best thread @@ -247,10 +315,10 @@ Thread* ThreadPool::get_best_thread() const { return (th->worker->rootMoves[0].score - minScore + 14) * int(th->worker->completedDepth); }; - for (Thread* th : threads) - votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th); + for (auto&& th : threads) + votes[th->worker->rootMoves[0].pv[0]] += thread_voting_value(th.get()); - for (Thread* th : threads) + for (auto&& th : threads) { const auto bestThreadScore = bestThread->worker->rootMoves[0].score; const auto newThreadScore = th->worker->rootMoves[0].score; @@ -261,59 +329,82 @@ Thread* ThreadPool::get_best_thread() const { const auto bestThreadMoveVote = votes[bestThreadPV[0]]; const auto newThreadMoveVote = votes[newThreadPV[0]]; - const bool bestThreadInProvenWin = bestThreadScore >= VALUE_TB_WIN_IN_MAX_PLY; - const bool newThreadInProvenWin = newThreadScore >= VALUE_TB_WIN_IN_MAX_PLY; + const bool bestThreadInProvenWin = is_win(bestThreadScore); + const bool newThreadInProvenWin = is_win(newThreadScore); const bool bestThreadInProvenLoss = - bestThreadScore != -VALUE_INFINITE && bestThreadScore <= VALUE_TB_LOSS_IN_MAX_PLY; + bestThreadScore != -VALUE_INFINITE && is_loss(bestThreadScore); const bool newThreadInProvenLoss = - newThreadScore != -VALUE_INFINITE && newThreadScore <= VALUE_TB_LOSS_IN_MAX_PLY; + newThreadScore != -VALUE_INFINITE && is_loss(newThreadScore); - // Note that we make sure not to pick a thread with truncated-PV for better viewer experience. 
+ // We make sure not to pick a thread with truncated principal variation const bool betterVotingValue = - thread_voting_value(th) * int(newThreadPV.size() > 2) + thread_voting_value(th.get()) * int(newThreadPV.size() > 2) > thread_voting_value(bestThread) * int(bestThreadPV.size() > 2); if (bestThreadInProvenWin) { // Make sure we pick the shortest mate / TB conversion if (newThreadScore > bestThreadScore) - bestThread = th; + bestThread = th.get(); } else if (bestThreadInProvenLoss) { // Make sure we pick the shortest mated / TB conversion if (newThreadInProvenLoss && newThreadScore < bestThreadScore) - bestThread = th; + bestThread = th.get(); } else if (newThreadInProvenWin || newThreadInProvenLoss - || (newThreadScore > VALUE_TB_LOSS_IN_MAX_PLY + || (!is_loss(newThreadScore) && (newThreadMoveVote > bestThreadMoveVote || (newThreadMoveVote == bestThreadMoveVote && betterVotingValue)))) - bestThread = th; + bestThread = th.get(); } return bestThread; } -// Start non-main threads -// Will be invoked by main thread after it has started searching +// Start non-main threads. +// Will be invoked by main thread after it has started searching. void ThreadPool::start_searching() { - for (Thread* th : threads) + for (auto&& th : threads) if (th != threads.front()) th->start_searching(); } // Wait for non-main threads - void ThreadPool::wait_for_search_finished() const { - for (Thread* th : threads) + for (auto&& th : threads) if (th != threads.front()) th->wait_for_search_finished(); } +std::vector ThreadPool::get_bound_thread_count_by_numa_node() const { + std::vector counts; + + if (!boundThreadToNumaNode.empty()) + { + NumaIndex highestNumaNode = 0; + for (NumaIndex n : boundThreadToNumaNode) + if (n > highestNumaNode) + highestNumaNode = n; + + counts.resize(highestNumaNode + 1, 0); + + for (NumaIndex n : boundThreadToNumaNode) + counts[n] += 1; + } + + return counts; +} + +void ThreadPool::ensure_network_replicated() { + for (auto&& th : threads) + th->ensure_network_replicated(); +} + } // namespace Stockfish diff --git a/src/thread.h b/src/thread.h index 223652ae..43e2e142 100644 --- a/src/thread.h +++ b/src/thread.h @@ -23,10 +23,12 @@ #include #include #include +#include #include #include #include +#include "numa.h" #include "position.h" #include "search.h" #include "thread_win32_osx.h" @@ -37,6 +39,32 @@ namespace Stockfish { class OptionsMap; using Value = int; +// Sometimes we don't want to actually bind the threads, but the recipient still +// needs to think it runs on *some* NUMA node, such that it can access structures +// that rely on NUMA node knowledge. This class encapsulates this optional process +// such that the recipient does not need to know whether the binding happened or not. +class OptionalThreadToNumaNodeBinder { + public: + OptionalThreadToNumaNodeBinder(NumaIndex n) : + numaConfig(nullptr), + numaId(n) {} + + OptionalThreadToNumaNodeBinder(const NumaConfig& cfg, NumaIndex n) : + numaConfig(&cfg), + numaId(n) {} + + NumaReplicatedAccessToken operator()() const { + if (numaConfig != nullptr) + return numaConfig->bind_current_thread_to_numa_node(numaId); + else + return NumaReplicatedAccessToken(numaId); + } + + private: + const NumaConfig* numaConfig; + NumaIndex numaId; +}; + // Abstraction of a thread. It contains a pointer to the worker and a native thread. // After construction, the native thread is started with idle_loop() // waiting for a signal to start searching. 
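// [Editor's illustration, not part of the patch] The OptionalThreadToNumaNodeBinder added
// above is a small callable: constructed with a NumaConfig it really binds the calling
// thread to the requested NUMA node, constructed without one it only fabricates an access
// token for that node. A minimal usage sketch, assuming the NumaConfig, NumaIndex and
// NumaReplicatedAccessToken declarations from numa.h; allocate_worker_state() is a
// hypothetical stand-in for the per-thread allocations done in Thread's constructor.

void thread_entry_sketch(const NumaConfig& cfg, NumaIndex node, bool doBind) {
    OptionalThreadToNumaNodeBinder binder =
      doBind ? OptionalThreadToNumaNodeBinder(cfg, node)  // really binds the thread
             : OptionalThreadToNumaNodeBinder(node);      // no binding, token only

    // Safe to invoke either way; the caller does not need to know whether binding happened.
    NumaReplicatedAccessToken token = binder();

    // Allocate per-thread state only after (possibly) binding, so that first-touch
    // placement puts its memory on the intended NUMA node.
    allocate_worker_state(token);
}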
@@ -44,22 +72,37 @@ using Value = int; // the search is finished, it goes back to idle_loop() waiting for a new signal. class Thread { public: - Thread(Search::SharedState&, std::unique_ptr, size_t); + Thread(Search::SharedState&, + std::unique_ptr, + size_t, + OptionalThreadToNumaNodeBinder); virtual ~Thread(); - void idle_loop(); - void start_searching(); + void idle_loop(); + void start_searching(); + void clear_worker(); + void run_custom_job(std::function f); + + void ensure_network_replicated(); + + // Thread has been slightly altered to allow running custom jobs, so + // this name is no longer correct. However, this class (and ThreadPool) + // require further work to make them properly generic while maintaining + // appropriate specificity regarding search, from the point of view of an + // outside user, so renaming of this function is left for whenever that happens. void wait_for_search_finished(); size_t id() const { return idx; } std::unique_ptr worker; + std::function jobFunc; private: - std::mutex mutex; - std::condition_variable cv; - size_t idx, nthreads; - bool exit = false, searching = true; // Set before starting std::thread - NativeThread stdThread; + std::mutex mutex; + std::condition_variable cv; + size_t idx, nthreads; + bool exit = false, searching = true; // Set before starting std::thread + NativeThread stdThread; + NumaReplicatedAccessToken numaAccessToken; }; @@ -67,31 +110,46 @@ class Thread { // parking and, most importantly, launching a thread. All the access to threads // is done through this class. class ThreadPool { - public: + ThreadPool() {} + ~ThreadPool() { // destroy any existing thread(s) if (threads.size() > 0) { main_thread()->wait_for_search_finished(); - while (threads.size() > 0) - delete threads.back(), threads.pop_back(); + threads.clear(); } } - void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType); - void clear(); - void set(Search::SharedState, const Search::SearchManager::UpdateContext&); + ThreadPool(const ThreadPool&) = delete; + ThreadPool(ThreadPool&&) = delete; + + ThreadPool& operator=(const ThreadPool&) = delete; + ThreadPool& operator=(ThreadPool&&) = delete; + + void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType); + void run_on_thread(size_t threadId, std::function f); + void wait_on_thread(size_t threadId); + size_t num_threads() const; + void clear(); + void set(const NumaConfig& numaConfig, + Search::SharedState, + const Search::SearchManager::UpdateContext&); Search::SearchManager* main_manager(); - Thread* main_thread() const { return threads.front(); } + Thread* main_thread() const { return threads.front().get(); } uint64_t nodes_searched() const; uint64_t tb_hits() const; Thread* get_best_thread() const; void start_searching(); void wait_for_search_finished() const; + std::vector get_bound_thread_count_by_numa_node() const; + + void ensure_network_replicated(); + std::atomic_bool stop, abortedSearch, increaseDepth; auto cbegin() const noexcept { return threads.cbegin(); } @@ -102,13 +160,14 @@ class ThreadPool { auto empty() const noexcept { return threads.empty(); } private: - StateListPtr setupStates; - std::vector threads; + StateListPtr setupStates; + std::vector> threads; + std::vector boundThreadToNumaNode; uint64_t accumulate(std::atomic Search::Worker::*member) const { uint64_t sum = 0; - for (Thread* th : threads) + for (auto&& th : threads) sum += (th->worker.get()->*member).load(std::memory_order_relaxed); return sum; } diff --git a/src/timeman.cpp 
b/src/timeman.cpp index c651745f..9de70fdc 100644 --- a/src/timeman.cpp +++ b/src/timeman.cpp @@ -32,12 +32,12 @@ TimePoint TimeManagement::optimum() const { return optimumTime; } TimePoint TimeManagement::maximum() const { return maximumTime; } void TimeManagement::clear() { - availableNodes = 0; // When in 'nodes as time' mode + availableNodes = -1; // When in 'nodes as time' mode } void TimeManagement::advance_nodes_time(std::int64_t nodes) { assert(useNodesTime); - availableNodes += nodes; + availableNodes = std::max(int64_t(0), availableNodes - nodes); } // Called at the beginning of the search and calculates @@ -47,15 +47,19 @@ void TimeManagement::advance_nodes_time(std::int64_t nodes) { void TimeManagement::init(Search::LimitsType& limits, Color us, int ply, - const OptionsMap& options) { - // If we have no time, no need to initialize TM, except for the start time, - // which is used by movetime. - startTime = limits.startTime; + const OptionsMap& options, + double& originalTimeAdjust) { + TimePoint npmsec = TimePoint(options["nodestime"]); + + // If we have no time, we don't need to fully initialize TM. + // startTime is used by movetime and useNodesTime is used in elapsed calls. + startTime = limits.startTime; + useNodesTime = npmsec != 0; + if (limits.time[us] == 0) return; TimePoint moveOverhead = TimePoint(options["Move Overhead"]); - TimePoint npmsec = TimePoint(options["nodestime"]); // optScale is a percentage of available time to use for the current move. // maxScale is a multiplier applied to optimumTime. @@ -65,26 +69,31 @@ void TimeManagement::init(Search::LimitsType& limits, // to nodes, and use resulting values in time management formulas. // WARNING: to avoid time losses, the given npmsec (nodes per millisecond) // must be much lower than the real engine speed. - if (npmsec) + if (useNodesTime) { - useNodesTime = true; - - if (!availableNodes) // Only once at game start + if (availableNodes == -1) // Only once at game start availableNodes = npmsec * limits.time[us]; // Time is in msec // Convert from milliseconds to nodes limits.time[us] = TimePoint(availableNodes); limits.inc[us] *= npmsec; limits.npmsec = npmsec; + moveOverhead *= npmsec; } + // These numbers are used where multiplications, divisions or comparisons + // with constants are involved. + const int64_t scaleFactor = useNodesTime ? npmsec : 1; + const TimePoint scaledTime = limits.time[us] / scaleFactor; + const TimePoint scaledInc = limits.inc[us] / scaleFactor; + // Maximum move horizon of 50 moves int mtg = limits.movestogo ? std::min(limits.movestogo, 50) : 50; - // if less than one second, gradually reduce mtg - if (limits.time[us] < 1000 && (double(mtg) / limits.time[us] > 0.05)) + // If less than one second, gradually reduce mtg + if (scaledTime < 1000 && double(mtg) / scaledInc > 0.05) { - mtg = limits.time[us] * 0.05; + mtg = scaledTime * 0.05; } // Make sure timeLeft is > 0 since we may use it as a divisor @@ -96,24 +105,26 @@ void TimeManagement::init(Search::LimitsType& limits, // game time for the current move, so also cap to a percentage of available game time. if (limits.movestogo == 0) { - // Use extra time with larger increments - double optExtra = limits.inc[us] < 500 ? 1.0 : 1.13; + // Extra time according to timeLeft + if (originalTimeAdjust < 0) + originalTimeAdjust = 0.3285 * std::log10(timeLeft) - 0.4830; // Calculate time constants based on current time left. 
- double optConstant = - std::min(0.00308 + 0.000319 * std::log10(limits.time[us] / 1000.0), 0.00506); - double maxConstant = std::max(3.39 + 3.01 * std::log10(limits.time[us] / 1000.0), 2.93); + double logTimeInSec = std::log10(scaledTime / 1000.0); + double optConstant = std::min(0.00308 + 0.000319 * logTimeInSec, 0.00506); + double maxConstant = std::max(3.39 + 3.01 * logTimeInSec, 2.93); optScale = std::min(0.0122 + std::pow(ply + 2.95, 0.462) * optConstant, - 0.213 * limits.time[us] / double(timeLeft)) - * optExtra; + 0.213 * limits.time[us] / timeLeft) + * originalTimeAdjust; + maxScale = std::min(6.64, maxConstant + ply / 12.0); } // x moves in y seconds (+ z increment) else { - optScale = std::min((0.88 + ply / 116.4) / mtg, 0.88 * limits.time[us] / double(timeLeft)); + optScale = std::min((0.88 + ply / 116.4) / mtg, 0.88 * limits.time[us] / timeLeft); maxScale = std::min(6.3, 1.5 + 0.11 * mtg); } diff --git a/src/timeman.h b/src/timeman.h index 35c3cfc0..10207a8a 100644 --- a/src/timeman.h +++ b/src/timeman.h @@ -36,14 +36,19 @@ struct LimitsType; // the maximum available time, the game move number, and other parameters. class TimeManagement { public: - void init(Search::LimitsType& limits, Color us, int ply, const OptionsMap& options); + void init(Search::LimitsType& limits, + Color us, + int ply, + const OptionsMap& options, + double& originalTimeAdjust); TimePoint optimum() const; TimePoint maximum() const; template TimePoint elapsed(FUNC nodes) const { - return useNodesTime ? TimePoint(nodes()) : now() - startTime; + return useNodesTime ? TimePoint(nodes()) : elapsed_time(); } + TimePoint elapsed_time() const { return now() - startTime; }; void clear(); void advance_nodes_time(std::int64_t nodes); @@ -53,7 +58,7 @@ class TimeManagement { TimePoint optimumTime; TimePoint maximumTime; - std::int64_t availableNodes = 0; // When in 'nodes as time' mode + std::int64_t availableNodes = -1; // When in 'nodes as time' mode bool useNodesTime = false; // True if we are in 'nodes as time' mode }; diff --git a/src/tt.cpp b/src/tt.cpp index 4885a781..75689562 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -23,31 +23,89 @@ #include #include #include -#include -#include +#include "memory.h" #include "misc.h" +#include "syzygy/tbprobe.h" +#include "thread.h" namespace Stockfish { + +// TTEntry struct is the 10 bytes transposition table entry, defined as below: +// +// key 16 bit +// depth 8 bit +// generation 5 bit +// pv node 1 bit +// bound type 2 bit +// move 16 bit +// value 16 bit +// evaluation 16 bit +// +// These fields are in the same order as accessed by TT::probe(), since memory is fastest sequentially. +// Equally, the store order in save() matches this order. + +struct TTEntry { + + // Convert internal bitfields to external types + TTData read() const { + return TTData{Move(move16), Value(value16), + Value(eval16), Depth(depth8 + DEPTH_ENTRY_OFFSET), + Bound(genBound8 & 0x3), bool(genBound8 & 0x4)}; + } + + bool is_occupied() const; + void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + // The returned age is a multiple of TranspositionTable::GENERATION_DELTA + uint8_t relative_age(const uint8_t generation8) const; + + private: + friend class TranspositionTable; + + uint16_t key16; + uint8_t depth8; + uint8_t genBound8; + Move move16; + int16_t value16; + int16_t eval16; +}; + +// `genBound8` is where most of the details are. We use the following constants to manipulate 5 leading generation bits +// and 3 trailing miscellaneous bits. 
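// [Editor's illustration, not part of the patch] The bit layout that the GENERATION_*
// constants defined just below implement: the top five bits of genBound8 hold the table
// generation (already pre-shifted, i.e. advanced in steps of GENERATION_DELTA), bit 2 the
// pv flag, and bits 0-1 the bound type, matching TTEntry::save() and TTEntry::read() above.
// A self-contained round trip with hypothetical helper names:

#include <cstdint>

constexpr unsigned kGenerationBits = 3;  // mirrors GENERATION_BITS below; low bits hold pv + bound

// 'generation' is assumed to be pre-shifted (a multiple of 1 << kGenerationBits),
// exactly like the generation8 value that TTEntry::save() receives.
inline uint8_t pack_gen_bound(uint8_t generation, bool pv, unsigned bound) {
    return uint8_t(generation | (unsigned(pv) << 2) | bound);
}

inline unsigned unpack_bound(uint8_t genBound8) { return genBound8 & 0x3u; }
inline bool     unpack_pv(uint8_t genBound8) { return (genBound8 & 0x4u) != 0; }
inline uint8_t  unpack_generation(uint8_t genBound8) {
    return uint8_t(genBound8 & ((0xFFu << kGenerationBits) & 0xFFu));
}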
+ +// These bits are reserved for other things. +static constexpr unsigned GENERATION_BITS = 3; +// increment for generation field +static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); +// cycle length +static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; +// mask to pull out generation number +static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; + +// DEPTH_ENTRY_OFFSET exists because 1) we use `bool(depth8)` as the occupancy check, but +// 2) we need to store negative depths for QS. (`depth8` is the only field with "spare bits": +// we sacrifice the ability to store depths greater than 1<<8 less the offset, as asserted in `save`.) +bool TTEntry::is_occupied() const { return bool(depth8); } + // Populates the TTEntry with a new node's data, possibly // overwriting an old position. The update is not atomic and can be racy. void TTEntry::save( Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { - // Preserve any existing move for the same position + // Preserve the old ttmove if we don't have a new one if (m || uint16_t(k) != key16) move16 = m; // Overwrite less valuable entries (cheapest checks first) - if (b == BOUND_EXACT || uint16_t(k) != key16 || d - DEPTH_OFFSET + 2 * pv > depth8 - 4 + if (b == BOUND_EXACT || uint16_t(k) != key16 || d - DEPTH_ENTRY_OFFSET + 2 * pv > depth8 - 4 || relative_age(generation8)) { - assert(d > DEPTH_OFFSET); - assert(d < 256 + DEPTH_OFFSET); + assert(d > DEPTH_ENTRY_OFFSET); + assert(d < 256 + DEPTH_ENTRY_OFFSET); key16 = uint16_t(k); - depth8 = uint8_t(d - DEPTH_OFFSET); + depth8 = uint8_t(d - DEPTH_ENTRY_OFFSET); genBound8 = uint8_t(generation8 | uint8_t(pv) << 2 | b); value16 = int16_t(v); eval16 = int16_t(ev); @@ -61,70 +119,117 @@ uint8_t TTEntry::relative_age(const uint8_t generation8) const { // is needed to keep the unrelated lowest n bits from affecting // the result) to calculate the entry age correctly even after // generation8 overflows into the next cycle. - - return (TranspositionTable::GENERATION_CYCLE + generation8 - genBound8) - & TranspositionTable::GENERATION_MASK; + return (GENERATION_CYCLE + generation8 - genBound8) & GENERATION_MASK; } +// TTWriter is but a very thin wrapper around the pointer +TTWriter::TTWriter(TTEntry* tte) : + entry(tte) {} + +void TTWriter::write( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + entry->save(k, v, pv, b, d, m, ev, generation8); +} + + +// A TranspositionTable is an array of Cluster, of size clusterCount. Each cluster consists of ClusterSize number +// of TTEntry. Each non-empty TTEntry contains information on exactly one position. The size of a Cluster should +// divide the size of a cache line for best performance, as the cacheline is prefetched when possible. + +static constexpr int ClusterSize = 3; + +struct Cluster { + TTEntry entry[ClusterSize]; + char padding[2]; // Pad to 32 bytes +}; + +static_assert(sizeof(Cluster) == 32, "Suboptimal Cluster size"); + + // Sets the size of the transposition table, // measured in megabytes. Transposition table consists // of clusters and each cluster consists of ClusterSize number of TTEntry. 
-void TranspositionTable::resize(size_t mbSize, int threadCount) { +void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) { aligned_large_pages_free(table); clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); table = static_cast(aligned_large_pages_alloc(clusterCount * sizeof(Cluster))); + if (!table) { std::cerr << "Failed to allocate " << mbSize << "MB for transposition table." << std::endl; exit(EXIT_FAILURE); } - clear(threadCount); + clear(threads); } // Initializes the entire transposition table to zero, // in a multi-threaded way. -void TranspositionTable::clear(size_t threadCount) { - std::vector threads; +void TranspositionTable::clear(ThreadPool& threads) { + generation8 = 0; + const size_t threadCount = threads.num_threads(); - for (size_t idx = 0; idx < size_t(threadCount); ++idx) + for (size_t i = 0; i < threadCount; ++i) { - threads.emplace_back([this, idx, threadCount]() { - // Thread binding gives faster search on systems with a first-touch policy - if (threadCount > 8) - WinProcGroup::bind_this_thread(idx); - + threads.run_on_thread(i, [this, i, threadCount]() { // Each thread will zero its part of the hash table - const size_t stride = size_t(clusterCount / threadCount), start = size_t(stride * idx), - len = idx != size_t(threadCount) - 1 ? stride : clusterCount - start; + const size_t stride = clusterCount / threadCount; + const size_t start = stride * i; + const size_t len = i + 1 != threadCount ? stride : clusterCount - start; std::memset(&table[start], 0, len * sizeof(Cluster)); }); } - for (std::thread& th : threads) - th.join(); + for (size_t i = 0; i < threadCount; ++i) + threads.wait_on_thread(i); } +// Returns an approximation of the hashtable +// occupation during a search. The hash is x permill full, as per UCI protocol. +// Only counts entries which match the current generation. +int TranspositionTable::hashfull(int maxAge) const { + int maxAgeInternal = maxAge << GENERATION_BITS; + int cnt = 0; + for (int i = 0; i < 1000; ++i) + for (int j = 0; j < ClusterSize; ++j) + cnt += table[i].entry[j].is_occupied() + && table[i].entry[j].relative_age(generation8) <= maxAgeInternal; + + return cnt / ClusterSize; +} + + +void TranspositionTable::new_search() { + // increment by delta to keep lower bits as is + generation8 += GENERATION_DELTA; +} + + +uint8_t TranspositionTable::generation() const { return generation8; } + + // Looks up the current position in the transposition -// table. It returns true and a pointer to the TTEntry if the position is found. +// table. It returns true if the position is found. // Otherwise, it returns false and a pointer to an empty or least valuable TTEntry // to be replaced later. The replace value of an entry is calculated as its depth // minus 8 times its relative age. TTEntry t1 is considered more valuable than // TTEntry t2 if its replace value is greater than that of t2. -TTEntry* TranspositionTable::probe(const Key key, bool& found) const { +std::tuple TranspositionTable::probe(const Key key) const { TTEntry* const tte = first_entry(key); const uint16_t key16 = uint16_t(key); // Use the low 16 bits as key inside the cluster for (int i = 0; i < ClusterSize; ++i) - if (tte[i].key16 == key16 || !tte[i].depth8) - return found = bool(tte[i].depth8), &tte[i]; + if (tte[i].key16 == key16) + // This gap is the main place for read races. + // After `read()` completes that copy is final, but may be self-inconsistent. 
+ return {tte[i].is_occupied(), tte[i].read(), TTWriter(&tte[i])}; // Find an entry to be replaced according to the replacement strategy TTEntry* replace = tte; @@ -133,22 +238,12 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const { > tte[i].depth8 - tte[i].relative_age(generation8) * 2) replace = &tte[i]; - return found = false, replace; + return {false, TTData(), TTWriter(replace)}; } -// Returns an approximation of the hashtable -// occupation during a search. The hash is x permill full, as per UCI protocol. -// Only counts entries which match the current generation. -int TranspositionTable::hashfull() const { - - int cnt = 0; - for (int i = 0; i < 1000; ++i) - for (int j = 0; j < ClusterSize; ++j) - cnt += table[i].entry[j].depth8 - && (table[i].entry[j].genBound8 & GENERATION_MASK) == generation8; - - return cnt / ClusterSize; +TTEntry* TranspositionTable::first_entry(const Key key) const { + return &table[mul_hi64(key, clusterCount)].entry[0]; } } // namespace Stockfish diff --git a/src/tt.h b/src/tt.h index 554a81a5..e7bb5c45 100644 --- a/src/tt.h +++ b/src/tt.h @@ -21,99 +21,76 @@ #include #include +#include -#include "misc.h" +#include "memory.h" #include "types.h" namespace Stockfish { -// TTEntry struct is the 10 bytes transposition table entry, defined as below: +class ThreadPool; +struct TTEntry; +struct Cluster; + +// There is only one global hash table for the engine and all its threads. For chess in particular, we even allow racy +// updates between threads to and from the TT, as taking the time to synchronize access would cost thinking time and +// thus elo. As a hash table, collisions are possible and may cause chess playing issues (bizarre blunders, faulty mate +// reports, etc). Fixing these also loses elo; however such risk decreases quickly with larger TT size. // -// key 16 bit -// depth 8 bit -// generation 5 bit -// pv node 1 bit -// bound type 2 bit -// move 16 bit -// value 16 bit -// eval value 16 bit -struct TTEntry { +// `probe` is the primary method: given a board position, we lookup its entry in the table, and return a tuple of: +// 1) whether the entry already has this position +// 2) a copy of the prior data (if any) (may be inconsistent due to read races) +// 3) a writer object to this entry +// The copied data and the writer are separated to maintain clear boundaries between local vs global objects. - Move move() const { return Move(move16); } - Value value() const { return Value(value16); } - Value eval() const { return Value(eval16); } - Depth depth() const { return Depth(depth8 + DEPTH_OFFSET); } - bool is_pv() const { return bool(genBound8 & 0x4); } - Bound bound() const { return Bound(genBound8 & 0x3); } - void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); - // The returned age is a multiple of TranspositionTable::GENERATION_DELTA - uint8_t relative_age(const uint8_t generation8) const; - private: - friend class TranspositionTable; - - uint16_t key16; - uint8_t depth8; - uint8_t genBound8; - Move move16; - int16_t value16; - int16_t eval16; +// A copy of the data already in the entry (possibly collided). `probe` may be racy, resulting in inconsistent data. +struct TTData { + Move move; + Value value, eval; + Depth depth; + Bound bound; + bool is_pv; +}; + + +// This is used to make racy writes to the global TT. 
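// [Editor's usage sketch, not part of the patch] How a caller is expected to consume the
// tuple returned by the new TranspositionTable::probe() (TTWriter is declared just below).
// Assumes the usual Stockfish types from types.h; the local variables merely stand in for
// whatever the real search code computes at this node.

void probe_and_store_sketch(TranspositionTable& tt, Key posKey, Depth depth) {
    auto [ttHit, ttData, ttWriter] = tt.probe(posKey);

    if (ttHit && ttData.depth >= depth)
    {
        // ttData is a plain copy; because of the racy reads described above it may be
        // self-inconsistent, so the caller must tolerate nonsense values here.
    }

    // ... search the node, producing a value, best move, bound type, etc. ...
    Value bestValue  = VALUE_DRAW;
    Move  bestMove   = Move::none();
    Bound bound      = BOUND_EXACT;
    bool  ttPv       = false;
    Value staticEval = VALUE_DRAW;

    ttWriter.write(posKey, bestValue, ttPv, bound, depth, bestMove, staticEval,
                   tt.generation());
}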
+struct TTWriter { + public: + void write(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + + private: + friend class TranspositionTable; + TTEntry* entry; + TTWriter(TTEntry* tte); }; -// A TranspositionTable is an array of Cluster, of size clusterCount. Each -// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry -// contains information on exactly one position. The size of a Cluster should -// divide the size of a cache line for best performance, as the cacheline is -// prefetched when possible. class TranspositionTable { - static constexpr int ClusterSize = 3; - - struct Cluster { - TTEntry entry[ClusterSize]; - char padding[2]; // Pad to 32 bytes - }; - - static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size"); - - // Constants used to refresh the hash table periodically - - // We have 8 bits available where the lowest 3 bits are - // reserved for other things. - static constexpr unsigned GENERATION_BITS = 3; - // increment for generation field - static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); - // cycle length - static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; - // mask to pull out generation number - static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; - public: ~TranspositionTable() { aligned_large_pages_free(table); } - void new_search() { - // increment by delta to keep lower bits as is - generation8 += GENERATION_DELTA; - } + void resize(size_t mbSize, ThreadPool& threads); // Set TT size + void clear(ThreadPool& threads); // Re-initialize memory, multithreaded + int hashfull(int maxAge = 0) + const; // Approximate what fraction of entries (permille) have been written to during this root search - TTEntry* probe(const Key key, bool& found) const; - int hashfull() const; - void resize(size_t mbSize, int threadCount); - void clear(size_t threadCount); - - TTEntry* first_entry(const Key key) const { - return &table[mul_hi64(key, clusterCount)].entry[0]; - } - - uint8_t generation() const { return generation8; } + void + new_search(); // This must be called at the beginning of each root search to track entry aging + uint8_t generation() const; // The current age, used when writing new data to the TT + std::tuple + probe(const Key key) const; // The main method, whose retvals separate local vs global objects + TTEntry* first_entry(const Key key) + const; // This is the hash function; its only external use is memory prefetching. private: friend struct TTEntry; size_t clusterCount; - Cluster* table = nullptr; - uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 + Cluster* table = nullptr; + + uint8_t generation8 = 0; // Size must be not bigger than TTEntry::genBound8 }; } // namespace Stockfish diff --git a/src/tune.cpp b/src/tune.cpp index 3e5ebe5e..dfcd3468 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,19 +34,19 @@ namespace Stockfish { bool Tune::update_on_last; const Option* LastOption = nullptr; OptionsMap* Tune::options; - - namespace { std::map TuneResults; -void on_tune(const Option& o) { +std::optional on_tune(const Option& o) { if (!Tune::update_on_last || LastOption == &o) Tune::read_options(); + + return std::nullopt; +} } - -void make_option(OptionsMap* options, const string& n, int v, const SetRange& r) { +void Tune::make_option(OptionsMap* opts, const string& n, int v, const SetRange& r) { // Do not generate option when there is nothing to tune (ie. 
min = max) if (r(v).first == r(v).second) @@ -54,15 +55,17 @@ void make_option(OptionsMap* options, const string& n, int v, const SetRange& r) if (TuneResults.count(n)) v = TuneResults[n]; - (*options)[n] << Option(v, r(v).first, r(v).second, on_tune); - LastOption = &((*options)[n]); + (*opts)[n] << Option(v, r(v).first, r(v).second, on_tune); + LastOption = &((*opts)[n]); // Print formatted parameters, ready to be copy-pasted in Fishtest - std::cout << n << "," << v << "," << r(v).first << "," << r(v).second << "," - << (r(v).second - r(v).first) / 20.0 << "," + std::cout << n << "," // + << v << "," // + << r(v).first << "," // + << r(v).second << "," // + << (r(v).second - r(v).first) / 20.0 << "," // << "0.0020" << std::endl; } -} string Tune::next(string& names, bool pop) { @@ -118,7 +121,6 @@ void Tune::Entry::read_option() { namespace Stockfish { -void Tune::read_results() { /* ...insert your values here... */ -} +void Tune::read_results() { /* ...insert your values here... */ } } // namespace Stockfish diff --git a/src/tune.h b/src/tune.h index 079614db..ed4738cd 100644 --- a/src/tune.h +++ b/src/tune.h @@ -145,6 +145,8 @@ class Tune { return add(value, (next(names), std::move(names)), args...); } + static void make_option(OptionsMap* options, const std::string& n, int v, const SetRange& r); + std::vector> list; public: diff --git a/src/types.h b/src/types.h index 7e523e25..b0bb1c07 100644 --- a/src/types.h +++ b/src/types.h @@ -137,9 +137,9 @@ enum Bound { BOUND_EXACT = BOUND_UPPER | BOUND_LOWER }; -// Value is used as an alias for int16_t, this is done to differentiate between -// a search value and any other integer value. The values used in search are always -// supposed to be in the range (-VALUE_NONE, VALUE_NONE] and should not exceed this range. +// Value is used as an alias for int, this is done to differentiate between a search +// value and any other integer value. The values used in search are always supposed +// to be in the range (-VALUE_NONE, VALUE_NONE] and should not exceed this range. using Value = int; constexpr Value VALUE_ZERO = 0; @@ -155,6 +155,21 @@ constexpr Value VALUE_TB = VALUE_MATE_IN_MAX_PLY - 1; constexpr Value VALUE_TB_WIN_IN_MAX_PLY = VALUE_TB - MAX_PLY; constexpr Value VALUE_TB_LOSS_IN_MAX_PLY = -VALUE_TB_WIN_IN_MAX_PLY; + +constexpr bool is_valid(Value value) { return value != VALUE_NONE; } + +constexpr bool is_win(Value value) { + assert(is_valid(value)); + return value >= VALUE_TB_WIN_IN_MAX_PLY; +} + +constexpr bool is_loss(Value value) { + assert(is_valid(value)); + return value <= VALUE_TB_LOSS_IN_MAX_PLY; +} + +constexpr bool is_decisive(Value value) { return is_win(value) || is_loss(value); } + // In the code, we make the assumption that these values // are such that non_pawn_material() can be used to uniquely // identify the material on the board. @@ -187,12 +202,21 @@ constexpr Value PieceValue[PIECE_NB] = { using Depth = int; enum : int { - DEPTH_QS_CHECKS = 0, - DEPTH_QS_NO_CHECKS = -1, - - DEPTH_NONE = -6, - - DEPTH_OFFSET = -7 // value used only for TT entry occupancy check + // The following DEPTH_ constants are used for transposition table entries + // and quiescence search move generation stages. In regular search, the + // depth stored in the transposition table is literal: the search depth + // (effort) used to make the corresponding transposition table value. 
In + // quiescence search, however, the transposition table entries only store + // the current quiescence move generation stage (which should thus compare + // lower than any regular search depth). + DEPTH_QS = 0, + // For transposition table entries where no searching at all was done + // (whether regular or qsearch) we use DEPTH_UNSEARCHED, which should thus + // compare lower than any quiescence or regular depth. DEPTH_ENTRY_OFFSET + // is used only for the transposition table entry occupancy check (see tt.cpp), + // and should thus be lower than DEPTH_UNSEARCHED. + DEPTH_UNSEARCHED = -2, + DEPTH_ENTRY_OFFSET = -3 }; // clang-format off @@ -357,9 +381,10 @@ enum MoveType { // bit 14-15: special move flag: promotion (1), en passant (2), castling (3) // NOTE: en passant bit is set only when a pawn can be captured // -// Special cases are Move::none() and Move::null(). We can sneak these in because in -// any normal move destination square is always different from origin square -// while Move::none() and Move::null() have the same origin and destination square. +// Special cases are Move::none() and Move::null(). We can sneak these in because +// in any normal move the destination square and origin square are always different, +// but Move::none() and Move::null() have the same origin and destination square. + class Move { public: Move() = default; diff --git a/src/uci.cpp b/src/uci.cpp index cb686a02..8388cad8 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -30,20 +31,19 @@ #include "benchmark.h" #include "engine.h" -#include "evaluate.h" +#include "memory.h" #include "movegen.h" #include "position.h" #include "score.h" #include "search.h" -#include "syzygy/tbprobe.h" #include "types.h" #include "ucioption.h" namespace Stockfish { -constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; -constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048; +constexpr auto BenchmarkCommand = "speedtest"; +constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; template struct overload: Ts... { using Ts::operator()...; @@ -52,50 +52,40 @@ struct overload: Ts... { template overload(Ts...) 
-> overload; +void UCIEngine::print_info_string(std::string_view str) { + sync_cout_start(); + for (auto& line : split(str, "\n")) + { + if (!is_whitespace(line)) + { + std::cout << "info string " << line << '\n'; + } + } + sync_cout_end(); +} + UCIEngine::UCIEngine(int argc, char** argv) : engine(argv[0]), cli(argc, argv) { - auto& options = engine.get_options(); - - options["Debug Log File"] << Option("", [](const Option& o) { start_logger(o); }); - - options["Threads"] << Option(1, 1, 1024, [this](const Option&) { engine.resize_threads(); }); - - options["Hash"] << Option(16, 1, MaxHashMB, [this](const Option& o) { engine.set_tt_size(o); }); - - options["Clear Hash"] << Option([this](const Option&) { engine.search_clear(); }); - options["Ponder"] << Option(false); - options["MultiPV"] << Option(1, 1, MAX_MOVES); - options["Skill Level"] << Option(20, 0, 20); - options["Move Overhead"] << Option(10, 0, 5000); - options["nodestime"] << Option(0, 0, 10000); - options["UCI_Chess960"] << Option(false); - options["UCI_LimitStrength"] << Option(false); - options["UCI_Elo"] << Option(1320, 1320, 3190); - options["UCI_ShowWDL"] << Option(false); - options["SyzygyPath"] << Option("", [](const Option& o) { Tablebases::init(o); }); - options["SyzygyProbeDepth"] << Option(1, 1, 100); - options["Syzygy50MoveRule"] << Option(true); - options["SyzygyProbeLimit"] << Option(7, 0, 7); - options["EvalFile"] << Option(EvalFileDefaultNameBig, - [this](const Option& o) { engine.load_big_network(o); }); - options["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, - [this](const Option& o) { engine.load_small_network(o); }); + engine.get_options().add_info_listener([](const std::optional& str) { + if (str.has_value()) + print_info_string(*str); + }); + init_search_update_listeners(); +} +void UCIEngine::init_search_update_listeners() { engine.set_on_iter([](const auto& i) { on_iter(i); }); engine.set_on_update_no_moves([](const auto& i) { on_update_no_moves(i); }); - engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); }); + engine.set_on_update_full( + [this](const auto& i) { on_update_full(i, engine.get_options()["UCI_ShowWDL"]); }); engine.set_on_bestmove([](const auto& bm, const auto& p) { on_bestmove(bm, p); }); - - engine.load_networks(); - engine.resize_threads(); - engine.search_clear(); // After threads are up + engine.set_on_verify_networks([](const auto& s) { print_info_string(s); }); } void UCIEngine::loop() { - std::string token, cmd; for (int i = 1; i < cli.argc; ++i) @@ -123,13 +113,22 @@ void UCIEngine::loop() { engine.set_ponderhit(false); else if (token == "uci") + { sync_cout << "id name " << engine_info(true) << "\n" - << engine.get_options() << "\nuciok" << sync_endl; + << engine.get_options() << sync_endl; + + sync_cout << "uciok" << sync_endl; + } else if (token == "setoption") setoption(is); else if (token == "go") + { + // send info strings after the go command is sent for old GUIs and python-chess + print_info_string(engine.numa_config_information_as_string()); + print_info_string(engine.thread_allocation_information_as_string()); go(is); + } else if (token == "position") position(is); else if (token == "ucinewgame") @@ -143,6 +142,8 @@ void UCIEngine::loop() { engine.flip(); else if (token == "bench") bench(is); + else if (token == BenchmarkCommand) + benchmark(is); else if (token == "d") sync_cout << engine.visualize() << sync_endl; else if (token == "eval") @@ -258,7 +259,7 @@ void UCIEngine::bench(std::istream& args) { Search::LimitsType 
limits = parse_limits(is); if (limits.perft) - nodes = perft(limits); + nodesSearched = perft(limits); else { engine.go(limits); @@ -286,14 +287,178 @@ void UCIEngine::bench(std::istream& args) { dbg_print(); - std::cerr << "\n===========================" - << "\nTotal time (ms) : " << elapsed << "\nNodes searched : " << nodes + std::cerr << "\n===========================" // + << "\nTotal time (ms) : " << elapsed // + << "\nNodes searched : " << nodes // << "\nNodes/second : " << 1000 * nodes / elapsed << std::endl; // reset callback, to not capture a dangling reference to nodesSearched engine.set_on_update_full([&](const auto& i) { on_update_full(i, options["UCI_ShowWDL"]); }); } +void UCIEngine::benchmark(std::istream& args) { + // Probably not very important for a test this long, but include for completeness and sanity. + static constexpr int NUM_WARMUP_POSITIONS = 3; + + std::string token; + uint64_t nodes = 0, cnt = 1; + uint64_t nodesSearched = 0; + + engine.set_on_update_full([&](const Engine::InfoFull& i) { nodesSearched = i.nodes; }); + + engine.set_on_iter([](const auto&) {}); + engine.set_on_update_no_moves([](const auto&) {}); + engine.set_on_bestmove([](const auto&, const auto&) {}); + engine.set_on_verify_networks([](const auto&) {}); + + Benchmark::BenchmarkSetup setup = Benchmark::setup_benchmark(args); + + const int numGoCommands = count_if(setup.commands.begin(), setup.commands.end(), + [](const std::string& s) { return s.find("go ") == 0; }); + + TimePoint totalTime = 0; + + // Set options once at the start. + auto ss = std::istringstream("name Threads value " + std::to_string(setup.threads)); + setoption(ss); + ss = std::istringstream("name Hash value " + std::to_string(setup.ttSize)); + setoption(ss); + ss = std::istringstream("name UCI_Chess960 value false"); + setoption(ss); + + // Warmup + for (const auto& cmd : setup.commands) + { + std::istringstream is(cmd); + is >> std::skipws >> token; + + if (token == "go") + { + // One new line is produced by the search, so omit it here + std::cerr << "\rWarmup position " << cnt++ << '/' << NUM_WARMUP_POSITIONS; + + Search::LimitsType limits = parse_limits(is); + + TimePoint elapsed = now(); + + // Run with silenced network verification + engine.go(limits); + engine.wait_for_search_finished(); + + totalTime += now() - elapsed; + + nodes += nodesSearched; + nodesSearched = 0; + } + else if (token == "position") + position(is); + else if (token == "ucinewgame") + { + engine.search_clear(); // search_clear may take a while + } + + if (cnt > NUM_WARMUP_POSITIONS) + break; + } + + std::cerr << "\n"; + + cnt = 1; + nodes = 0; + + int numHashfullReadings = 0; + constexpr int hashfullAges[] = {0, 999}; // Only normal hashfull and touched hash. 
+ int totalHashfull[std::size(hashfullAges)] = {0}; + int maxHashfull[std::size(hashfullAges)] = {0}; + + auto updateHashfullReadings = [&]() { + numHashfullReadings += 1; + + for (int i = 0; i < static_cast(std::size(hashfullAges)); ++i) + { + const int hashfull = engine.get_hashfull(hashfullAges[i]); + maxHashfull[i] = std::max(maxHashfull[i], hashfull); + totalHashfull[i] += hashfull; + } + }; + + engine.search_clear(); // search_clear may take a while + + for (const auto& cmd : setup.commands) + { + std::istringstream is(cmd); + is >> std::skipws >> token; + + if (token == "go") + { + // One new line is produced by the search, so omit it here + std::cerr << "\rPosition " << cnt++ << '/' << numGoCommands; + + Search::LimitsType limits = parse_limits(is); + + TimePoint elapsed = now(); + + // Run with silenced network verification + engine.go(limits); + engine.wait_for_search_finished(); + + totalTime += now() - elapsed; + + updateHashfullReadings(); + + nodes += nodesSearched; + nodesSearched = 0; + } + else if (token == "position") + position(is); + else if (token == "ucinewgame") + { + engine.search_clear(); // search_clear may take a while + } + } + + totalTime = std::max(totalTime, 1); // Ensure positivity to avoid a 'divide by zero' + + dbg_print(); + + std::cerr << "\n"; + + static_assert( + std::size(hashfullAges) == 2 && hashfullAges[0] == 0 && hashfullAges[1] == 999, + "Hardcoded for display. Would complicate the code needlessly in the current state."); + + std::string threadBinding = engine.thread_binding_information_as_string(); + if (threadBinding.empty()) + threadBinding = "none"; + + // clang-format off + + std::cerr << "===========================" + << "\nVersion : " + << engine_version_info() + // "\nCompiled by : " + << compiler_info() + << "Large pages : " << (has_large_pages() ? "yes" : "no") + << "\nUser invocation : " << BenchmarkCommand << " " + << setup.originalInvocation << "\nFilled invocation : " << BenchmarkCommand + << " " << setup.filledInvocation + << "\nAvailable processors : " << engine.get_numa_config_as_string() + << "\nThread count : " << setup.threads + << "\nThread binding : " << threadBinding + << "\nTT size [MiB] : " << setup.ttSize + << "\nHash max, avg [per mille] : " + << "\n single search : " << maxHashfull[0] << ", " + << totalHashfull[0] / numHashfullReadings + << "\n single game : " << maxHashfull[1] << ", " + << totalHashfull[1] / numHashfullReadings + << "\nTotal nodes searched : " << nodes + << "\nTotal search time [s] : " << totalTime / 1000.0 + << "\nNodes/second : " << 1000 * nodes / totalTime << std::endl; + + // clang-format on + + init_search_update_listeners(); +} void UCIEngine::setoption(std::istringstream& is) { engine.wait_for_search_finished(); @@ -344,12 +509,12 @@ WinRateParams win_rate_params(const Position& pos) { int material = pos.count() + 3 * pos.count() + 3 * pos.count() + 5 * pos.count() + 9 * pos.count(); - // The fitted model only uses data for material counts in [10, 78], and is anchored at count 58. - double m = std::clamp(material, 10, 78) / 58.0; + // The fitted model only uses data for material counts in [17, 78], and is anchored at count 58. 
+ double m = std::clamp(material, 17, 78) / 58.0; // Return a = p_a(material) and b = p_b(material), see github.com/official-stockfish/WDL_model - constexpr double as[] = {-150.77043883, 394.96159472, -321.73403766, 406.15850091}; - constexpr double bs[] = {62.33245393, -91.02264855, 45.88486850, 51.63461272}; + constexpr double as[] = {-37.45051876, 121.19101539, -132.78783573, 420.70576692}; + constexpr double bs[] = {90.26261072, -137.26549898, 71.10130540, 51.35259597}; double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3]; double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; @@ -390,8 +555,8 @@ std::string UCIEngine::format_score(const Score& s) { // without treatment of mate and similar special scores. int UCIEngine::to_cp(Value v, const Position& pos) { - // In general, the score can be defined via the the WDL as - // (log(1/L - 1) - log(1/W - 1)) / ((log(1/L - 1) + log(1/W - 1)) + // In general, the score can be defined via the WDL as + // (log(1/L - 1) - log(1/W - 1)) / (log(1/L - 1) + log(1/W - 1)). // Based on our win_rate_model, this simply yields v / a. auto [a, b] = win_rate_params(pos); diff --git a/src/uci.h b/src/uci.h index 55d580f9..6adf74cb 100644 --- a/src/uci.h +++ b/src/uci.h @@ -19,10 +19,10 @@ #ifndef UCI_H_INCLUDED #define UCI_H_INCLUDED +#include #include #include #include -#include #include "engine.h" #include "misc.h" @@ -58,8 +58,11 @@ class UCIEngine { Engine engine; CommandLine cli; + static void print_info_string(std::string_view str); + void go(std::istringstream& is); void bench(std::istream& args); + void benchmark(std::istream& args); void position(std::istringstream& is); void setoption(std::istringstream& is); std::uint64_t perft(const Search::LimitsType&); @@ -68,6 +71,8 @@ class UCIEngine { static void on_update_full(const Engine::InfoFull& info, bool showWDL); static void on_iter(const Engine::InfoIter& info); static void on_bestmove(std::string_view bestmove, std::string_view ponder); + + void init_search_update_listeners(); }; } // namespace Stockfish diff --git a/src/ucioption.cpp b/src/ucioption.cpp index e1ffe546..455803cf 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -36,6 +36,8 @@ bool CaseInsensitiveLess::operator()(const std::string& s1, const std::string& s [](char c1, char c2) { return std::tolower(c1) < std::tolower(c2); }); } +void OptionsMap::add_info_listener(InfoListener&& message_func) { info = std::move(message_func); } + void OptionsMap::setoption(std::istringstream& is) { std::string token, name, value; @@ -57,13 +59,20 @@ void OptionsMap::setoption(std::istringstream& is) { Option OptionsMap::operator[](const std::string& name) const { auto it = options_map.find(name); - return it != options_map.end() ? it->second : Option(); + return it != options_map.end() ? 
it->second : Option(this);
 }

-Option& OptionsMap::operator[](const std::string& name) { return options_map[name]; }
+Option& OptionsMap::operator[](const std::string& name) {
+    if (!options_map.count(name))
+        options_map[name] = Option(this);
+    return options_map[name];
+}

 std::size_t OptionsMap::count(const std::string& name) const { return options_map.count(name); }

+Option::Option(const OptionsMap* map) :
+    parent(map) {}
+
 Option::Option(const char* v, OnChange f) :
     type("string"),
     min(0),
@@ -118,6 +127,8 @@ bool Option::operator==(const char* s) const {
     return !CaseInsensitiveLess()(currentValue, s) && !CaseInsensitiveLess()(s, currentValue);
 }

+bool Option::operator!=(const char* s) const { return !(*this == s); }
+

 // Inits options and assigns idx in the correct printing order
@@ -125,10 +136,12 @@ void Option::operator<<(const Option& o) {

     static size_t insert_order = 0;

-    *this = o;
-    idx = insert_order++;
-}
+    auto p = this->parent;
+    *this = o;
+    this->parent = p;
+    idx = insert_order++;
+}

 // Updates currentValue and triggers on_change() action. It's up to
 // the GUI to check for option's limits, but we could receive the new value
@@ -153,11 +166,18 @@ Option& Option::operator=(const std::string& v) {
         return *this;
     }

-    if (type != "button")
+    if (type == "string")
+        currentValue = v == "<empty>" ? "" : v;
+    else if (type != "button")
         currentValue = v;

     if (on_change)
-        on_change(*this);
+    {
+        const auto ret = on_change(*this);
+
+        if (ret && parent != nullptr && parent->info != nullptr)
+            parent->info(ret);
+    }

     return *this;
 }
@@ -170,10 +190,16 @@ std::ostream& operator<<(std::ostream& os, const OptionsMap& om) {
             const Option& o = it.second;
             os << "\noption name " << it.first << " type " << o.type;

-            if (o.type == "string" || o.type == "check" || o.type == "combo")
+            if (o.type == "check" || o.type == "combo")
                 os << " default " << o.defaultValue;

-            if (o.type == "spin")
+            else if (o.type == "string")
+            {
+                std::string defaultValue = o.defaultValue.empty() ? "<empty>" : o.defaultValue;
+                os << " default " << defaultValue;
+            }
+
+            else if (o.type == "spin")
                 os << " default " << int(stof(o.defaultValue)) << " min " << o.min << " max "
                    << o.max;
diff --git a/src/ucioption.h b/src/ucioption.h
index b575d164..a47cc98d 100644
--- a/src/ucioption.h
+++ b/src/ucioption.h
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <optional>
 #include

 namespace Stockfish {
@@ -31,31 +32,14 @@ struct CaseInsensitiveLess {
     bool operator()(const std::string&, const std::string&) const;
 };

-class Option;
-
-class OptionsMap {
-   public:
-    void setoption(std::istringstream&);
-
-    friend std::ostream& operator<<(std::ostream&, const OptionsMap&);
-
-    Option  operator[](const std::string&) const;
-    Option& operator[](const std::string&);
-
-    std::size_t count(const std::string&) const;
-
-   private:
-    // The options container is defined as a std::map
-    using OptionsStore = std::map<std::string, Option, CaseInsensitiveLess>;
-
-    OptionsStore options_map;
-};
+class OptionsMap;

 // The Option class implements each option as specified by the UCI protocol
 class Option {
    public:
-    using OnChange = std::function<void(const Option&)>;
+    using OnChange = std::function<std::optional<std::string>(const Option&)>;
+    Option(const OptionsMap*);
     Option(OnChange = nullptr);
     Option(bool v, OnChange = nullptr);
     Option(const char* v, OnChange = nullptr);
@@ -63,18 +47,57 @@
     Option(const char* v, const char* cur, OnChange = nullptr);
     Option& operator=(const std::string&);
-    void operator<<(const Option&);
     operator int() const;
     operator std::string() const;
     bool operator==(const char*) const;
+    bool operator!=(const char*) const;

     friend std::ostream& operator<<(std::ostream&, const OptionsMap&);

    private:
-    std::string defaultValue, currentValue, type;
-    int min, max;
-    size_t idx;
-    OnChange on_change;
+    friend class OptionsMap;
+    friend class Engine;
+    friend class Tune;
+
+    void operator<<(const Option&);
+
+    std::string defaultValue, currentValue, type;
+    int min, max;
+    size_t idx;
+    OnChange on_change;
+    const OptionsMap* parent = nullptr;
+};
+
+class OptionsMap {
+   public:
+    using InfoListener = std::function<void(std::optional<std::string>)>;
+
+    OptionsMap() = default;
+    OptionsMap(const OptionsMap&) = delete;
+    OptionsMap(OptionsMap&&) = delete;
+    OptionsMap& operator=(const OptionsMap&) = delete;
+    OptionsMap& operator=(OptionsMap&&) = delete;
+
+    void add_info_listener(InfoListener&&);
+
+    void setoption(std::istringstream&);
+
+    Option  operator[](const std::string&) const;
+    Option& operator[](const std::string&);
+
+    std::size_t count(const std::string&) const;
+
+   private:
+    friend class Engine;
+    friend class Option;
+
+    friend std::ostream& operator<<(std::ostream&, const OptionsMap&);
+
+    // The options container is defined as a std::map
+    using OptionsStore = std::map<std::string, Option, CaseInsensitiveLess>;
+
+    OptionsStore options_map;
+    InfoListener info;
 };

 }
diff --git a/tests/instrumented.py b/tests/instrumented.py
new file mode 100644
index 00000000..db5ec8e0
--- /dev/null
+++ b/tests/instrumented.py
@@ -0,0 +1,520 @@
+import argparse
+import re
+import sys
+import subprocess
+import pathlib
+import os
+
+from testing import (
+    EPD,
+    TSAN,
+    Stockfish as Engine,
+    MiniTestFramework,
+    OrderedClassMembers,
+    Valgrind,
+    Syzygy,
+)
+
+PATH = pathlib.Path(__file__).parent.resolve()
+CWD = os.getcwd()
+
+
+def get_prefix():
+    if args.valgrind:
+        return Valgrind.get_valgrind_command()
+    if args.valgrind_thread:
+        return Valgrind.get_valgrind_thread_command()
+
+    return []
+
+
+def get_threads():
+    if args.valgrind_thread or args.sanitizer_thread:
+        return 2
+    return 1
+
+
+def get_path():
+    return
os.path.abspath(os.path.join(CWD, args.stockfish_path)) + + +def postfix_check(output): + if args.sanitizer_undefined: + for idx, line in enumerate(output): + if "runtime error:" in line: + # print next possible 50 lines + for i in range(50): + debug_idx = idx + i + if debug_idx < len(output): + print(output[debug_idx]) + return False + + if args.sanitizer_thread: + for idx, line in enumerate(output): + if "WARNING: ThreadSanitizer:" in line: + # print next possible 50 lines + for i in range(50): + debug_idx = idx + i + if debug_idx < len(output): + print(output[debug_idx]) + return False + + return True + + +def Stockfish(*args, **kwargs): + return Engine(get_prefix(), get_path(), *args, **kwargs) + + +class TestCLI(metaclass=OrderedClassMembers): + + def beforeAll(self): + pass + + def afterAll(self): + pass + + def beforeEach(self): + self.stockfish = None + + def afterEach(self): + assert postfix_check(self.stockfish.get_output()) == True + self.stockfish.clear_output() + + def test_eval(self): + self.stockfish = Stockfish("eval".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_nodes_1000(self): + self.stockfish = Stockfish("go nodes 1000".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_depth_10(self): + self.stockfish = Stockfish("go depth 10".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_perft_4(self): + self.stockfish = Stockfish("go perft 4".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_movetime_1000(self): + self.stockfish = Stockfish("go movetime 1000".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_wtime_8000_btime_8000_winc_500_binc_500(self): + self.stockfish = Stockfish( + "go wtime 8000 btime 8000 winc 500 binc 500".split(" "), + True, + ) + assert self.stockfish.process.returncode == 0 + + def test_go_wtime_1000_btime_1000_winc_0_binc_0(self): + self.stockfish = Stockfish( + "go wtime 1000 btime 1000 winc 0 binc 0".split(" "), + True, + ) + assert self.stockfish.process.returncode == 0 + + def test_go_wtime_1000_btime_1000_winc_0_binc_0_movestogo_5(self): + self.stockfish = Stockfish( + "go wtime 1000 btime 1000 winc 0 binc 0 movestogo 5".split(" "), + True, + ) + assert self.stockfish.process.returncode == 0 + + def test_go_movetime_200(self): + self.stockfish = Stockfish("go movetime 200".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_go_nodes_20000_searchmoves_e2e4_d2d4(self): + self.stockfish = Stockfish( + "go nodes 20000 searchmoves e2e4 d2d4".split(" "), True + ) + assert self.stockfish.process.returncode == 0 + + def test_bench_128_threads_8_default_depth(self): + self.stockfish = Stockfish( + f"bench 128 {get_threads()} 8 default depth".split(" "), + True, + ) + assert self.stockfish.process.returncode == 0 + + def test_bench_128_threads_3_bench_tmp_epd_depth(self): + self.stockfish = Stockfish( + f"bench 128 {get_threads()} 3 {os.path.join(PATH,'bench_tmp.epd')} depth".split( + " " + ), + True, + ) + assert self.stockfish.process.returncode == 0 + + def test_d(self): + self.stockfish = Stockfish("d".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_compiler(self): + self.stockfish = Stockfish("compiler".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_license(self): + self.stockfish = Stockfish("license".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_uci(self): + 
self.stockfish = Stockfish("uci".split(" "), True) + assert self.stockfish.process.returncode == 0 + + def test_export_net_verify_nnue(self): + current_path = os.path.abspath(os.getcwd()) + self.stockfish = Stockfish( + f"export_net {os.path.join(current_path , 'verify.nnue')}".split(" "), True + ) + assert self.stockfish.process.returncode == 0 + + # verify the generated net equals the base net + + def test_network_equals_base(self): + self.stockfish = Stockfish( + ["uci"], + True, + ) + + output = self.stockfish.process.stdout + + # find line + for line in output.split("\n"): + if "option name EvalFile type string default" in line: + network = line.split(" ")[-1] + break + + # find network file in src dir + network = os.path.join(PATH.parent.resolve(), "src", network) + + if not os.path.exists(network): + print( + f"Network file {network} not found, please download the network file over the make command." + ) + assert False + + diff = subprocess.run(["diff", network, f"verify.nnue"]) + + assert diff.returncode == 0 + + +class TestInteractive(metaclass=OrderedClassMembers): + def beforeAll(self): + self.stockfish = Stockfish() + + def afterAll(self): + self.stockfish.quit() + assert self.stockfish.close() == 0 + + def afterEach(self): + assert postfix_check(self.stockfish.get_output()) == True + self.stockfish.clear_output() + + def test_startup_output(self): + self.stockfish.starts_with("Stockfish") + + def test_uci_command(self): + self.stockfish.send_command("uci") + self.stockfish.equals("uciok") + + def test_set_threads_option(self): + self.stockfish.send_command(f"setoption name Threads value {get_threads()}") + + def test_ucinewgame_and_startpos_nodes_1000(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go nodes 1000") + self.stockfish.starts_with("bestmove") + + def test_ucinewgame_and_startpos_moves(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position startpos moves e2e4 e7e6") + self.stockfish.send_command("go nodes 1000") + self.stockfish.starts_with("bestmove") + + def test_fen_position_1(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1") + self.stockfish.send_command("go nodes 1000") + self.stockfish.starts_with("bestmove") + + def test_fen_position_2_flip(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1") + self.stockfish.send_command("flip") + self.stockfish.send_command("go nodes 1000") + self.stockfish.starts_with("bestmove") + + def test_depth_5_with_callback(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go depth 5") + + def callback(output): + regex = r"info depth \d+ seldepth \d+ multipv \d+ score cp \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv" + if output.startswith("info depth") and not re.match(regex, output): + assert False + if output.startswith("bestmove"): + return True + return False + + self.stockfish.check_output(callback) + + def test_ucinewgame_and_go_depth_9(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("setoption name UCI_ShowWDL value true") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go depth 9") + + depth = 1 + + def callback(output): + nonlocal depth + + regex = rf"info depth {depth} seldepth \d+ 
multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv" + + if output.startswith("info depth"): + if not re.match(regex, output): + assert False + depth += 1 + + if output.startswith("bestmove"): + assert depth == 10 + return True + + return False + + self.stockfish.check_output(callback) + + def test_clear_hash(self): + self.stockfish.send_command("setoption name Clear Hash") + + def test_fen_position_mate_1(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 5K2/8/2qk4/2nPp3/3r4/6B1/B7/3R4 w - e6" + ) + self.stockfish.send_command("go depth 18") + + self.stockfish.expect("* score mate 1 * pv d5e6") + self.stockfish.equals("bestmove d5e6") + + def test_fen_position_mate_minus_1(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 2brrb2/8/p7/Q7/1p1kpPp1/1P1pN1K1/3P4/8 b - -" + ) + self.stockfish.send_command("go depth 18") + self.stockfish.expect("* score mate -1 *") + self.stockfish.starts_with("bestmove") + + def test_fen_position_fixed_node(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 5K2/8/2P1P1Pk/6pP/3p2P1/1P6/3P4/8 w - - 0 1" + ) + self.stockfish.send_command("go nodes 500000") + self.stockfish.starts_with("bestmove") + + def test_fen_position_with_mate_go_depth(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -" + ) + self.stockfish.send_command("go depth 18 searchmoves c6d7") + self.stockfish.expect("* score mate 2 * pv c6d7 * f7f5") + + self.stockfish.starts_with("bestmove") + + def test_fen_position_with_mate_go_mate(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -" + ) + self.stockfish.send_command("go mate 2 searchmoves c6d7") + self.stockfish.expect("* score mate 2 * pv c6d7 *") + + self.stockfish.starts_with("bestmove") + + def test_fen_position_with_mate_go_nodes(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -" + ) + self.stockfish.send_command("go nodes 500000 searchmoves c6d7") + self.stockfish.expect("* score mate 2 * pv c6d7 * f7f5") + + self.stockfish.starts_with("bestmove") + + def test_fen_position_depth_27(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen r1b2r1k/pp1p2pp/2p5/2B1q3/8/8/P1PN2PP/R4RK1 w - - 0 18" + ) + self.stockfish.send_command("go") + self.stockfish.contains("score mate 1") + + self.stockfish.starts_with("bestmove") + + def test_fen_position_with_mate_go_depth_and_promotion(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - - moves c6d7 f2f1q" + ) + self.stockfish.send_command("go depth 18") + self.stockfish.expect("* score mate 1 * pv f7f5") + self.stockfish.starts_with("bestmove f7f5") + + def test_fen_position_with_mate_go_depth_and_searchmoves(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -" + ) + self.stockfish.send_command("go depth 18 searchmoves c6d7") + self.stockfish.expect("* score mate 2 * pv c6d7 * f7f5") + + self.stockfish.starts_with("bestmove c6d7") + + def test_fen_position_with_moves_with_mate_go_depth_and_searchmoves(self): + self.stockfish.send_command("ucinewgame") + 
self.stockfish.send_command( + "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - - moves c6d7" + ) + self.stockfish.send_command("go depth 18 searchmoves e3e2") + self.stockfish.expect("* score mate -1 * pv e3e2 f7f5") + self.stockfish.starts_with("bestmove e3e2") + + def test_verify_nnue_network(self): + current_path = os.path.abspath(os.getcwd()) + Stockfish( + f"export_net {os.path.join(current_path , 'verify.nnue')}".split(" "), True + ) + + self.stockfish.send_command("setoption name EvalFile value verify.nnue") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go depth 5") + self.stockfish.starts_with("bestmove") + + def test_multipv_setting(self): + self.stockfish.send_command("setoption name MultiPV value 4") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go depth 5") + self.stockfish.starts_with("bestmove") + + def test_fen_position_with_skill_level(self): + self.stockfish.send_command("setoption name Skill Level value 10") + self.stockfish.send_command("position startpos") + self.stockfish.send_command("go depth 5") + self.stockfish.starts_with("bestmove") + + self.stockfish.send_command("setoption name Skill Level value 20") + + +class TestSyzygy(metaclass=OrderedClassMembers): + def beforeAll(self): + self.stockfish = Stockfish() + + def afterAll(self): + self.stockfish.quit() + assert self.stockfish.close() == 0 + + def afterEach(self): + assert postfix_check(self.stockfish.get_output()) == True + self.stockfish.clear_output() + + def test_syzygy_setup(self): + self.stockfish.starts_with("Stockfish") + self.stockfish.send_command("uci") + self.stockfish.send_command( + f"setoption name SyzygyPath value {os.path.join(PATH, 'syzygy')}" + ) + self.stockfish.expect( + "info string Found 35 WDL and 35 DTZ tablebase files (up to 4-man)." 
+ ) + + def test_syzygy_bench(self): + self.stockfish.send_command("bench 128 1 8 default depth") + self.stockfish.expect("Nodes searched :*") + + def test_syzygy_position(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position fen 4k3/PP6/8/8/8/8/8/4K3 w - - 0 1") + self.stockfish.send_command("go depth 5") + + def check_output(output): + if "score cp 20000" in output or "score mate" in output: + return True + + self.stockfish.check_output(check_output) + self.stockfish.expect("bestmove *") + + def test_syzygy_position_2(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position fen 8/1P6/2B5/8/4K3/8/6k1/8 w - - 0 1") + self.stockfish.send_command("go depth 5") + + def check_output(output): + if "score cp 20000" in output or "score mate" in output: + return True + + self.stockfish.check_output(check_output) + self.stockfish.expect("bestmove *") + + def test_syzygy_position_3(self): + self.stockfish.send_command("ucinewgame") + self.stockfish.send_command("position fen 8/1P6/2B5/8/4K3/8/6k1/8 b - - 0 1") + self.stockfish.send_command("go depth 5") + + def check_output(output): + if "score cp -20000" in output or "score mate" in output: + return True + + self.stockfish.check_output(check_output) + self.stockfish.expect("bestmove *") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Run Stockfish with testing options") + parser.add_argument("--valgrind", action="store_true", help="Run valgrind testing") + parser.add_argument( + "--valgrind-thread", action="store_true", help="Run valgrind-thread testing" + ) + parser.add_argument( + "--sanitizer-undefined", + action="store_true", + help="Run sanitizer-undefined testing", + ) + parser.add_argument( + "--sanitizer-thread", action="store_true", help="Run sanitizer-thread testing" + ) + + parser.add_argument( + "--none", action="store_true", help="Run without any testing options" + ) + parser.add_argument("stockfish_path", type=str, help="Path to Stockfish binary") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + EPD.create_bench_epd() + TSAN.set_tsan_option() + Syzygy.download_syzygy() + + framework = MiniTestFramework() + + # Each test suite will be ran inside a temporary directory + framework.run([TestCLI, TestInteractive, TestSyzygy]) + + EPD.delete_bench_epd() + TSAN.unset_tsan_option() + + if framework.has_failed(): + sys.exit(1) + + sys.exit(0) diff --git a/tests/instrumented.sh b/tests/instrumented.sh deleted file mode 100755 index 4c63fc57..00000000 --- a/tests/instrumented.sh +++ /dev/null @@ -1,301 +0,0 @@ -#!/bin/bash -# check for errors under Valgrind or sanitizers. - -error() -{ - echo "instrumented testing failed on line $1" - exit 1 -} -trap 'error ${LINENO}' ERR - -# define suitable post and prefixes for testing options -case $1 in - --valgrind) - echo "valgrind testing started" - prefix='' - exeprefix='valgrind --error-exitcode=42 --errors-for-leak-kinds=all --leak-check=full' - postfix='' - threads="1" - ;; - --valgrind-thread) - echo "valgrind-thread testing started" - prefix='' - exeprefix='valgrind --fair-sched=try --error-exitcode=42' - postfix='' - threads="2" - ;; - --sanitizer-undefined) - echo "sanitizer-undefined testing started" - prefix='!' - exeprefix='' - postfix='2>&1 | grep -A50 "runtime error:"' - threads="1" - ;; - --sanitizer-thread) - echo "sanitizer-thread testing started" - prefix='!' 
- exeprefix='' - postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"' - threads="2" - -cat << EOF > tsan.supp -race:Stockfish::TTEntry::move -race:Stockfish::TTEntry::depth -race:Stockfish::TTEntry::bound -race:Stockfish::TTEntry::save -race:Stockfish::TTEntry::value -race:Stockfish::TTEntry::eval -race:Stockfish::TTEntry::is_pv - -race:Stockfish::TranspositionTable::probe -race:Stockfish::TranspositionTable::hashfull - -EOF - - export TSAN_OPTIONS="suppressions=./tsan.supp" - - ;; - *) - echo "unknown testing started" - prefix='' - exeprefix='' - postfix='' - threads="1" - ;; -esac - -cat << EOF > bench_tmp.epd -Rn6/1rbq1bk1/2p2n1p/2Bp1p2/3Pp1pP/1N2P1P1/2Q1NPB1/6K1 w - - 2 26 -rnbqkb1r/ppp1pp2/5n1p/3p2p1/P2PP3/5P2/1PP3PP/RNBQKBNR w KQkq - 0 3 -3qnrk1/4bp1p/1p2p1pP/p2bN3/1P1P1B2/P2BQ3/5PP1/4R1K1 w - - 9 28 -r4rk1/1b2ppbp/pq4pn/2pp1PB1/1p2P3/1P1P1NN1/1PP3PP/R2Q1RK1 w - - 0 13 -EOF - -# simple command line testing -for args in "eval" \ - "go nodes 1000" \ - "go depth 10" \ - "go perft 4" \ - "go movetime 1000" \ - "go wtime 8000 btime 8000 winc 500 binc 500" \ - "go wtime 1000 btime 1000 winc 0 binc 0" \ - "go wtime 1000 btime 1000 winc 0 binc 0" \ - "go wtime 1000 btime 1000 winc 0 binc 0 movestogo 5" \ - "go movetime 200" \ - "go nodes 20000 searchmoves e2e4 d2d4" \ - "bench 128 $threads 8 default depth" \ - "bench 128 $threads 3 bench_tmp.epd depth" \ - "export_net verify.nnue" \ - "d" \ - "compiler" \ - "license" \ - "uci" -do - - echo "$prefix $exeprefix ./stockfish $args $postfix" - eval "$prefix $exeprefix ./stockfish $args $postfix" - -done - -# verify the generated net equals the base net -network=`./stockfish uci | grep 'option name EvalFile type string default' | awk '{print $NF}'` -echo "Comparing $network to the written verify.nnue" -diff $network verify.nnue - -# more general testing, following an uci protocol exchange -cat << EOF > game.exp - set timeout 240 - # to correctly catch eof we need the following line - # expect_before timeout { exit 2 } eof { exit 3 } - expect_before timeout { exit 2 } - - spawn $exeprefix ./stockfish - expect "Stockfish" - - send "uci\n" - expect "uciok" - - # send "setoption name Debug Log File value debug.log\n" - send "setoption name Threads value $threads\n" - - send "ucinewgame\n" - send "position startpos\n" - send "go nodes 1000\n" - expect "bestmove" - - send "ucinewgame\n" - send "position startpos moves e2e4 e7e6\n" - send "go nodes 1000\n" - expect "bestmove" - - send "ucinewgame\n" - send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n" - send "go depth 10\n" - expect "bestmove" - - send "ucinewgame\n" - send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n" - send "flip\n" - send "go depth 10\n" - expect "bestmove" - - send "ucinewgame\n" - send "position startpos\n" - send "go depth 5\n" - expect -re {info depth \d+ seldepth \d+ multipv \d+ score cp \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect "bestmove" - - send "ucinewgame\n" - send "setoption name UCI_ShowWDL value true\n" - send "position startpos\n" - send "go depth 9\n" - expect -re {info depth 1 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 2 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 3 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 4 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ 
nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 5 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 6 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 7 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 8 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect -re {info depth 9 seldepth \d+ multipv \d+ score cp \d+ wdl \d+ \d+ \d+ nodes \d+ nps \d+ hashfull \d+ tbhits \d+ time \d+ pv} - expect "bestmove" - - send "setoption name Clear Hash\n" - - send "ucinewgame\n" - send "position fen 5K2/8/2qk4/2nPp3/3r4/6B1/B7/3R4 w - e6\n" - send "go depth 18\n" - expect "score mate 1" - expect "pv d5e6" - expect "bestmove d5e6" - - send "ucinewgame\n" - send "position fen 2brrb2/8/p7/Q7/1p1kpPp1/1P1pN1K1/3P4/8 b - -\n" - send "go depth 18\n" - expect "score mate -1" - expect "bestmove" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -\n" - send "go depth 18\n" - expect "score mate 2 * pv c6d7 * f7f5" - expect "bestmove c6d7" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -\n" - send "go mate 2\n" - expect "score mate 2 * pv c6d7" - expect "bestmove c6d7" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -\n" - send "go nodes 10000\n" - expect "score mate 2 * pv c6d7 * f7f5" - expect "bestmove c6d7" - - send "ucinewgame\n" - send "position fen 1NR2B2/5p2/5p2/1p1kpp2/1P2rp2/2P1pB2/2P1P1K1/8 b - - \n" - send "go depth 18\n" - expect "score mate -2" - expect "pv d5e6 c8d8" - expect "bestmove d5e6" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - - moves c6d7 f2f1q\n" - send "go depth 18\n" - expect "score mate 1 * pv f7f5" - expect "bestmove f7f5" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - -\n" - send "go depth 18 searchmoves c6d7\n" - expect "score mate 2 * pv c6d7 * f7f5" - expect "bestmove c6d7" - - send "ucinewgame\n" - send "position fen 8/5R2/2K1P3/4k3/8/b1PPpp1B/5p2/8 w - - moves c6d7\n" - send "go depth 18 searchmoves e3e2\n" - expect "score mate -1 * pv e3e2 f7f5" - expect "bestmove e3e2" - - send "setoption name EvalFile value verify.nnue\n" - send "position startpos\n" - send "go depth 5\n" - expect "bestmove" - - send "setoption name MultiPV value 4\n" - send "position startpos\n" - send "go depth 5\n" - expect "bestmove" - - send "setoption name Skill Level value 10\n" - send "position startpos\n" - send "go depth 5\n" - expect "bestmove" - send "setoption name Skill Level value 20\n" - - send "quit\n" - expect eof - - # return error code of the spawned program, useful for Valgrind - lassign [wait] pid spawnid os_error_flag value - exit \$value -EOF - -#download TB as needed -if [ ! 
-d ../tests/syzygy ]; then - curl -sL https://api.github.com/repos/niklasf/python-chess/tarball/9b9aa13f9f36d08aadfabff872882f4ab1494e95 | tar -xzf - - mv niklasf-python-chess-9b9aa13 ../tests/syzygy -fi - -cat << EOF > syzygy.exp - set timeout 240 - # to correctly catch eof we need the following line - # expect_before timeout { exit 2 } eof { exit 3 } - expect_before timeout { exit 2 } - spawn $exeprefix ./stockfish - expect "Stockfish" - send "uci\n" - send "setoption name SyzygyPath value ../tests/syzygy/\n" - expect "info string Found 35 tablebases" - send "bench 128 1 8 default depth\n" - expect "Nodes searched :" - send "ucinewgame\n" - send "position fen 4k3/PP6/8/8/8/8/8/4K3 w - - 0 1\n" - send "go depth 5\n" - expect -re {score cp 20000|score mate} - expect "bestmove" - send "ucinewgame\n" - send "position fen 8/1P6/2B5/8/4K3/8/6k1/8 w - - 0 1\n" - send "go depth 5\n" - expect -re {score cp 20000|score mate} - expect "bestmove" - send "ucinewgame\n" - send "position fen 8/1P6/2B5/8/4K3/8/6k1/8 b - - 0 1\n" - send "go depth 5\n" - expect -re {score cp -20000|score mate} - expect "bestmove" - send "quit\n" - expect eof - - # return error code of the spawned program, useful for Valgrind - lassign [wait] pid spawnid os_error_flag value - exit \$value -EOF - -for exp in game.exp syzygy.exp -do - - echo "======== $exp ==============" - cat $exp - echo "============================" - echo "$prefix expect $exp $postfix" - eval "$prefix expect $exp $postfix" - - rm $exp - -done - -rm -f tsan.supp bench_tmp.epd - -echo "instrumented testing OK" diff --git a/tests/perft.sh b/tests/perft.sh index 545e750f..c1532c20 100755 --- a/tests/perft.sh +++ b/tests/perft.sh @@ -1,5 +1,5 @@ #!/bin/bash -# verify perft numbers (positions from www.chessprogramming.org/Perft_Results) +# verify perft numbers (positions from https://www.chessprogramming.org/Perft_Results) error() { diff --git a/tests/testing.py b/tests/testing.py new file mode 100644 index 00000000..d51ca89a --- /dev/null +++ b/tests/testing.py @@ -0,0 +1,378 @@ +import subprocess +from typing import List +import os +import collections +import time +import sys +import traceback +import fnmatch +from functools import wraps +from contextlib import redirect_stdout +import io +import tarfile +import pathlib +import concurrent.futures +import tempfile +import shutil +import requests + +CYAN_COLOR = "\033[36m" +GRAY_COLOR = "\033[2m" +RED_COLOR = "\033[31m" +GREEN_COLOR = "\033[32m" +RESET_COLOR = "\033[0m" +WHITE_BOLD = "\033[1m" + +MAX_TIMEOUT = 60 * 5 + +PATH = pathlib.Path(__file__).parent.resolve() + + +class Valgrind: + @staticmethod + def get_valgrind_command(): + return [ + "valgrind", + "--error-exitcode=42", + "--errors-for-leak-kinds=all", + "--leak-check=full", + ] + + @staticmethod + def get_valgrind_thread_command(): + return ["valgrind", "--error-exitcode=42", "--fair-sched=try"] + + +class TSAN: + @staticmethod + def set_tsan_option(): + with open(f"tsan.supp", "w") as f: + f.write( + """ +race:Stockfish::TTEntry::read +race:Stockfish::TTEntry::save +race:Stockfish::TranspositionTable::probe +race:Stockfish::TranspositionTable::hashfull +""" + ) + + os.environ["TSAN_OPTIONS"] = "suppressions=./tsan.supp" + + @staticmethod + def unset_tsan_option(): + os.environ.pop("TSAN_OPTIONS", None) + os.remove(f"tsan.supp") + + +class EPD: + @staticmethod + def create_bench_epd(): + with open(f"{os.path.join(PATH,'bench_tmp.epd')}", "w") as f: + f.write( + """ +Rn6/1rbq1bk1/2p2n1p/2Bp1p2/3Pp1pP/1N2P1P1/2Q1NPB1/6K1 w - - 2 26 
+rnbqkb1r/ppp1pp2/5n1p/3p2p1/P2PP3/5P2/1PP3PP/RNBQKBNR w KQkq - 0 3 +3qnrk1/4bp1p/1p2p1pP/p2bN3/1P1P1B2/P2BQ3/5PP1/4R1K1 w - - 9 28 +r4rk1/1b2ppbp/pq4pn/2pp1PB1/1p2P3/1P1P1NN1/1PP3PP/R2Q1RK1 w - - 0 13 +""" + ) + + @staticmethod + def delete_bench_epd(): + os.remove(f"{os.path.join(PATH,'bench_tmp.epd')}") + + +class Syzygy: + @staticmethod + def get_syzygy_path(): + return os.path.abspath("syzygy") + + @staticmethod + def download_syzygy(): + if not os.path.isdir(os.path.join(PATH, "syzygy")): + url = "https://api.github.com/repos/niklasf/python-chess/tarball/9b9aa13f9f36d08aadfabff872882f4ab1494e95" + file = "niklasf-python-chess-9b9aa13" + + with tempfile.TemporaryDirectory() as tmpdirname: + tarball_path = os.path.join(tmpdirname, f"{file}.tar.gz") + + response = requests.get(url, stream=True) + with open(tarball_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + with tarfile.open(tarball_path, "r:gz") as tar: + tar.extractall(tmpdirname) + + shutil.move(os.path.join(tmpdirname, file), os.path.join(PATH, "syzygy")) + +class OrderedClassMembers(type): + @classmethod + def __prepare__(self, name, bases): + return collections.OrderedDict() + + def __new__(self, name, bases, classdict): + classdict["__ordered__"] = [ + key for key in classdict.keys() if key not in ("__module__", "__qualname__") + ] + return type.__new__(self, name, bases, classdict) + + +class TimeoutException(Exception): + def __init__(self, message: str, timeout: int): + self.message = message + self.timeout = timeout + + +def timeout_decorator(timeout: float): + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(func, *args, **kwargs) + try: + result = future.result(timeout=timeout) + except concurrent.futures.TimeoutError: + raise TimeoutException( + f"Function {func.__name__} timed out after {timeout} seconds", + timeout, + ) + return result + + return wrapper + + return decorator + + +class MiniTestFramework: + def __init__(self): + self.passed_test_suites = 0 + self.failed_test_suites = 0 + self.passed_tests = 0 + self.failed_tests = 0 + + def has_failed(self) -> bool: + return self.failed_test_suites > 0 + + def run(self, classes: List[type]) -> bool: + self.start_time = time.time() + + for test_class in classes: + with tempfile.TemporaryDirectory() as tmpdirname: + original_cwd = os.getcwd() + os.chdir(tmpdirname) + + try: + if self.__run(test_class): + self.failed_test_suites += 1 + else: + self.passed_test_suites += 1 + finally: + os.chdir(original_cwd) + + self.__print_summary(round(time.time() - self.start_time, 2)) + return self.has_failed() + + def __run(self, test_class) -> bool: + test_instance = test_class() + test_name = test_instance.__class__.__name__ + test_methods = [m for m in test_instance.__ordered__ if m.startswith("test_")] + + print(f"\nTest Suite: {test_name}") + + if hasattr(test_instance, "beforeAll"): + test_instance.beforeAll() + + fails = 0 + + for method in test_methods: + fails += self.__run_test_method(test_instance, method) + + if hasattr(test_instance, "afterAll"): + test_instance.afterAll() + + self.failed_tests += fails + + return fails > 0 + + def __run_test_method(self, test_instance, method: str) -> int: + print(f" Running {method}... 
\r", end="", flush=True) + + buffer = io.StringIO() + fails = 0 + + try: + t0 = time.time() + + with redirect_stdout(buffer): + if hasattr(test_instance, "beforeEach"): + test_instance.beforeEach() + + getattr(test_instance, method)() + + if hasattr(test_instance, "afterEach"): + test_instance.afterEach() + + duration = time.time() - t0 + + self.print_success(f" {method} ({duration * 1000:.2f}ms)") + self.passed_tests += 1 + except Exception as e: + if isinstance(e, TimeoutException): + self.print_failure( + f" {method} (hit execution limit of {e.timeout} seconds)" + ) + + if isinstance(e, AssertionError): + self.__handle_assertion_error(t0, method) + + fails += 1 + finally: + self.__print_buffer_output(buffer) + + return fails + + def __handle_assertion_error(self, start_time, method: str): + duration = time.time() - start_time + self.print_failure(f" {method} ({duration * 1000:.2f}ms)") + traceback_output = "".join(traceback.format_tb(sys.exc_info()[2])) + + colored_traceback = "\n".join( + f" {CYAN_COLOR}{line}{RESET_COLOR}" + for line in traceback_output.splitlines() + ) + + print(colored_traceback) + + def __print_buffer_output(self, buffer: io.StringIO): + output = buffer.getvalue() + if output: + indented_output = "\n".join(f" {line}" for line in output.splitlines()) + print(f" {RED_COLOR}⎯⎯⎯⎯⎯OUTPUT⎯⎯⎯⎯⎯{RESET_COLOR}") + print(f"{GRAY_COLOR}{indented_output}{RESET_COLOR}") + print(f" {RED_COLOR}⎯⎯⎯⎯⎯OUTPUT⎯⎯⎯⎯⎯{RESET_COLOR}") + + def __print_summary(self, duration: float): + print(f"\n{WHITE_BOLD}Test Summary{RESET_COLOR}\n") + print( + f" Test Suites: {GREEN_COLOR}{self.passed_test_suites} passed{RESET_COLOR}, {RED_COLOR}{self.failed_test_suites} failed{RESET_COLOR}, {self.passed_test_suites + self.failed_test_suites} total" + ) + print( + f" Tests: {GREEN_COLOR}{self.passed_tests} passed{RESET_COLOR}, {RED_COLOR}{self.failed_tests} failed{RESET_COLOR}, {self.passed_tests + self.failed_tests} total" + ) + print(f" Time: {duration}s\n") + + def print_failure(self, add: str): + print(f" {RED_COLOR}✗{RESET_COLOR}{add}", flush=True) + + def print_success(self, add: str): + print(f" {GREEN_COLOR}✓{RESET_COLOR}{add}", flush=True) + + +class Stockfish: + def __init__( + self, + prefix: List[str], + path: str, + args: List[str] = [], + cli: bool = False, + ): + self.path = path + self.process = None + self.args = args + self.cli = cli + self.prefix = prefix + self.output = [] + + self.start() + + def start(self): + if self.cli: + self.process = subprocess.run( + self.prefix + [self.path] + self.args, + capture_output=True, + text=True, + ) + + self.process.stdout + + return + + self.process = subprocess.Popen( + self.prefix + [self.path] + self.args, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + ) + + def setoption(self, name: str, value: str): + self.send_command(f"setoption name {name} value {value}") + + def send_command(self, command: str): + if not self.process: + raise RuntimeError("Stockfish process is not started") + + self.process.stdin.write(command + "\n") + self.process.stdin.flush() + + @timeout_decorator(MAX_TIMEOUT) + def equals(self, expected_output: str): + for line in self.readline(): + if line == expected_output: + return + + @timeout_decorator(MAX_TIMEOUT) + def expect(self, expected_output: str): + for line in self.readline(): + if fnmatch.fnmatch(line, expected_output): + return + + @timeout_decorator(MAX_TIMEOUT) + def contains(self, expected_output: str): + for line in self.readline(): + if 
expected_output in line: + return + + @timeout_decorator(MAX_TIMEOUT) + def starts_with(self, expected_output: str): + for line in self.readline(): + if line.startswith(expected_output): + return + + @timeout_decorator(MAX_TIMEOUT) + def check_output(self, callback): + if not callback: + raise ValueError("Callback function is required") + + for line in self.readline(): + if callback(line) == True: + return + + def readline(self): + if not self.process: + raise RuntimeError("Stockfish process is not started") + + while True: + line = self.process.stdout.readline().strip() + self.output.append(line) + + yield line + + def clear_output(self): + self.output = [] + + def get_output(self) -> List[str]: + return self.output + + def quit(self): + self.send_command("quit") + + def close(self): + if self.process: + self.process.stdin.close() + self.process.stdout.close() + return self.process.wait() + + return 0
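
Note on the new Python test harness (an illustrative sketch, not part of the patch): tests/instrumented.py drives its suites through the helpers defined in tests/testing.py, and the same pattern extends to ad-hoc suites. In the sketch below the suite name TestSmoke, the method test_bestmove and the binary location are assumptions; every helper it uses (Stockfish, MiniTestFramework, OrderedClassMembers, send_command, equals, starts_with, quit, close, has_failed) comes from testing.py as added above.

import os
import sys

from testing import MiniTestFramework, OrderedClassMembers, Stockfish

# An absolute path is needed because MiniTestFramework runs each suite in a temporary directory.
SF = os.path.abspath("./stockfish")  # assumed location of the compiled binary


class TestSmoke(metaclass=OrderedClassMembers):
    # beforeAll/afterAll bracket the whole suite, as in TestInteractive above.
    def beforeAll(self):
        self.stockfish = Stockfish([], SF)  # empty prefix: no valgrind/sanitizer wrapper

    def afterAll(self):
        self.stockfish.quit()
        assert self.stockfish.close() == 0

    # test_* methods run in declaration order thanks to the OrderedClassMembers metaclass.
    def test_bestmove(self):
        self.stockfish.send_command("uci")
        self.stockfish.equals("uciok")
        self.stockfish.send_command("position startpos")
        self.stockfish.send_command("go nodes 1000")
        self.stockfish.starts_with("bestmove")


if __name__ == "__main__":
    framework = MiniTestFramework()
    framework.run([TestSmoke])
    sys.exit(1 if framework.has_failed() else 0)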
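For reference, the retuned WDL fit earlier in the patch can be checked outside the engine. The sketch below re-implements win_rate_params in Python using the new coefficients from uci.cpp, with material clamped to [17, 78] and rescaled by 58 as in the patch; per the corrected comment in to_cp, an internal value v then maps to roughly 100 * v / a centipawns. The example material count 68 and score 150 are made up for illustration, and the exact rounding used by the engine is not reproduced here.

def win_rate_params(material):
    # Coefficients of p_a(m) and p_b(m) from the patched uci.cpp (see the WDL_model repository).
    a_coeffs = [-37.45051876, 121.19101539, -132.78783573, 420.70576692]
    b_coeffs = [90.26261072, -137.26549898, 71.10130540, 51.35259597]

    m = min(max(material, 17), 78) / 58.0

    # Horner evaluation, mirroring the C++ polynomial.
    a = ((a_coeffs[0] * m + a_coeffs[1]) * m + a_coeffs[2]) * m + a_coeffs[3]
    b = ((b_coeffs[0] * m + b_coeffs[1]) * m + b_coeffs[2]) * m + b_coeffs[3]
    return a, b


a, b = win_rate_params(68)
print(round(100 * 150 / a))  # approximate centipawn value of a hypothetical internal score of 150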