From 7e72b37e4ce3351399bb0ac08686bd84cdc4fba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= Date: Wed, 10 Jul 2024 14:51:48 +0200 Subject: [PATCH] Clean up comments in code - Capitalize comments - Reformat multi-lines comments to equalize the widths of the lines - Try to keep the width of comments around 85 characters - Remove periods at the end of single-line comments closes https://github.com/official-stockfish/Stockfish/pull/5469 No functional change --- src/engine.h | 2 +- src/memory.cpp | 26 ++- src/memory.h | 27 +-- src/misc.cpp | 19 +- src/movepick.cpp | 8 +- src/nnue/nnue_feature_transformer.h | 40 +++-- src/numa.h | 203 +++++++++++---------- src/position.h | 4 +- src/search.cpp | 268 ++++++++++++++-------------- src/search.h | 9 +- src/thread.cpp | 31 ++-- src/types.h | 30 ++-- 12 files changed, 356 insertions(+), 311 deletions(-) diff --git a/src/engine.h b/src/engine.h index 0d6f0f2b..127f7d7c 100644 --- a/src/engine.h +++ b/src/engine.h @@ -49,7 +49,7 @@ class Engine { Engine(std::string path = ""); - // Can't be movable due to components holding backreferences to fields + // Cannot be movable due to components holding backreferences to fields Engine(const Engine&) = delete; Engine(Engine&&) = delete; Engine& operator=(const Engine&) = delete; diff --git a/src/memory.cpp b/src/memory.cpp index 1769a661..ae303c53 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -49,10 +49,12 @@ #include // std::cerr #include // std::endl #include + // The needed Windows API for processor groups could be missed from old Windows // versions, so instead of calling them directly (forcing the linker to resolve // the calls at compile time), try to load them at runtime. To do this we need // first to define the corresponding function pointers. + extern "C" { using OpenProcessToken_t = bool (*)(HANDLE, DWORD, PHANDLE); using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID); @@ -64,9 +66,10 @@ using AdjustTokenPrivileges_t = namespace Stockfish { -// Wrapper for systems where the c++17 implementation -// does not guarantee the availability of aligned_alloc(). Memory allocated with -// std_aligned_alloc() must be freed with std_aligned_free(). +// Wrappers for systems where the c++17 implementation does not guarantee the +// availability of aligned_alloc(). Memory allocated with std_aligned_alloc() +// must be freed with std_aligned_free(). + void* std_aligned_alloc(size_t alignment, size_t size) { #if defined(_ISOC11_SOURCE) return aligned_alloc(alignment, size); @@ -96,7 +99,8 @@ void std_aligned_free(void* ptr) { #endif } -// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages. +// aligned_large_pages_alloc() will return suitably aligned memory, +// if possible using large pages. #if defined(_WIN32) @@ -135,6 +139,7 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize return nullptr; // We need SeLockMemoryPrivilege, so try to enable it for the process + if (!OpenProcessToken_f( // OpenProcessToken() GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) return nullptr; @@ -149,8 +154,10 @@ static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize tp.Privileges[0].Luid = luid; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds, - // we still need to query GetLastError() to ensure that the privileges were actually obtained. 
+ // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() + // succeeds, we still need to query GetLastError() to ensure that the privileges + // were actually obtained. + if (AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) && GetLastError() == ERROR_SUCCESS) @@ -189,9 +196,9 @@ void* aligned_large_pages_alloc(size_t allocSize) { void* aligned_large_pages_alloc(size_t allocSize) { #if defined(__linux__) - constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size + constexpr size_t alignment = 2 * 1024 * 1024; // 2MB page size assumed #else - constexpr size_t alignment = 4096; // assumed small page size + constexpr size_t alignment = 4096; // small page size assumed #endif // Round up to multiples of alignment @@ -206,7 +213,8 @@ void* aligned_large_pages_alloc(size_t allocSize) { #endif -// aligned_large_pages_free() will free the previously allocated ttmem +// aligned_large_pages_free() will free the previously memory allocated +// by aligned_large_pages_alloc(). The effect is a nop if mem == nullptr. #if defined(_WIN32) diff --git a/src/memory.h b/src/memory.h index ad7ca602..3155a5aa 100644 --- a/src/memory.h +++ b/src/memory.h @@ -33,13 +33,13 @@ namespace Stockfish { void* std_aligned_alloc(size_t alignment, size_t size); void std_aligned_free(void* ptr); -// memory aligned by page size, min alignment: 4096 bytes -void* aligned_large_pages_alloc(size_t size); -// nop if mem == nullptr -void aligned_large_pages_free(void* mem); -// frees memory which was placed there with placement new. -// works for both single objects and arrays of unknown bound +// Memory aligned by page size, min alignment: 4096 bytes +void* aligned_large_pages_alloc(size_t size); +void aligned_large_pages_free(void* mem); + +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. template void memory_deleter(T* ptr, FREE_FUNC free_func) { if (!ptr) @@ -53,15 +53,15 @@ void memory_deleter(T* ptr, FREE_FUNC free_func) { return; } -// frees memory which was placed there with placement new. -// works for both single objects and arrays of unknown bound +// Frees memory which was placed there with placement new. +// Works for both single objects and arrays of unknown bound. template void memory_deleter_array(T* ptr, FREE_FUNC free_func) { if (!ptr) return; - // Move back on the pointer to where the size is allocated. + // Move back on the pointer to where the size is allocated const size_t array_offset = std::max(sizeof(size_t), alignof(T)); char* raw_memory = reinterpret_cast(ptr) - array_offset; @@ -77,7 +77,7 @@ void memory_deleter_array(T* ptr, FREE_FUNC free_func) { free_func(raw_memory); } -// Allocates memory for a single object and places it there with placement new. +// Allocates memory for a single object and places it there with placement new template inline std::enable_if_t, T*> memory_allocator(ALLOC_FUNC alloc_func, Args&&... args) { @@ -86,7 +86,7 @@ inline std::enable_if_t, T*> memory_allocator(ALLOC_FUNC all return new (raw_memory) T(std::forward(args)...); } -// Allocates memory for an array of unknown bound and places it there with placement new. 
+// Allocates memory for an array of unknown bound and places it there with placement new template inline std::enable_if_t, std::remove_extent_t*> memory_allocator(ALLOC_FUNC alloc_func, size_t num) { @@ -94,7 +94,7 @@ memory_allocator(ALLOC_FUNC alloc_func, size_t num) { const size_t array_offset = std::max(sizeof(size_t), alignof(ElementType)); - // save the array size in the memory location + // Save the array size in the memory location char* raw_memory = reinterpret_cast(alloc_func(array_offset + num * sizeof(ElementType))); ASSERT_ALIGNED(raw_memory, alignof(T)); @@ -104,7 +104,8 @@ memory_allocator(ALLOC_FUNC alloc_func, size_t num) { for (size_t i = 0; i < num; ++i) new (raw_memory + array_offset + i * sizeof(ElementType)) ElementType(); - // Need to return the pointer at the start of the array so that the indexing in unique_ptr works + // Need to return the pointer at the start of the array so that + // the indexing in unique_ptr works. return reinterpret_cast(raw_memory + array_offset); } diff --git a/src/misc.cpp b/src/misc.cpp index b68c12b9..664ab4b8 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -112,14 +112,16 @@ class Logger { // Returns the full name of the current Stockfish version. -// For local dev compiles we try to append the commit sha and commit date -// from git if that fails only the local compilation date is set and "nogit" is specified: -// Stockfish dev-YYYYMMDD-SHA -// or -// Stockfish dev-YYYYMMDD-nogit +// +// For local dev compiles we try to append the commit SHA and +// commit date from git. If that fails only the local compilation +// date is set and "nogit" is specified: +// Stockfish dev-YYYYMMDD-SHA +// or +// Stockfish dev-YYYYMMDD-nogit // // For releases (non-dev builds) we only include the version number: -// Stockfish version +// Stockfish version std::string engine_info(bool to_uci) { std::stringstream ss; ss << "Stockfish " << version << std::setfill('0'); @@ -131,8 +133,9 @@ std::string engine_info(bool to_uci) { ss << stringify(GIT_DATE); #else constexpr std::string_view months("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec"); - std::string month, day, year; - std::stringstream date(__DATE__); // From compiler, format is "Sep 21 2008" + + std::string month, day, year; + std::stringstream date(__DATE__); // From compiler, format is "Sep 21 2008" date >> month >> day >> year; ss << year << std::setw(2) << std::setfill('0') << (1 + months.find(month) / 4) diff --git a/src/movepick.cpp b/src/movepick.cpp index f6f9f0dc..c21b14a9 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -59,8 +59,8 @@ enum Stages { QCHECK }; -// Sort moves in descending order up to and including -// a given limit. The order of moves smaller than the limit is left unspecified. +// Sort moves in descending order up to and including a given limit. +// The order of moves smaller than the limit is left unspecified. void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) { for (ExtMove *sortedEnd = begin, *p = begin + 1; p < end; ++p) @@ -125,8 +125,8 @@ MovePicker::MovePicker(const Position& p, stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) + !(ttm && pos.pseudo_legal(ttm)); } -// Constructor for ProbCut: we generate captures with SEE greater -// than or equal to the given threshold. +// Constructor for ProbCut: we generate captures with SEE greater than or equal +// to the given threshold. 
MovePicker::MovePicker(const Position& p, Move ttm, int th, const CapturePieceToHistory* cph) : pos(p), captureHistory(cph), diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 483b84a8..ad0fb1b4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -354,13 +354,18 @@ class FeatureTransformer { for (IndexType j = 0; j < NumOutputChunks; ++j) { - // What we want to do is multiply inputs in a pairwise manner (after clipping), and then shift right by 9. - // Instead, we shift left by 7, and use mulhi, stripping the bottom 16 bits, effectively shifting right by 16, - // resulting in a net shift of 9 bits. We use mulhi because it maintains the sign of the multiplication (unlike mullo), - // allowing us to make use of packus to clip 2 of the inputs, resulting in a save of 2 "vec_max_16" calls. - // A special case is when we use NEON, where we shift left by 6 instead, because the instruction "vqdmulhq_s16" - // also doubles the return value after the multiplication, adding an extra shift to the left by 1, so we - // compensate by shifting less before the multiplication. + // What we want to do is multiply inputs in a pairwise manner + // (after clipping), and then shift right by 9. Instead, we + // shift left by 7, and use mulhi, stripping the bottom 16 bits, + // effectively shifting right by 16, resulting in a net shift + // of 9 bits. We use mulhi because it maintains the sign of + // the multiplication (unlike mullo), allowing us to make use + // of packus to clip 2 of the inputs, resulting in a save of 2 + // "vec_max_16" calls. A special case is when we use NEON, + // where we shift left by 6 instead, because the instruction + // "vqdmulhq_s16" also doubles the return value after the + // multiplication, adding an extra shift to the left by 1, so + // we compensate by shifting less before the multiplication. #if defined(USE_SSE2) constexpr int shift = 7; @@ -426,10 +431,10 @@ class FeatureTransformer { } // NOTE: The parameter states_to_update is an array of position states. - // All states must be sequential, that is states_to_update[i] must either be reachable - // by repeatedly applying ->previous from states_to_update[i+1]. - // computed_st must be reachable by repeatedly applying ->previous on - // states_to_update[0]. + // All states must be sequential, that is states_to_update[i] must + // either be reachable by repeatedly applying ->previous from + // states_to_update[i+1], and computed_st must be reachable by + // repeatedly applying ->previous on states_to_update[0]. template void update_accumulator_incremental(const Position& pos, StateInfo* computed_st, @@ -446,7 +451,7 @@ class FeatureTransformer { #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch + // is defined in the VECTOR code below, once in each branch. vec_t acc[NumRegs]; psqt_vec_t psqt[NumPsqtRegs]; #endif @@ -474,7 +479,8 @@ class FeatureTransformer { StateInfo* st = computed_st; - // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. + // Now update the accumulators listed in states_to_update[], + // where the last element is a sentinel. #ifdef VECTOR if (N == 1 && (removed[0].size() == 1 || removed[0].size() == 2) && added[0].size() == 1) @@ -794,7 +800,7 @@ class FeatureTransformer { } // The accumulator of the refresh entry has been updated. 
- // Now copy its content to the actual accumulator we were refreshing + // Now copy its content to the actual accumulator we were refreshing. std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, sizeof(BiasType) * HalfDimensions); @@ -827,7 +833,7 @@ class FeatureTransformer { if ((oldest_st->*accPtr).computed[Perspective]) { - // Only update current position accumulator to minimize work. + // Only update current position accumulator to minimize work StateInfo* states_to_update[1] = {pos.state()}; update_accumulator_incremental(pos, oldest_st, states_to_update); } @@ -846,8 +852,8 @@ class FeatureTransformer { if (next == nullptr) return; - // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. - // Currently we update 2 accumulators. + // Now update the accumulators listed in states_to_update[], where + // the last element is a sentinel. Currently we update two accumulators: // 1. for the current position // 2. the next accumulator after the computed one // The heuristic may change in the future. diff --git a/src/numa.h b/src/numa.h index fd9abd4d..3de8281d 100644 --- a/src/numa.h +++ b/src/numa.h @@ -37,8 +37,8 @@ #include "memory.h" -// We support linux very well, but we explicitly do NOT support Android, because there's -// no affected systems, not worth maintaining. +// We support linux very well, but we explicitly do NOT support Android, +// because there is no affected systems, not worth maintaining. #if defined(__linux__) && !defined(__ANDROID__) #if !defined(_GNU_SOURCE) #define _GNU_SOURCE @@ -81,9 +81,9 @@ using NumaIndex = size_t; inline CpuIndex get_hardware_concurrency() { CpuIndex concurrency = std::thread::hardware_concurrency(); - // Get all processors across all processor groups on windows, since ::hardware_concurrency - // only returns the number of processors in the first group, because only these - // are available to std::thread. + // Get all processors across all processor groups on windows, since + // hardware_concurrency() only returns the number of processors in + // the first group, because only these are available to std::thread. #ifdef _WIN64 concurrency = std::max(concurrency, GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)); #endif @@ -101,7 +101,7 @@ struct WindowsAffinity { // We also provide diagnostic for when the affinity is set to nullopt // whether it was due to being indeterminate. If affinity is indeterminate - // it's best to assume it is not set at all, so consistent with the meaning + // it is best to assume it is not set at all, so consistent with the meaning // of the nullopt affinity. bool isNewDeterminate = true; bool isOldDeterminate = true; @@ -119,23 +119,25 @@ struct WindowsAffinity { } // Since Windows 11 and Windows Server 2022 thread affinities can span - // processor groups and can be set as such by a new WinAPI function. - // However, we may need to force using the old API if we detect - // that the process has affinity set by the old API already and we want to override that. - // Due to the limitations of the old API we can't detect its use reliably. - // There will be cases where we detect not use but it has actually been used and vice versa. + // processor groups and can be set as such by a new WinAPI function. However, + // we may need to force using the old API if we detect that the process has + // affinity set by the old API already and we want to override that. Due to the + // limitations of the old API we cannot detect its use reliably. 
There will be + // cases where we detect not use but it has actually been used and vice versa. + bool likely_used_old_api() const { return oldApi.has_value() || !isOldDeterminate; } }; inline std::pair> get_process_group_affinity() { + // GetProcessGroupAffinity requires the GroupArray argument to be // aligned to 4 bytes instead of just 2. static constexpr size_t GroupArrayMinimumAlignment = 4; static_assert(GroupArrayMinimumAlignment >= alignof(USHORT)); // The function should succeed the second time, but it may fail if the group - // affinity has changed between GetProcessGroupAffinity calls. - // In such case we consider this a hard error, as we can't work with unstable affinities + // affinity has changed between GetProcessGroupAffinity calls. In such case + // we consider this a hard error, as we Cannot work with unstable affinities // anyway. static constexpr int MAX_TRIES = 2; USHORT GroupCount = 1; @@ -165,10 +167,10 @@ inline std::pair> get_process_group_affinity() { } // On Windows there are two ways to set affinity, and therefore 2 ways to get it. -// These are not consistent, so we have to check both. -// In some cases it is actually not possible to determine affinity. -// For example when two different threads have affinity on different processor groups, -// set using SetThreadAffinityMask, we can't retrieve the actual affinities. +// These are not consistent, so we have to check both. In some cases it is actually +// not possible to determine affinity. For example when two different threads have +// affinity on different processor groups, set using SetThreadAffinityMask, we cannot +// retrieve the actual affinities. // From documentation on GetProcessAffinityMask: // > If the calling process contains threads in multiple groups, // > the function returns zero for both affinity masks. @@ -196,8 +198,8 @@ inline WindowsAffinity get_process_affinity() { } else if (RequiredMaskCount > 0) { - // If RequiredMaskCount then these affinities were never set, but it's not consistent - // so GetProcessAffinityMask may still return some affinity. + // If RequiredMaskCount then these affinities were never set, but it's + // not consistent so GetProcessAffinityMask may still return some affinity. auto groupAffinities = std::make_unique(RequiredMaskCount); status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(), @@ -233,7 +235,7 @@ inline WindowsAffinity get_process_affinity() { DWORD_PTR proc, sys; status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys); - // If proc == 0 then we can't determine affinity because it spans processor groups. + // If proc == 0 then we cannot determine affinity because it spans processor groups. // On Windows 11 and Server 2022 it will instead // > If, however, hHandle specifies a handle to the current process, the function // > always uses the calling thread's primary group (which by default is the same @@ -246,10 +248,12 @@ inline WindowsAffinity get_process_affinity() { return affinity; } - // If SetProcessAffinityMask was never called the affinity - // must span all processor groups, but if it was called it must only span one. + // If SetProcessAffinityMask was never called the affinity must span + // all processor groups, but if it was called it must only span one. + std::vector groupAffinity; // We need to capture this later and capturing // from structured bindings requires c++20. 
+ std::tie(status, groupAffinity) = get_process_group_affinity(); if (status == 0) { @@ -282,11 +286,12 @@ inline WindowsAffinity get_process_affinity() { // If we got here it means that either SetProcessAffinityMask was never set // or we're on Windows 11/Server 2022. - // Since Windows 11 and Windows Server 2022 the behaviour of GetProcessAffinityMask changed - // > If, however, hHandle specifies a handle to the current process, the function - // > always uses the calling thread's primary group (which by default is the same - // > as the process' primary group) in order to set the - // > lpProcessAffinityMask and lpSystemAffinityMask. + // Since Windows 11 and Windows Server 2022 the behaviour of + // GetProcessAffinityMask changed: + // > If, however, hHandle specifies a handle to the current process, + // > the function always uses the calling thread's primary group + // > (which by default is the same as the process' primary group) + // > in order to set the lpProcessAffinityMask and lpSystemAffinityMask. // In which case we can actually retrieve the full affinity. if (GetThreadSelectedCpuSetMasks_f != nullptr) @@ -300,9 +305,11 @@ inline WindowsAffinity get_process_affinity() { const int numActiveProcessors = GetActiveProcessorCount(static_cast(procGroupIndex)); - // We have to schedule to 2 different processors and & the affinities we get. - // Otherwise our processor choice could influence the resulting affinity. - // We assume the processor IDs within the group are filled sequentially from 0. + // We have to schedule to two different processors + // and & the affinities we get. Otherwise our processor + // choice could influence the resulting affinity. + // We assume the processor IDs within the group are + // filled sequentially from 0. uint64_t procCombined = std::numeric_limits::max(); uint64_t sysCombined = std::numeric_limits::max(); @@ -346,8 +353,9 @@ inline WindowsAffinity get_process_affinity() { } } - // We have to detect the case where the affinity was not set, or is set to all processors - // so that we correctly produce as std::nullopt result. + // We have to detect the case where the affinity was not set, + // or is set to all processors so that we correctly produce as + // std::nullopt result. if (!isAffinityFull) { affinity.oldApi = std::move(cpus); @@ -369,16 +377,16 @@ inline std::set get_process_affinity() { std::set cpus; - // For unsupported systems, or in case of a soft error, we may assume all processors - // are available for use. + // For unsupported systems, or in case of a soft error, we may assume + // all processors are available for use. [[maybe_unused]] auto set_to_all_cpus = [&]() { for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c) cpus.insert(c); }; // cpu_set_t by default holds 1024 entries. This may not be enough soon, - // but there is no easy way to determine how many threads there actually is. - // In this case we just choose a reasonable upper bound. + // but there is no easy way to determine how many threads there actually + // is. In this case we just choose a reasonable upper bound. static constexpr CpuIndex MaxNumCpus = 1024 * 64; cpu_set_t* mask = CPU_ALLOC(MaxNumCpus); @@ -437,19 +445,19 @@ class NumaReplicatedAccessToken { NumaIndex n; }; -// Designed as immutable, because there is no good reason to alter an already existing config -// in a way that doesn't require recreating it completely, and it would be complex and expensive -// to maintain class invariants. 
-// The CPU (processor) numbers always correspond to the actual numbering used by the system. -// The NUMA node numbers MAY NOT correspond to the system's numbering of the NUMA nodes. -// In particular, empty nodes may be removed, or the user may create custom nodes. -// It is guaranteed that NUMA nodes are NOT empty, i.e. every node exposed by NumaConfig -// has at least one processor assigned. +// Designed as immutable, because there is no good reason to alter an already +// existing config in a way that doesn't require recreating it completely, and +// it would be complex and expensive to maintain class invariants. +// The CPU (processor) numbers always correspond to the actual numbering used +// by the system. The NUMA node numbers MAY NOT correspond to the system's +// numbering of the NUMA nodes. In particular, empty nodes may be removed, or +// the user may create custom nodes. It is guaranteed that NUMA nodes are NOT +// empty: every node exposed by NumaConfig has at least one processor assigned. // // We use startup affinities so as not to modify its own behaviour in time. // -// Until Stockfish doesn't support exceptions all places where an exception should be thrown -// are replaced by std::exit. +// Since Stockfish doesn't support exceptions all places where an exception +// should be thrown are replaced by std::exit. class NumaConfig { public: NumaConfig() : @@ -460,9 +468,9 @@ class NumaConfig { } // This function queries the system for the mapping of processors to NUMA nodes. - // On Linux we read from standardized kernel sysfs, with a fallback to single NUMA node. - // On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see - // comment for Windows implementation of get_process_affinity + // On Linux we read from standardized kernel sysfs, with a fallback to single NUMA + // node. On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see + // comment for Windows implementation of get_process_affinity. static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) { NumaConfig cfg = empty(); @@ -479,7 +487,6 @@ class NumaConfig { // On Linux things are straightforward, since there's no processor groups and // any thread can be scheduled on all processors. - // We try to gather this information from the sysfs first // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node @@ -504,9 +511,9 @@ class NumaConfig { std::string path = std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist"; auto cpuIdsStr = read_file_to_string(path); - // Now, we only bail if the file does not exist. Some nodes may be empty, that's fine. - // An empty node still has a file that appears to have some whitespace, so we need - // to handle that. + // Now, we only bail if the file does not exist. Some nodes may be + // empty, that's fine. An empty node still has a file that appears + // to have some whitespace, so we need to handle that. if (!cpuIdsStr.has_value()) { fallback(); @@ -538,9 +545,10 @@ class NumaConfig { if (respectProcessAffinity) allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined(); - // The affinity can't be determined in all cases on Windows, but we at least guarantee - // that the number of allowed processors is >= number of processors in the affinity mask. - // In case the user is not satisfied they must set the processor numbers explicitly. 
+ // The affinity cannot be determined in all cases on Windows, + // but we at least guarantee that the number of allowed processors + // is >= number of processors in the affinity mask. In case the user + // is not satisfied they must set the processor numbers explicitly. auto is_cpu_allowed = [&allowedCpus](CpuIndex c) { return !allowedCpus.has_value() || allowedCpus->count(c) == 1; }; @@ -711,21 +719,22 @@ class NumaConfig { } bool suggests_binding_threads(CpuIndex numThreads) const { - // If we can reasonably determine that the threads can't be contained + // If we can reasonably determine that the threads cannot be contained // by the OS within the first NUMA node then we advise distributing // and binding threads. When the threads are not bound we can only use // NUMA memory replicated objects from the first node, so when the OS - // has to schedule on other nodes we lose performance. - // We also suggest binding if there's enough threads to distribute among nodes - // with minimal disparity. - // We try to ignore small nodes, in particular the empty ones. + // has to schedule on other nodes we lose performance. We also suggest + // binding if there's enough threads to distribute among nodes with minimal + // disparity. We try to ignore small nodes, in particular the empty ones. - // If the affinity set by the user does not match the affinity given by the OS - // then binding is necessary to ensure the threads are running on correct processors. + // If the affinity set by the user does not match the affinity given by + // the OS then binding is necessary to ensure the threads are running on + // correct processors. if (customAffinity) return true; - // We obviously can't distribute a single thread, so a single thread should never be bound. + // We obviously cannot distribute a single thread, so a single thread + // should never be bound. if (numThreads <= 1) return false; @@ -754,8 +763,8 @@ class NumaConfig { if (nodes.size() == 1) { - // special case for when there's no NUMA nodes - // doesn't buy us much, but let's keep the default path simple + // Special case for when there's no NUMA nodes. This doesn't buy us + // much, but let's keep the default path simple. ns.resize(numThreads, NumaIndex{0}); } else @@ -769,9 +778,11 @@ class NumaConfig { { float fill = static_cast(occupation[n] + 1) / static_cast(nodes[n].size()); - // NOTE: Do we want to perhaps fill the first available node up to 50% first before considering other nodes? - // Probably not, because it would interfere with running multiple instances. We basically shouldn't - // favor any particular node. + // NOTE: Do we want to perhaps fill the first available node + // up to 50% first before considering other nodes? + // Probably not, because it would interfere with running + // multiple instances. We basically shouldn't favor any + // particular node. if (fill < bestNodeFill) { bestNode = n; @@ -816,18 +827,19 @@ class NumaConfig { #elif defined(_WIN64) - // Requires Windows 11. No good way to set thread affinity spanning processor groups before that. + // Requires Windows 11. No good way to set thread affinity spanning + // processor groups before that. HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll")); auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t( (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks")); - // We ALWAYS set affinity with the new API if available, - // because there's no downsides, and we forcibly keep it consistent - // with the old API should we need to use it. I.e. 
we always keep this as a superset - // of what we set with SetThreadGroupAffinity. + // We ALWAYS set affinity with the new API if available, because + // there's no downsides, and we forcibly keep it consistent with + // the old API should we need to use it. I.e. we always keep this + // as a superset of what we set with SetThreadGroupAffinity. if (SetThreadSelectedCpuSetMasks_f != nullptr) { - // Only available on Windows 11 and Windows Server 2022 onwards. + // Only available on Windows 11 and Windows Server 2022 onwards const USHORT numProcGroups = USHORT( ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE); auto groupAffinities = std::make_unique(numProcGroups); @@ -857,22 +869,25 @@ class NumaConfig { // Sometimes we need to force the old API, but do not use it unless necessary. if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API) { - // On earlier windows version (since windows 7) we can't run a single thread + // On earlier windows version (since windows 7) we cannot run a single thread // on multiple processor groups, so we need to restrict the group. // We assume the group of the first processor listed for this node. // Processors from outside this group will not be assigned for this thread. // Normally this won't be an issue because windows used to assign NUMA nodes - // such that they can't span processor groups. However, since Windows 10 Build 20348 - // the behaviour changed, so there's a small window of versions between this and Windows 11 - // that might exhibit problems with not all processors being utilized. - // We handle this in NumaConfig::from_system by manually splitting the nodes when - // we detect that there's no function to set affinity spanning processor nodes. - // This is required because otherwise our thread distribution code may produce - // suboptimal results. + // such that they cannot span processor groups. However, since Windows 10 + // Build 20348 the behaviour changed, so there's a small window of versions + // between this and Windows 11 that might exhibit problems with not all + // processors being utilized. + // + // We handle this in NumaConfig::from_system by manually splitting the + // nodes when we detect that there is no function to set affinity spanning + // processor nodes. This is required because otherwise our thread distribution + // code may produce suboptimal results. + // // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support GROUP_AFFINITY affinity; std::memset(&affinity, 0, sizeof(GROUP_AFFINITY)); - // We use an ordered set so we're guaranteed to get the smallest cpu number here. + // We use an ordered set to be sure to get the smallest cpu number here. const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE; affinity.Group = static_cast(forcedProcGroupIndex); for (CpuIndex c : nodes[n]) @@ -894,8 +909,8 @@ class NumaConfig { if (status == 0) std::exit(EXIT_FAILURE); - // We yield this thread just to be sure it gets rescheduled. - // This is defensive, allowed because this code is not performance critical. + // We yield this thread just to be sure it gets rescheduled. This is + // defensive, allowed because this code is not performance critical. SwitchToThread(); } @@ -1013,8 +1028,8 @@ class NumaConfig { class NumaReplicationContext; -// Instances of this class are tracked by the NumaReplicationContext instance -// NumaReplicationContext informs all tracked instances whenever NUMA configuration changes. 
+// Instances of this class are tracked by the NumaReplicationContext instance. +// NumaReplicationContext informs all tracked instances when NUMA configuration changes. class NumaReplicatedBase { public: NumaReplicatedBase(NumaReplicationContext& ctx); @@ -1034,9 +1049,9 @@ class NumaReplicatedBase { NumaReplicationContext* context; }; -// We force boxing with a unique_ptr. If this becomes an issue due to added indirection we -// may need to add an option for a custom boxing type. -// When the NUMA config changes the value stored at the index 0 is replicated to other nodes. +// We force boxing with a unique_ptr. If this becomes an issue due to added +// indirection we may need to add an option for a custom boxing type. When the +// NUMA config changes the value stored at the index 0 is replicated to other nodes. template class NumaReplicated: public NumaReplicatedBase { public: @@ -1090,8 +1105,8 @@ class NumaReplicated: public NumaReplicatedBase { } void on_numa_config_changed() override { - // Use the first one as the source. It doesn't matter which one we use, because they all must - // be identical, but the first one is guaranteed to exist. + // Use the first one as the source. It doesn't matter which one we use, + // because they all must be identical, but the first one is guaranteed to exist. auto source = std::move(instances[0]); replicate_from(std::move(*source)); } @@ -1167,7 +1182,7 @@ class NumaReplicationContext { private: NumaConfig config; - // std::set uses std::less by default, which is required for pointer comparison to be defined. + // std::set uses std::less by default, which is required for pointer comparison std::set trackedReplicatedObjects; }; diff --git a/src/position.h b/src/position.h index 3cfb87d0..064dd5fa 100644 --- a/src/position.h +++ b/src/position.h @@ -315,8 +315,8 @@ inline bool Position::capture(Move m) const { } // Returns true if a move is generated from the capture stage, having also -// queen promotions covered, i.e. consistency with the capture stage move generation -// is needed to avoid the generation of duplicate moves. +// queen promotions covered, i.e. consistency with the capture stage move +// generation is needed to avoid the generation of duplicate moves. inline bool Position::capture_stage(Move m) const { assert(m.is_ok()); return capture(m) || m.promotion_type() == QUEEN; diff --git a/src/search.cpp b/src/search.cpp index 47c5dc88..d3d95eda 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -78,7 +78,8 @@ constexpr int futility_move_count(bool improving, Depth depth) { return improving ? (3 + depth * depth) : (3 + depth * depth) / 2; } -// Add correctionHistory value to raw staticEval and guarantee evaluation does not hit the tablebase range +// Add correctionHistory value to raw staticEval and guarantee evaluation +// does not hit the tablebase range. Value to_corrected_static_eval(Value v, const Worker& w, const Position& pos) { auto cv = w.correctionHistory[pos.side_to_move()][pawn_structure_index(pos)]; v += cv * std::abs(cv) / 5073; @@ -333,8 +334,8 @@ void Search::Worker::iterative_deepening() { int failedHighCnt = 0; while (true) { - // Adjust the effective depth searched, but ensure at least one effective increment - // for every four searchAgain steps (see issue #2717). + // Adjust the effective depth searched, but ensure at least one + // effective increment for every four searchAgain steps (see issue #2717). 
Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - 3 * (searchAgainCounter + 1) / 4); rootDelta = beta - alpha; @@ -354,15 +355,15 @@ void Search::Worker::iterative_deepening() { if (threads.stop) break; - // When failing high/low give some update before a re-search. - // To avoid excessive output that could hang GUIs like Fritz 19, only start + // When failing high/low give some update before a re-search. To avoid + // excessive output that could hang GUIs like Fritz 19, only start // at nodes > 10M (rather than depth N, which can be reached quickly) if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta) && nodes > 10000000) main_manager()->pv(*this, threads, tt, rootDepth); - // In case of failing low/high increase aspiration window and - // re-search, otherwise exit the loop. + // In case of failing low/high increase aspiration window and re-search, + // otherwise exit the loop. if (bestValue <= alpha) { beta = (alpha + beta) / 2; @@ -390,10 +391,11 @@ void Search::Worker::iterative_deepening() { if (mainThread && (threads.stop || pvIdx + 1 == multiPV || nodes > 10000000) - // A thread that aborted search can have mated-in/TB-loss PV and score - // that cannot be trusted, i.e. it can be delayed or refuted if we would have - // had time to fully search other root-moves. Thus we suppress this output and - // below pick a proven score/PV for this thread (from the previous iteration). + // A thread that aborted search can have mated-in/TB-loss PV and + // score that cannot be trusted, i.e. it can be delayed or refuted + // if we would have had time to fully search other root-moves. Thus + // we suppress this output and below pick a proven score/PV for this + // thread (from the previous iteration). && !(threads.abortedSearch && rootMoves[0].uciScore <= VALUE_TB_LOSS_IN_MAX_PLY)) main_manager()->pv(*this, threads, tt, rootDepth); @@ -504,6 +506,7 @@ void Search::Worker::iterative_deepening() { skill.best ? skill.best : skill.pick_best(rootMoves, multiPV))); } +// Reset histories, usually before a new game void Search::Worker::clear() { mainHistory.fill(0); captureHistory.fill(-700); @@ -523,7 +526,7 @@ void Search::Worker::clear() { } -// Main search function for both PV and non-PV nodes. +// Main search function for both PV and non-PV nodes template Value Search::Worker::search( Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode) { @@ -538,7 +541,7 @@ Value Search::Worker::search( // Limit the depth if extensions made it too large depth = std::min(depth, MAX_PLY - 1); - // Check if we have an upcoming move that draws by repetition. + // Check if we have an upcoming move that draws by repetition if (!rootNode && alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply)) { alpha = value_draw(this->nodes); @@ -611,7 +614,7 @@ Value Search::Worker::search( Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; ss->statScore = 0; - // Step 4. Transposition table lookup. + // Step 4. Transposition table lookup excludedMove = ss->excludedMove; posKey = pos.key(); auto [ttHit, ttData, ttWriter] = tt.probe(posKey); @@ -676,7 +679,7 @@ Value Search::Worker::search( Value tbValue = VALUE_TB - ss->ply; - // use the range VALUE_TB to VALUE_TB_WIN_IN_MAX_PLY to score + // Use the range VALUE_TB to VALUE_TB_WIN_IN_MAX_PLY to score value = wdl < -drawScore ? -tbValue : wdl > drawScore ? 
tbValue : VALUE_DRAW + 2 * wdl * drawScore; @@ -771,8 +774,8 @@ Value Search::Worker::search( opponentWorsening = ss->staticEval + (ss - 1)->staticEval > 2; // Step 7. Razoring (~1 Elo) - // If eval is really low check with qsearch if it can exceed alpha, if it can't, - // return a fail low. + // If eval is really low, check with qsearch if we can exceed alpha. If the + // search suggests we cannot exceed alpha, return a speculative fail low. if (eval < alpha - 494 - 290 * depth * depth) { value = qsearch(pos, ss, alpha - 1, alpha); @@ -836,27 +839,26 @@ Value Search::Worker::search( if (PvNode && !ttData.move) depth -= 3; - // Use qsearch if depth <= 0. + // Use qsearch if depth <= 0 if (depth <= 0) return qsearch(pos, ss, alpha, beta); - // For cutNodes, if depth is high enough, decrease depth by 2 if there is no ttMove, or - // by 1 if there is a ttMove with an upper bound. + // For cutNodes, if depth is high enough, decrease depth by 2 if there is no ttMove, + // or by 1 if there is a ttMove with an upper bound. if (cutNode && depth >= 7 && (!ttData.move || ttData.bound == BOUND_UPPER)) depth -= 1 + !ttData.move; // Step 11. ProbCut (~10 Elo) - // If we have a good enough capture (or queen promotion) and a reduced search returns a value - // much above beta, we can (almost) safely prune the previous move. + // If we have a good enough capture (or queen promotion) and a reduced search + // returns a value much above beta, we can (almost) safely prune the previous move. probCutBeta = beta + 184 - 53 * improving; - if ( - !PvNode && depth > 3 - && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY - // If value from transposition table is lower than probCutBeta, don't attempt probCut - // there and in further interactions with transposition table cutoff depth is set to depth - 3 - // because probCut search has depth set to depth - 4 but we also do a move before it - // So effective depth is equal to depth - 3 - && !(ttData.depth >= depth - 3 && ttData.value != VALUE_NONE && ttData.value < probCutBeta)) + if (!PvNode && depth > 3 + && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY + // If value from transposition table is lower than probCutBeta, don't attempt + // probCut there and in further interactions with transposition table cutoff + // depth is set to depth - 3 because probCut search has depth set to depth - 4 + // but we also do a move before it. So effective depth is equal to depth - 3. + && !(ttData.depth >= depth - 3 && ttData.value != VALUE_NONE && ttData.value < probCutBeta)) { assert(probCutBeta < VALUE_INFINITE && probCutBeta > beta); @@ -870,7 +872,6 @@ Value Search::Worker::search( if (move == excludedMove) continue; - // Check for legality if (!pos.legal(move)) continue; @@ -1050,18 +1051,18 @@ moves_loop: // When in check, search starts here // We take care to not overdo to avoid search getting stuck. if (ss->ply < thisThread->rootDepth * 2) { - // Singular extension search (~76 Elo, ~170 nElo). If all moves but one fail - // low on a search of (alpha-s, beta-s), and just one fails high on (alpha, beta), - // then that move is singular and should be extended. To verify this we do - // a reduced search on the position excluding the ttMove and if the result - // is lower than ttValue minus a margin, then we will extend the ttMove. - // Recursive singular search is avoided. + // Singular extension search (~76 Elo, ~170 nElo). If all moves but one + // fail low on a search of (alpha-s, beta-s), and just one fails high on + // (alpha, beta), then that move is singular and should be extended. 
To + // verify this we do a reduced search on the position excluding the ttMove + // and if the result is lower than ttValue minus a margin, then we will + // extend the ttMove. Recursive singular search is avoided. - // Note: the depth margin and singularBeta margin are known for having non-linear - // scaling. Their values are optimized to time controls of 180+1.8 and longer - // so changing them requires tests at these types of time controls. - // Generally, higher singularBeta (i.e closer to ttValue) and lower extension - // margins scale well. + // Note: the depth margin and singularBeta margin are known for having + // non-linear scaling. Their values are optimized to time controls of + // 180+1.8 and longer so changing them requires tests at these types of + // time controls. Generally, higher singularBeta (i.e closer to ttValue) + // and lower extension margins scale well. if (!rootNode && move == ttData.move && !excludedMove && depth >= 4 - (thisThread->completedDepth > 36) + ss->ttPv @@ -1089,28 +1090,31 @@ moves_loop: // When in check, search starts here // Multi-cut pruning // Our ttMove is assumed to fail high based on the bound of the TT entry, - // and if after excluding the ttMove with a reduced search we fail high over the original beta, - // we assume this expected cut-node is not singular (multiple moves fail high), - // and we can prune the whole subtree by returning a softbound. + // and if after excluding the ttMove with a reduced search we fail high + // over the original beta, we assume this expected cut-node is not + // singular (multiple moves fail high), and we can prune the whole + // subtree by returning a softbound. else if (value >= beta && std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY) return value; // Negative extensions - // If other moves failed high over (ttValue - margin) without the ttMove on a reduced search, - // but we cannot do multi-cut because (ttValue - margin) is lower than the original beta, - // we do not know if the ttMove is singular or can do a multi-cut, - // so we reduce the ttMove in favor of other moves based on some conditions: + // If other moves failed high over (ttValue - margin) without the + // ttMove on a reduced search, but we cannot do multi-cut because + // (ttValue - margin) is lower than the original beta, we do not know + // if the ttMove is singular or can do a multi-cut, so we reduce the + // ttMove in favor of other moves based on some conditions: // If the ttMove is assumed to fail high over current beta (~7 Elo) else if (ttData.value >= beta) extension = -3; - // If we are on a cutNode but the ttMove is not assumed to fail high over current beta (~1 Elo) + // If we are on a cutNode but the ttMove is not assumed to fail high + // over current beta (~1 Elo) else if (cutNode) extension = -2; } - // Extension for capturing the previous moved piece (~0 Elo on STC, ~1 Elo on LTC) + // Extension for capturing the previous moved piece (~1 Elo at LTC) else if (PvNode && move.to_sq() == prevSq && thisThread->captureHistory[movedPiece][move.to_sq()] [type_of(pos.piece_on(move.to_sq()))] @@ -1136,9 +1140,9 @@ moves_loop: // When in check, search starts here pos.do_move(move, st, givesCheck); // These reduction adjustments have proven non-linear scaling. - // They are optimized to time controls of 180 + 1.8 and longer so - // changing them or adding conditions that are similar - // requires tests at these types of time controls. 
+ // They are optimized to time controls of 180 + 1.8 and longer, + // so changing them or adding conditions that are similar requires + // tests at these types of time controls. // Decrease reduction if position is or has been on the PV (~7 Elo) if (ss->ttPv) @@ -1148,7 +1152,7 @@ moves_loop: // When in check, search starts here if (PvNode) r--; - // These reduction adjustments have no proven non-linear scaling. + // These reduction adjustments have no proven non-linear scaling // Increase reduction for cut nodes (~4 Elo) if (cutNode) @@ -1163,8 +1167,8 @@ moves_loop: // When in check, search starts here if ((ss + 1)->cutoffCnt > 3) r += 1 + !(PvNode || cutNode); - // For first picked move (ttMove) reduce reduction - // but never allow it to go below 0 (~3 Elo) + // For first picked move (ttMove) reduce reduction, but never allow + // reduction to go below 0 (~3 Elo) else if (move == ttData.move) r = std::max(0, r - 2); @@ -1190,8 +1194,8 @@ moves_loop: // When in check, search starts here // Do a full-depth search when reduced LMR search fails high if (value > alpha && d < newDepth) { - // Adjust full-depth search based on LMR results - if the result - // was good enough search deeper, if it was bad enough search shallower. + // Adjust full-depth search based on LMR results - if the result was + // good enough search deeper, if it was bad enough search shallower. const bool doDeeperSearch = value > (bestValue + 35 + 2 * newDepth); // (~1 Elo) const bool doShallowerSearch = value < bestValue + newDepth; // (~2 Elo) @@ -1237,8 +1241,8 @@ moves_loop: // When in check, search starts here // Step 20. Check for a new best move // Finished searching the move. If a stop occurred, the return value of - // the search cannot be trusted, and we return immediately without - // updating best move, PV and TT. + // the search cannot be trusted, and we return immediately without updating + // best move, principal variation nor transposition table. if (threads.stop.load(std::memory_order_relaxed)) return VALUE_ZERO; @@ -1351,7 +1355,8 @@ moves_loop: // When in check, search starts here if (!moveCount) bestValue = excludedMove ? alpha : ss->inCheck ? mated_in(ss->ply) : VALUE_DRAW; - // If there is a move that produces search value greater than alpha we update the stats of searched moves + // If there is a move that produces search value greater than alpha, + // we update the stats of searched moves. else if (bestMove) update_all_stats(pos, ss, *this, bestMove, prevSq, quietsSearched, quietCount, capturesSearched, captureCount, depth); @@ -1385,8 +1390,8 @@ moves_loop: // When in check, search starts here if (bestValue <= alpha) ss->ttPv = ss->ttPv || ((ss - 1)->ttPv && depth > 3); - // Write gathered information in transposition table - // Static evaluation is saved as it was before correction history + // Write gathered information in transposition table. Note that the + // static evaluation is saved as it was before correction history. if (!excludedMove && !(rootNode && thisThread->pvIdx)) ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, bestValue >= beta ? BOUND_LOWER @@ -1410,12 +1415,12 @@ moves_loop: // When in check, search starts here } -// Quiescence search function, which is called by the main search function with zero depth, or -// recursively with further decreasing depth per call. With depth <= 0, we "should" be using -// static eval only, but tactical moves may confuse the static eval. To fight this horizon effect, -// we implement this qsearch of tactical moves only. 
-// See https://www.chessprogramming.org/Horizon_Effect and https://www.chessprogramming.org/Quiescence_Search -// (~155 Elo) +// Quiescence search function, which is called by the main search function with +// depth zero, or recursively with further decreasing depth. With depth <= 0, we +// "should" be using static eval only, but tactical moves may confuse the static eval. +// To fight this horizon effect, we implement this qsearch of tactical moves (~155 Elo). +// See https://www.chessprogramming.org/Horizon_Effect +// and https://www.chessprogramming.org/Quiescence_Search template Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth) { @@ -1426,7 +1431,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, assert(PvNode || (alpha == beta - 1)); assert(depth <= 0); - // Check if we have an upcoming move that draws by repetition. (~1 Elo) + // Check if we have an upcoming move that draws by repetition (~1 Elo) if (alpha < VALUE_DRAW && pos.upcoming_repetition(ss->ply)) { alpha = value_draw(this->nodes); @@ -1469,9 +1474,10 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, assert(0 <= ss->ply && ss->ply < MAX_PLY); - // Note that unlike regular search, which stores the literal depth into the TT, from QS we - // only store the current movegen stage as "depth". If in check, we search all evasions and - // thus store DEPTH_QS_CHECKS. (Evasions may be quiet, and _CHECKS includes quiets.) + // Note that unlike regular search, which stores the literal depth into the + // transposition table, from qsearch we only store the current movegen stage + // as "depth". If in check, we search all evasions and thus store DEPTH_QS_CHECKS. + // Evasions may be quiet, and _CHECKS includes quiets. Depth qsTtDepth = ss->inCheck || depth >= DEPTH_QS_CHECKS ? DEPTH_QS_CHECKS : DEPTH_QS_NORMAL; // Step 3. Transposition table lookup @@ -1512,7 +1518,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, } else { - // In case of null move search, use previous static eval with a different sign + // In case of null move search, use previous static eval with opposite sign unadjustedStaticEval = (ss - 1)->currentMove != Move::null() ? evaluate(networks[numaAccessToken], pos, refreshTable, thisThread->optimism[us]) @@ -1542,21 +1548,20 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory, (ss - 2)->continuationHistory}; - // Initialize a MovePicker object for the current position, and prepare to search the moves. - // We presently use two stages of qs movegen, first captures+checks, then captures only. - // (When in check, we simply search all evasions.) - // (Presently, having the checks stage is worth only 1 Elo, and may be removable in the near future, - // which would result in only a single stage of QS movegen.) + // Initialize a MovePicker object for the current position, and prepare to search + // the moves. We presently use two stages of move generator in quiescence search: + // first captures+checks, then captures only (but when in check, we simply search + // all evasions). Square prevSq = ((ss - 1)->currentMove).is_ok() ? ((ss - 1)->currentMove).to_sq() : SQ_NONE; MovePicker mp(pos, ttData.move, depth, &thisThread->mainHistory, &thisThread->captureHistory, contHist, &thisThread->pawnHistory); - // Step 5. 
Loop through all pseudo-legal moves until no moves remain or a beta cutoff occurs. + // Step 5. Loop through all pseudo-legal moves until no moves remain or a beta + // cutoff occurs. while ((move = mp.next_move()) != Move::none()) { assert(move.is_ok()); - // Check for legality if (!pos.legal(move)) continue; @@ -1577,24 +1582,24 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Value futilityValue = futilityBase + PieceValue[pos.piece_on(move.to_sq())]; - // If static eval + value of piece we are going to capture is much lower - // than alpha we can prune this move. (~2 Elo) + // If static eval + value of piece we are going to capture is + // much lower than alpha, we can prune this move. (~2 Elo) if (futilityValue <= alpha) { bestValue = std::max(bestValue, futilityValue); continue; } - // If static eval is much lower than alpha and move is not winning material - // we can prune this move. (~2 Elo) + // If static eval is much lower than alpha and move is + // not winning material, we can prune this move. (~2 Elo) if (futilityBase <= alpha && !pos.see_ge(move, 1)) { bestValue = std::max(bestValue, futilityBase); continue; } - // If static exchange evaluation is much worse than what is needed to not - // fall below alpha we can prune this move. + // If static exchange evaluation is much worse than what + // is needed to not fall below alpha, we can prune this move. if (futilityBase > alpha && !pos.see_ge(move, (alpha - futilityBase) * 4)) { bestValue = alpha; @@ -1654,8 +1659,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, } // Step 9. Check for mate - // All legal moves have been searched. A special case: if we're in check - // and no legal moves were found, it is checkmate. + // All legal moves have been searched. A special case: if we are + // in check and no legal moves were found, it is checkmate. if (ss->inCheck && bestValue == -VALUE_INFINITE) { assert(!MoveList(pos).size()); @@ -1665,8 +1670,8 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta, if (std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY && bestValue >= beta) bestValue = (3 * bestValue + beta) / 4; - // Save gathered info in transposition table - // Static evaluation is saved as it was before adjustment by correction history + // Save gathered info in transposition table. The static evaluation + // is saved as it was before adjustment by correction history. ttWriter.write(posKey, value_to_tt(bestValue, ss->ply), pvHit, bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, qsTtDepth, bestMove, unadjustedStaticEval, tt.generation()); @@ -1697,8 +1702,8 @@ TimePoint Search::Worker::elapsed_time() const { return main_manager()->tm.elaps namespace { -// Adjusts a mate or TB score from "plies to mate from the root" -// to "plies to mate from the current position". Standard scores are unchanged. +// Adjusts a mate or TB score from "plies to mate from the root" to +// "plies to mate from the current position". Standard scores are unchanged. // The function is called before storing a value in the transposition table. Value value_to_tt(Value v, int ply) { @@ -1707,11 +1712,11 @@ Value value_to_tt(Value v, int ply) { } -// Inverse of value_to_tt(): it adjusts a mate or TB score -// from the transposition table (which refers to the plies to mate/be mated from -// current position) to "plies to mate/be mated (TB win/loss) from the root". 
-// However, to avoid potentially false mate or TB scores related to the 50 moves rule -// and the graph history interaction, we return the highest non-TB score instead. +// Inverse of value_to_tt(): it adjusts a mate or TB score from the transposition +// table (which refers to the plies to mate/be mated from current position) to +// "plies to mate/be mated (TB win/loss) from the root". However, to avoid +// potentially false mate or TB scores related to the 50 moves rule and the +// graph history interaction, we return the highest non-TB score instead. Value value_from_tt(Value v, int ply, int r50c) { if (v == VALUE_NONE) @@ -1810,8 +1815,8 @@ void update_all_stats(const Position& pos, } -// Updates histories of the move pairs formed -// by moves at ply -1, -2, -3, -4, and -6 with current move. +// Updates histories of the move pairs formed by moves +// at ply -1, -2, -3, -4, and -6 with current move. void update_continuation_histories(Stack* ss, Piece pc, Square to, int bonus) { bonus = bonus * 52 / 64; @@ -1859,8 +1864,8 @@ void update_quiet_stats( } -// When playing with strength handicap, choose the best move among a set of RootMoves -// using a statistical rule dependent on 'level'. Idea by Heinz van Saanen. +// When playing with strength handicap, choose the best move among a set of +// RootMoves using a statistical rule dependent on 'level'. Idea by Heinz van Saanen. Move Skill::pick_best(const RootMoves& rootMoves, size_t multiPV) { static PRNG rng(now()); // PRNG sequence should be non-deterministic @@ -1891,8 +1896,8 @@ Move Skill::pick_best(const RootMoves& rootMoves, size_t multiPV) { } -// Used to print debug info and, more importantly, -// to detect when we are out of available time and thus stop the search. +// Used to print debug info and, more importantly, to detect +// when we are out of available time and thus stop the search. void SearchManager::check_time(Search::Worker& worker) { if (--callsCnt > 0) return; @@ -1926,8 +1931,9 @@ void SearchManager::check_time(Search::Worker& worker) { } // Used to correct and extend PVs for moves that have a TB (but not a mate) score. -// Keeps the search based PV for as long as it is verified to maintain the game outcome, truncates afterwards. -// Finally, extends to mate the PV, providing a possible continuation (but not a proven mating line). +// Keeps the search based PV for as long as it is verified to maintain the game +// outcome, truncates afterwards. Finally, extends to mate the PV, providing a +// possible continuation (but not a proven mating line). void syzygy_extend_pv(const OptionsMap& options, const Search::LimitsType& limits, Position& pos, @@ -1937,7 +1943,7 @@ void syzygy_extend_pv(const OptionsMap& options, auto t_start = std::chrono::steady_clock::now(); int moveOverhead = int(options["Move Overhead"]); - // Do not use more than moveOverhead / 2 time, if time management is active. + // Do not use more than moveOverhead / 2 time, if time management is active auto time_abort = [&t_start, &moveOverhead, &limits]() -> bool { auto t_end = std::chrono::steady_clock::now(); return limits.use_time_management() @@ -1968,7 +1974,7 @@ void syzygy_extend_pv(const OptionsMap& options, auto& st = sts.emplace_back(); pos.do_move(pvMove, st); - // don't allow for repetitions or drawing moves along the PV in TB regime. 
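// Aside (not part of the patch): a minimal sketch of the ply adjustment that the
// value_to_tt()/value_from_tt() comments above describe. MATE_BOUND is an illustrative
// stand-in for Stockfish's VALUE_TB_WIN_IN_MAX_PLY; the real value_from_tt() additionally
// downgrades scores that the 50-move counter could falsify, which this sketch omits.

constexpr int MATE_BOUND = 31000;

// "Plies to mate from the root" -> "plies to mate from the current position" (store time)
constexpr int to_tt(int v, int ply) {
    return v >= MATE_BOUND ? v + ply : v <= -MATE_BOUND ? v - ply : v;
}

// Inverse mapping, applied when the stored value is read back at some other ply
constexpr int from_tt(int v, int ply) {
    return v >= MATE_BOUND ? v - ply : v <= -MATE_BOUND ? v + ply : v;
}

static_assert(from_tt(to_tt(31950, 3), 3) == 31950);  // round-trips at the same ply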
+ // Do not allow for repetitions or drawing moves along the PV in TB regime
 if (config.rootInTB && pos.is_draw(ply))
 {
 pos.undo_move(pvMove);
@@ -1976,17 +1982,18 @@ void syzygy_extend_pv(const OptionsMap& options,
 break;
 }

- // Full PV shown will thus be validated and end TB.
- // If we can't validate the full PV in time, we don't show it.
+ // Full PV shown will thus be validated and end in TB.
+ // If we cannot validate the full PV in time, we do not show it.
 if (config.rootInTB && time_abort())
 break;
 }

- // resize the PV to the correct part
+ // Resize the PV to the correct part
 rootMove.pv.resize(ply);

- // Step 2, now extend the PV to mate, as if the user explores syzygy-tables.info using
- // top ranked moves (minimal DTZ), which gives optimal mates only for simple endgames e.g. KRvK
+ // Step 2, now extend the PV to mate, as if the user explored syzygy-tables.info
+ // using top ranked moves (minimal DTZ), which gives optimal mates only for simple
+ // endgames e.g. KRvK.
 while (!pos.is_draw(0))
 {
 if (time_abort())
@@ -1998,8 +2005,8 @@ void syzygy_extend_pv(const OptionsMap& options,
 auto& rm = legalMoves.emplace_back(m);
 StateInfo tmpSI;
 pos.do_move(m, tmpSI);
- // Give a score of each move to break DTZ ties
- // restricting opponent mobility, but not giving the opponent a capture.
+ // Give a score of each move to break DTZ ties restricting opponent mobility,
+ // but not giving the opponent a capture.
 for (const auto& mOpp : MoveList<LEGAL>(pos))
 rm.tbRank -= pos.capture(mOpp) ? 100 : 1;
 pos.undo_move(m);
@@ -2009,16 +2016,16 @@ void syzygy_extend_pv(const OptionsMap& options,
 if (legalMoves.size() == 0)
 break;

- // sort moves according to their above assigned rank,
+ // Sort moves according to their above assigned rank.
 // This will break ties for moves with equal DTZ in rank_root_moves.
 std::stable_sort(
 legalMoves.begin(), legalMoves.end(),
 [](const Search::RootMove& a, const Search::RootMove& b) { return a.tbRank > b.tbRank; });

- // The winning side tries to minimize DTZ, the losing side maximizes it.
+ // The winning side tries to minimize DTZ, the losing side maximizes it
 Tablebases::Config config = Tablebases::rank_root_moves(options, pos, legalMoves, true);

- // If DTZ is not available we might not find a mate, so we bail out.
+ // If DTZ is not available we might not find a mate, so we bail out
 if (!config.rootInTB || config.cardinality > 0)
 break;

@@ -2030,23 +2037,24 @@ void syzygy_extend_pv(const OptionsMap& options,
 pos.do_move(pvMove, st);
 }

- // Finding a draw in this function is an exceptional case, that cannot happen during engine game play,
- // since we have a winning score, and play correctly with TB support.
- // However, it can be that a position is draw due to the 50 move rule if it has been been reached
- // on the board with a non-optimal 50 move counter e.g. 8/8/6k1/3B4/3K4/4N3/8/8 w - - 54 106
- // which TB with dtz counter rounding cannot always correctly rank. See also
+ // Finding a draw in this function is an exceptional case that cannot happen
+ // during engine game play, since we have a winning score, and play correctly
+ // with TB support. However, it can be that a position is a draw due to the 50 move
+ // rule if it has been reached on the board with a non-optimal 50 move counter
+ // (e.g. 8/8/6k1/3B4/3K4/4N3/8/8 w - - 54 106 ) which TB with dtz counter rounding
+ // cannot always correctly rank. See also
 // https://github.com/official-stockfish/Stockfish/issues/5175#issuecomment-2058893495
- // We adjust the score to match the found PV. 
Note that a TB loss score can be displayed - // if the engine did not find a drawing move yet, but eventually search will figure it out. - // E.g. 1kq5/q2r4/5K2/8/8/8/8/7Q w - - 96 1 + // We adjust the score to match the found PV. Note that a TB loss score can be + // displayed if the engine did not find a drawing move yet, but eventually search + // will figure it out (e.g. 1kq5/q2r4/5K2/8/8/8/8/7Q w - - 96 1 ) if (pos.is_draw(0)) v = VALUE_DRAW; - // Undo the PV moves. + // Undo the PV moves for (auto it = rootMove.pv.rbegin(); it != rootMove.pv.rend(); ++it) pos.undo_move(*it); - // Inform if we couldn't get a full extension in time. + // Inform if we couldn't get a full extension in time if (time_abort()) sync_cout << "info string Syzygy based PV extension requires more time, increase Move Overhead as needed." @@ -2092,7 +2100,7 @@ void SearchManager::pv(Search::Worker& worker, for (Move m : rootMoves[i].pv) pv += UCIEngine::move(m, pos.is_chess960()) + " "; - // remove last whitespace + // Remove last whitespace if (!pv.empty()) pv.pop_back(); diff --git a/src/search.h b/src/search.h index 122cd549..57596754 100644 --- a/src/search.h +++ b/src/search.h @@ -236,8 +236,8 @@ class Worker { public: Worker(SharedState&, std::unique_ptr, size_t, NumaReplicatedAccessToken); - // Called at instantiation to initialize Reductions tables - // Reset histories, usually before a new game + // Called at instantiation to initialize reductions tables. + // Reset histories, usually before a new game. void clear(); // Called when the program receives the UCI 'go' command. @@ -256,7 +256,7 @@ class Worker { private: void iterative_deepening(); - // Main search function for both PV and non-PV nodes + // This is the main search function, for both PV and non-PV nodes template Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode); @@ -266,8 +266,7 @@ class Worker { Depth reduction(bool i, Depth d, int mn, int delta) const; - // Get a pointer to the search manager, only allowed to be called by the - // main thread. + // Pointer to the search manager, only allowed to be called by the main thread SearchManager* main_manager() const { assert(threadIdx == 0); return static_cast(manager.get()); diff --git a/src/thread.cpp b/src/thread.cpp index 4acb9854..f17fc4a5 100644 --- a/src/thread.cpp +++ b/src/thread.cpp @@ -50,8 +50,8 @@ Thread::Thread(Search::SharedState& sharedState, run_custom_job([this, &binder, &sharedState, &sm, n]() { // Use the binder to [maybe] bind the threads to a NUMA node before doing - // the Worker allocation. - // Ideally we would also allocate the SearchManager here, but that's minor. + // the Worker allocation. Ideally we would also allocate the SearchManager + // here, but that's minor. this->numaAccessToken = binder(); this->worker = std::make_unique(sharedState, std::move(sm), n, this->numaAccessToken); @@ -72,27 +72,26 @@ Thread::~Thread() { stdThread.join(); } - // Wakes up the thread that will start the search void Thread::start_searching() { assert(worker != nullptr); run_custom_job([this]() { worker->start_searching(); }); } -// Wakes up the thread that will start the search +// Clears the histories for the thread worker (usually before a new game) void Thread::clear_worker() { assert(worker != nullptr); run_custom_job([this]() { worker->clear(); }); } -// Blocks on the condition variable -// until the thread has finished searching. 
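// Aside (not part of the patch): the Thread constructor comment above relies on the
// Worker being allocated inside the newly spawned thread, after the NUMA binding, so
// that its memory is first touched on the node the thread will run on. A toy sketch of
// that ordering; bind_to_numa_node() is a hypothetical stand-in for the binder.

#include <memory>
#include <thread>

struct BigSearchState { /* histories, tables, ... */ };

std::unique_ptr<BigSearchState> spawn_bound_state() {
    std::unique_ptr<BigSearchState> state;
    std::thread t([&] {
        // bind_to_numa_node();                      // hypothetical: pin this thread first
        state = std::make_unique<BigSearchState>();  // then allocate from the bound thread
    });
    t.join();  // the real Thread keeps running; joining here just keeps the sketch simple
    return state;
}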
+// Blocks on the condition variable until the thread has finished searching void Thread::wait_for_search_finished() { std::unique_lock lk(mutex); cv.wait(lk, [&] { return !searching; }); } +// Launching a function in the thread void Thread::run_custom_job(std::function f) { { std::unique_lock lk(mutex); @@ -103,8 +102,8 @@ void Thread::run_custom_job(std::function f) { cv.notify_one(); } -// Thread gets parked here, blocked on the -// condition variable, when it has no work to do. +// Thread gets parked here, blocked on the condition variable +// when the thread has no work to do. void Thread::idle_loop() { while (true) @@ -233,8 +232,9 @@ void ThreadPool::wait_on_thread(size_t threadId) { size_t ThreadPool::num_threads() const { return threads.size(); } -// Wakes up main thread waiting in idle_loop() and -// returns immediately. Main thread will wake up other threads and start the search. + +// Wakes up main thread waiting in idle_loop() and returns immediately. +// Main thread will wake up other threads and start the search. void ThreadPool::start_thinking(const OptionsMap& options, Position& pos, StateListPtr& states, @@ -274,8 +274,8 @@ void ThreadPool::start_thinking(const OptionsMap& options, // We use Position::set() to set root position across threads. But there are // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot // be deduced from a fen string, so set() clears them and they are set from - // setupStates->back() later. The rootState is per thread, earlier states are shared - // since they are read-only. + // setupStates->back() later. The rootState is per thread, earlier states are + // shared since they are read-only. for (auto&& th : threads) { th->run_custom_job([&]() { @@ -335,7 +335,7 @@ Thread* ThreadPool::get_best_thread() const { const bool newThreadInProvenLoss = newThreadScore != -VALUE_INFINITE && newThreadScore <= VALUE_TB_LOSS_IN_MAX_PLY; - // Note that we make sure not to pick a thread with truncated-PV for better viewer experience. + // We make sure not to pick a thread with truncated principal variation const bool betterVotingValue = thread_voting_value(th.get()) * int(newThreadPV.size() > 2) > thread_voting_value(bestThread) * int(bestThreadPV.size() > 2); @@ -363,8 +363,8 @@ Thread* ThreadPool::get_best_thread() const { } -// Start non-main threads -// Will be invoked by main thread after it has started searching +// Start non-main threads. +// Will be invoked by main thread after it has started searching. void ThreadPool::start_searching() { for (auto&& th : threads) @@ -374,7 +374,6 @@ void ThreadPool::start_searching() { // Wait for non-main threads - void ThreadPool::wait_for_search_finished() const { for (auto&& th : threads) diff --git a/src/types.h b/src/types.h index 10ad1fac..8a9400bb 100644 --- a/src/types.h +++ b/src/types.h @@ -137,9 +137,9 @@ enum Bound { BOUND_EXACT = BOUND_UPPER | BOUND_LOWER }; -// Value is used as an alias for int16_t, this is done to differentiate between -// a search value and any other integer value. The values used in search are always -// supposed to be in the range (-VALUE_NONE, VALUE_NONE] and should not exceed this range. +// Value is used as an alias for int, this is done to differentiate between a search +// value and any other integer value. The values used in search are always supposed +// to be in the range (-VALUE_NONE, VALUE_NONE] and should not exceed this range. 
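// Aside (not part of the patch): a minimal self-contained sketch of the parking pattern
// the thread.cpp comments above describe: a job is handed over under a mutex, and the
// worker blocks on a condition variable whenever it has nothing to do. Names are
// illustrative, not Stockfish's actual members.

#include <condition_variable>
#include <functional>
#include <mutex>

struct ParkedThread {
    std::mutex              mtx;
    std::condition_variable cv;
    std::function<void()>   job;
    bool                    quit = false;

    void run_custom_job(std::function<void()> f) {
        {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [&] { return job == nullptr; });  // previous job must have been taken
            job = std::move(f);
        }
        cv.notify_one();
    }

    void idle_loop() {
        while (true)
        {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [&] { return quit || job != nullptr; });  // parked here while idle
            if (quit)
                return;
            auto f = std::move(job);
            job    = nullptr;
            lk.unlock();
            cv.notify_one();  // lets run_custom_job() publish the next job
            f();
        }
    }
};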
using Value = int; constexpr Value VALUE_ZERO = 0; @@ -187,15 +187,20 @@ constexpr Value PieceValue[PIECE_NB] = { using Depth = int; enum : int { - // The following DEPTH_ constants are used for TT entries and QS movegen stages. In regular search, - // TT depth is literal: the search depth (effort) used to make the corresponding TT value. - // In qsearch, however, TT entries only store the current QS movegen stage (which should thus compare + // The following DEPTH_ constants are used for transposition table entries + // and quiescence search move generation stages. In regular search, the + // depth stored in the transposition table is literal: the search depth + // (effort) used to make the corresponding transposition table value. In + // quiescence search, however, the transposition table entries only store + // the current quiescence move generation stage (which should thus compare // lower than any regular search depth). DEPTH_QS_CHECKS = 0, DEPTH_QS_NORMAL = -1, - // For TT entries where no searching at all was done (whether regular or qsearch) we use - // _UNSEARCHED, which should thus compare lower than any QS or regular depth. _ENTRY_OFFSET is used - // only for the TT entry occupancy check (see tt.cpp), and should thus be lower than _UNSEARCHED. + // For transposition table entries where no searching at all was done + // (whether regular or qsearch) we use DEPTH_UNSEARCHED, which should thus + // compare lower than any quiescence or regular depth. DEPTH_ENTRY_OFFSET + // is used only for the transposition table entry occupancy check (see tt.cpp), + // and should thus be lower than DEPTH_UNSEARCHED. DEPTH_UNSEARCHED = -2, DEPTH_ENTRY_OFFSET = -3 }; @@ -356,9 +361,10 @@ enum MoveType { // bit 14-15: special move flag: promotion (1), en passant (2), castling (3) // NOTE: en passant bit is set only when a pawn can be captured // -// Special cases are Move::none() and Move::null(). We can sneak these in because in -// any normal move destination square is always different from origin square -// while Move::none() and Move::null() have the same origin and destination square. +// Special cases are Move::none() and Move::null(). We can sneak these in because +// in any normal move the destination square and origin square are always different, +// but Move::none() and Move::null() have the same origin and destination square. + class Move { public: Move() = default;
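// Aside (not part of the patch): a toy version of the 16-bit layout the comment above
// describes, showing why Move::none() and Move::null() can be told apart from real moves:
// their origin and destination squares coincide. Bits 12-15 (promotion piece type and
// special-move flag) are left out to keep the sketch short.

#include <cstdint>

struct ToyMove {
    std::uint16_t data;

    constexpr ToyMove(unsigned from, unsigned to) :
        data(std::uint16_t((from << 6) | to)) {}   // bits 6-11: origin, bits 0-5: destination

    constexpr unsigned from_sq() const { return (data >> 6) & 0x3F; }
    constexpr unsigned to_sq() const { return data & 0x3F; }

    static constexpr ToyMove none() { return ToyMove(0, 0); }  // data == 0
    static constexpr ToyMove null() { return ToyMove(1, 1); }  // data == 65, as in Stockfish

    constexpr bool is_ok() const { return from_sq() != to_sq(); }  // rejects none() and null()
};

static_assert(!ToyMove::none().is_ok() && !ToyMove::null().is_ok());
static_assert(ToyMove(12, 28).is_ok());  // a normal move from square 12 to square 28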