Add support for Windows large pages
For users who set the required privilege "Lock Pages in Memory", large pages will be enabled automatically (see Readme.md). This expert setting may improve speed by 5% to 30%, depending on the hardware, the number of threads, and the hash size; the gains are larger for big hashes, many threads, and NUMA systems. If the operating system cannot allocate large pages (easier after a reboot), the default allocation is used automatically. The engine log provides details.

closes https://github.com/official-stockfish/Stockfish/pull/2656
fixes https://github.com/official-stockfish/Stockfish/issues/2619

No functional change
This commit is contained in:
parent 86ee4eb84d
commit d4763424d2

5 changed files with 120 additions and 2 deletions
Readme.md (+25 −1)

```diff
@@ -42,7 +42,7 @@ Currently, Stockfish has the following UCI options:
     this equal to the number of CPU cores available.
 
   * #### Hash
-    The size of the hash table in MB.
+    The size of the hash table in MB. It is recommended to set Hash after setting Threads.
 
   * #### Clear Hash
     Clear the hash table.
@@ -138,6 +138,30 @@ more compact than Nalimov tablebases, while still storing all information
 needed for optimal play and in addition being able to take into account
 the 50-move rule.
 
+## Large Pages
+
+Stockfish supports large pages on Linux and Windows. Large pages make
+the hash access more efficient, improving the engine speed, especially
+on large hash sizes. Typical increases are 5..10% in terms of nps, but
+speed increases up to 30% have been measured. The support is
+automatic. Stockfish attempts to use large pages when available and
+will fall back to regular memory allocation when this is not the case.
+
+### Support on Linux
+
+Large page support on Linux is obtained by the Linux kernel
+transparent huge pages functionality. Typically, transparent huge pages
+are already enabled and no configuration is needed.
+
+### Support on Windows
+
+The use of large pages requires "Lock Pages in Memory" privilege. See
+[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows)
+on how to enable this privilege. Logout/login may be needed
+afterwards. Due to memory fragmentation, it may not always be
+possible to allocate large pages even when enabled. A reboot
+might alleviate this problem. To determine whether large pages
+are in use, see the engine log.
+
 ## Compiling Stockfish yourself from the sources
```
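The Linux paragraph in the hunk above assumes transparent huge pages are enabled. As a side note (not part of the commit), here is a minimal sketch for checking the kernel's THP mode: the sysfs path is the standard one on THP-capable kernels, while the bracket parsing is this sketch's own assumption.

```cpp
// check_thp.cpp - print the Linux transparent huge pages mode.
// Illustrative sketch only; not part of this commit.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    std::ifstream f("/sys/kernel/mm/transparent_hugepage/enabled");
    std::string modes;
    if (!f || !std::getline(f, modes)) {
        std::cout << "THP sysfs entry not found (kernel without THP support?)\n";
        return 1;
    }
    // The active mode is shown in brackets, e.g. "always [madvise] never".
    std::cout << "THP modes: " << modes << '\n';
    std::cout << (modes.find("[never]") == std::string::npos
                      ? "Transparent huge pages are available.\n"
                      : "Transparent huge pages are disabled.\n");
    return 0;
}
```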
src/main.cpp (+1 −0)

```diff
@@ -49,6 +49,7 @@ int main(int argc, char* argv[]) {
 
   UCI::loop(argc, argv);
 
+  TT.resize(0);
   Threads.set(0);
   return 0;
 }
```

The new TT.resize(0) call frees the transposition table through the new aligned_ttmem_free path (see the tt.cpp hunk below) before the engine exits, since the table is no longer obtained with a plain malloc().
src/misc.cpp (+85 −0)

```diff
@@ -309,6 +309,69 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
   return mem;
 }
 
+#elif defined(_WIN64)
+
+static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+
+  HANDLE hProcessToken { };
+  LUID luid { };
+  void* mem = nullptr;
+
+  const size_t largePageSize = GetLargePageMinimum();
+  if (!largePageSize)
+      return nullptr;
+
+  // We need SeLockMemoryPrivilege, so try to enable it for the process
+  if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
+      return nullptr;
+
+  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid))
+  {
+      TOKEN_PRIVILEGES tp { };
+      TOKEN_PRIVILEGES prevTp { };
+      DWORD prevTpLen = 0;
+
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Luid = luid;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+      // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds,
+      // we still need to query GetLastError() to ensure that the privileges were actually obtained...
+      if (AdjustTokenPrivileges(
+              hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) &&
+          GetLastError() == ERROR_SUCCESS)
+      {
+          // round up size to full pages and allocate
+          allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
+          mem = VirtualAlloc(
+              NULL, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);
+
+          // privilege no longer needed, restore previous state
+          AdjustTokenPrivileges(hProcessToken, FALSE, &prevTp, 0, NULL, NULL);
+      }
+  }
+
+  CloseHandle(hProcessToken);
+
+  return mem;
+}
+
+void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+
+  // try to allocate large pages
+  mem = aligned_ttmem_alloc_large_pages(allocSize);
+  if (mem)
+      sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
+  else
+      sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
+
+  // fall back to regular, page aligned, allocation if necessary
+  if (!mem)
+      mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+
+  return mem;
+}
+
 #else
 
 void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
```
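One line in this hunk carries the page arithmetic: `allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);` rounds the request up to the next multiple of the large page size, which VirtualAlloc with MEM_LARGE_PAGES requires. The mask trick relies on GetLargePageMinimum() returning a power of two (2 MiB on typical x64 systems). A small standalone check of the arithmetic, with 2 MiB standing in for the real page size:

```cpp
// round_up.cpp - demonstrates the round-up used in aligned_ttmem_alloc_large_pages().
// Standalone sketch; 2 MiB stands in for GetLargePageMinimum().
#include <cassert>
#include <cstddef>

static size_t round_up(size_t allocSize, size_t largePageSize) {
    // Valid only for power-of-two page sizes: adding (page - 1) carries into
    // the next page unless the size is already aligned, and the mask then
    // clears the low bits.
    return (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
}

int main() {
    const size_t page = 2 * 1024 * 1024;              // 2 MiB large page
    assert(round_up(1, page)        == page);         // tiny request -> one full page
    assert(round_up(page, page)     == page);         // exact multiple is unchanged
    assert(round_up(page + 1, page) == 2 * page);     // one byte over -> next page
    return 0;
}
```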
```diff
@@ -322,6 +385,28 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
 
 #endif
 
+/// aligned_ttmem_free will free the previously allocated ttmem
+#if defined(_WIN64)
+
+void aligned_ttmem_free(void* mem) {
+
+  if (!VirtualFree(mem, 0, MEM_RELEASE))
+  {
+      DWORD err = GetLastError();
+      std::cerr << "Failed to free transposition table. Error code: 0x" <<
+                   std::hex << err << std::dec << std::endl;
+      exit(EXIT_FAILURE);
+  }
+}
+
+#else
+
+void aligned_ttmem_free(void *mem) {
+  free(mem);
+}
+
+#endif
+
 namespace WinProcGroup {
```

Note the VirtualFree contract here: with MEM_RELEASE the size argument must be 0 and the pointer must be the base address originally returned by VirtualAlloc, which is why aligned_ttmem_alloc hands the raw base address back through mem.
src/misc.h (+1 −0)

```diff
@@ -34,6 +34,7 @@ const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* aligned_ttmem_alloc(size_t size, void*& mem);
+void aligned_ttmem_free(void* mem);
 
 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
```
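The header now exposes allocation and deallocation as a pair. The contract, visible in the tt.cpp hunk below, is that `mem` receives the raw base address to pass to `aligned_ttmem_free()` later, while the return value is the aligned pointer the caller actually uses. A hypothetical caller, with stand-in declarations so the sketch is self-contained:

```cpp
// Hypothetical usage of the pair declared in misc.h; Entry is a stand-in type.
#include <cstddef>

void* aligned_ttmem_alloc(size_t size, void*& mem);  // defined in misc.cpp
void  aligned_ttmem_free(void* mem);                 // defined in misc.cpp

struct Entry { unsigned char bytes[32]; };

void example(size_t count) {
    void* mem = nullptr;  // receives the raw base address
    Entry* table = static_cast<Entry*>(aligned_ttmem_alloc(count * sizeof(Entry), mem));
    if (!table)
        return;
    // ... read and write table[0..count-1] ...
    aligned_ttmem_free(mem);  // release via the matching deallocator
}
```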
src/tt.cpp (+8 −1)

```diff
@@ -63,7 +63,14 @@ void TranspositionTable::resize(size_t mbSize) {
 
   Threads.main()->wait_for_search_finished();
 
-  free(mem);
+  if (mem)
+      aligned_ttmem_free(mem);
+
+  if (!mbSize)
+  {
+      mem = nullptr;
+      return;
+  }
 
   clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
   table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
```
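resize(0) is the new shutdown path used by main.cpp above: it frees the old table through aligned_ttmem_free() and returns before allocating anything. Not part of the commit, but the same ownership pattern can be sketched as a small RAII guard (all names here are hypothetical):

```cpp
// Hypothetical RAII wrapper over the alloc/free pair; not in this commit.
#include <cstddef>

void* aligned_ttmem_alloc(size_t size, void*& mem);  // from misc.h
void  aligned_ttmem_free(void* mem);                 // from misc.h

class TTMemory {
    void* mem = nullptr;      // raw base address, owned by this guard
    void* aligned = nullptr;  // aligned pointer handed to users
public:
    explicit TTMemory(size_t bytes) { aligned = aligned_ttmem_alloc(bytes, mem); }
    ~TTMemory() { if (mem) aligned_ttmem_free(mem); }
    TTMemory(const TTMemory&) = delete;             // owning, non-copyable
    TTMemory& operator=(const TTMemory&) = delete;
    void* get() const { return aligned; }
};
```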