#ifdef __linux__
#include <stdlib.h>    // aligned_alloc
#include <sys/mman.h>  // madvise, MADV_HUGEPAGE
#endif

/// aligned_ttmem_alloc() returns suitably aligned memory and, where possible,
/// asks the kernel to back it with large pages.
///
/// allocSize : number of usable bytes requested by the caller.
/// mem       : out-parameter, receives the pointer that must later be passed
///             to free(). It is set to nullptr when the allocation fails.
///
/// Returns the aligned pointer to use for the table, or nullptr on failure
/// (allocation failure, or a request so large that adding the alignment
/// slack would overflow size_t).
/// With c++17 some of this functionality can be simplified.
#ifdef __linux__

void* aligned_ttmem_alloc(size_t allocSize, void** mem) {

  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
  constexpr size_t maxSize   = static_cast<size_t>(-1);

  // Rounding up to a multiple of alignment must not wrap around, otherwise
  // a much smaller block than requested would be allocated.
  if (allocSize > maxSize - (alignment - 1))
  {
      *mem = nullptr;
      return nullptr;
  }

  // aligned_alloc() requires the size to be a multiple of the alignment
  const size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
  *mem = aligned_alloc(alignment, size);

#ifdef MADV_HUGEPAGE
  // Purely advisory: the result is intentionally ignored, but there is no
  // point in advising the kernel about a failed allocation.
  if (*mem)
      madvise(*mem, size, MADV_HUGEPAGE);
#endif

  return *mem;
}

#else

void* aligned_ttmem_alloc(size_t allocSize, void** mem) {

  constexpr size_t alignment = 64; // assumed cache line size
  constexpr size_t maxSize   = static_cast<size_t>(-1);

  // The extra alignment slack must not overflow the requested size
  if (allocSize > maxSize - (alignment - 1))
  {
      *mem = nullptr;
      return nullptr;
  }

  // Allocate some extra space so the start can be rounded up to alignment
  *mem = malloc(allocSize + alignment - 1);
  if (!*mem)
      return nullptr;

  // Round the raw address up to the next multiple of alignment
  const uintptr_t aligned = (uintptr_t(*mem) + alignment - 1) & ~uintptr_t(alignment - 1);
  return reinterpret_cast<void*>(aligned);
}

#endif
--git a/src/tt.cpp b/src/tt.cpp index 0b4a59de..080d3a6b 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -63,11 +63,10 @@ void TranspositionTable::resize(size_t mbSize) { Threads.main()->wait_for_search_finished(); - clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); - free(mem); - mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1); + clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); + table = static_cast(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem)); if (!mem) { std::cerr << "Failed to allocate " << mbSize @@ -75,7 +74,6 @@ void TranspositionTable::resize(size_t mbSize) { exit(EXIT_FAILURE); } - table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1)); clear(); } diff --git a/src/tt.h b/src/tt.h index 98b054d3..142afd90 100644 --- a/src/tt.h +++ b/src/tt.h @@ -57,24 +57,22 @@ private: }; -/// A TranspositionTable consists of a power of 2 number of clusters and each -/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry -/// contains information of exactly one position. The size of a cluster should -/// divide the size of a cache line size, to ensure that clusters never cross -/// cache lines. This ensures best cache performance, as the cacheline is -/// prefetched, as soon as possible. +/// A TranspositionTable is an array of Cluster, of size clusterCount. Each +/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry +/// contains information on exactly one position. The size of a Cluster should +/// divide the size of a cache line for best performance, +/// as the cacheline is prefetched when possible. 
class TranspositionTable { - static constexpr int CacheLineSize = 64; static constexpr int ClusterSize = 3; struct Cluster { TTEntry entry[ClusterSize]; - char padding[2]; // Align to a divisor of the cache line size + char padding[2]; // Pad to 32 bytes }; - static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect"); + static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size"); public: ~TranspositionTable() { free(mem); }