mirror of
https://github.com/sockspls/badfish
synced 2025-05-01 09:13:08 +00:00
Tweaks to bitcount functions
Seems even a bit faster now (almost 1% in the 32-bit case). No functional change.
This commit is contained in:
parent
1cb2722c95
commit
9204a60dbb
1 changed file with 20 additions and 26 deletions
|
@ -33,8 +33,8 @@ enum BitCountType {
|
|||
};
|
||||
|
||||
/// Determine at compile time the best popcount<> specialization according to whether
|
||||
/// platform is 32 or 64 bits, to the maximum number of nonzero bits to count or
|
||||
/// use hardware popcnt instruction when available.
|
||||
/// platform is 32 or 64 bits, to the maximum number of nonzero bits to count
|
||||
/// and whether a hardware popcnt instruction is available.
|
||||
const BitCountType Full = HasPopCnt ? CNT_HW_POPCNT : Is64Bit ? CNT_64 : CNT_32;
|
||||
const BitCountType Max15 = HasPopCnt ? CNT_HW_POPCNT : Is64Bit ? CNT_64_MAX15 : CNT_32_MAX15;
|
||||
|
||||
|
@ -44,44 +44,38 @@ template<BitCountType> inline int popcount(Bitboard);
|
|||
|
||||
template<>
|
||||
inline int popcount<CNT_64>(Bitboard b) {
|
||||
b -= ((b>>1) & 0x5555555555555555ULL);
|
||||
b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL);
|
||||
b = ((b>>4) + b) & 0x0F0F0F0F0F0F0F0FULL;
|
||||
b *= 0x0101010101010101ULL;
|
||||
return int(b >> 56);
|
||||
b -= (b >> 1) & 0x5555555555555555ULL;
|
||||
b = ((b >> 2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL);
|
||||
b = ((b >> 4) + b) & 0x0F0F0F0F0F0F0F0FULL;
|
||||
return (b * 0x0101010101010101ULL) >> 56;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline int popcount<CNT_64_MAX15>(Bitboard b) {
|
||||
b -= (b>>1) & 0x5555555555555555ULL;
|
||||
b = ((b>>2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL);
|
||||
b *= 0x1111111111111111ULL;
|
||||
return int(b >> 60);
|
||||
b -= (b >> 1) & 0x5555555555555555ULL;
|
||||
b = ((b >> 2) & 0x3333333333333333ULL) + (b & 0x3333333333333333ULL);
|
||||
return (b * 0x1111111111111111ULL) >> 60;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline int popcount<CNT_32>(Bitboard b) {
|
||||
unsigned w = unsigned(b >> 32), v = unsigned(b);
|
||||
v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits
|
||||
w -= (w >> 1) & 0x55555555;
|
||||
v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits
|
||||
w = ((w >> 2) & 0x33333333) + (w & 0x33333333);
|
||||
v = ((v >> 4) + v) & 0x0F0F0F0F; // 0-8 in 8 bits
|
||||
v += (((w >> 4) + w) & 0x0F0F0F0F); // 0-16 in 8 bits
|
||||
v *= 0x01010101; // mul is fast on amd procs
|
||||
return int(v >> 24);
|
||||
v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits
|
||||
w -= (w >> 1) & 0x55555555;
|
||||
v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits
|
||||
w = ((w >> 2) & 0x33333333) + (w & 0x33333333);
|
||||
v = ((v >> 4) + v + (w >> 4) + w) & 0x0F0F0F0F;
|
||||
return (v * 0x01010101) >> 24;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline int popcount<CNT_32_MAX15>(Bitboard b) {
|
||||
unsigned w = unsigned(b >> 32), v = unsigned(b);
|
||||
v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits
|
||||
w -= (w >> 1) & 0x55555555;
|
||||
v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits
|
||||
w = ((w >> 2) & 0x33333333) + (w & 0x33333333);
|
||||
v += w; // 0-8 in 4 bits
|
||||
v *= 0x11111111;
|
||||
return int(v >> 28);
|
||||
v -= (v >> 1) & 0x55555555; // 0-2 in 2 bits
|
||||
w -= (w >> 1) & 0x55555555;
|
||||
v = ((v >> 2) & 0x33333333) + (v & 0x33333333); // 0-4 in 4 bits
|
||||
w = ((w >> 2) & 0x33333333) + (w & 0x33333333);
|
||||
return ((v + w) * 0x11111111) >> 28;
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
Loading…
Add table
Reference in a new issue