// Copyright 2019 The Fuchsia Authors | |

// | |

// Use of this source code is governed by a MIT-style | |

// license that can be found in the LICENSE file or at | |

// https://opensource.org/licenses/MIT | |

#include <stdint.h> | |

// When the compiler doesn't inline __builtin_popcount{,l}, it calls these. | |

extern "C" int __popcountsi2(uint32_t val); | |

extern "C" int __popcountdi2(uint64_t val); | |

extern "C" int __popcountsi2(uint32_t val) { | |

// Implement a log2(bits) popcount, with a few tricks to save some | |

// instructions. | |

// | |

// The general idea here is to simply add up the bits in the word in | |

// parallel, storing intermediate results in different bit positions in | |

// the word as we go, and eventually arriving at a sum which is the | |

// popcount of the word. | |

// | |

// For example, consider an 8 bit integer written as | |

// | |

// b7 b6 b5 b4 b3 b2 b1 b0 | |

// | |

// Each of the bits in the word represent a partial sum that, when added | |

// together, produce our result. We can compute... | |

// | |

// ( | |

// b7 b6 b5 b4 b3 b2 b1 b0 & | |

// 0 1 0 1 0 1 0 1 = | |

// 0 b6 0 b4 0 b2 0 b0 | |

// ) + ( | |

// b7 b6 b5 b4 b3 b2 b1 b0 >> 1 = | |

// 0 b7 b6 b5 b4 b3 b2 b1 & | |

// 0 1 0 1 0 1 0 1 = | |

// 0 b7 0 b5 0 b3 0 b1 | |

// ) = | |

// b7+b6 b5+b4 b3+b2 b1+b0 | |

// | |

// And now we have 4 partial sums, each of which takes two bits of | |

// storage instead of one. Repeating this process with appropriate | |

// masks and shifts will eventually give us the answer we are looking | |

// for. After the next step, we will have partial sums which require 3 | |

// bits of storage, but aligned to 4 bit boundaries; so... | |

// | |

// 0 (b7+b6+b5+b4) 0 (b3+b2+b1+b0) | |

// | |

// After the last step, we will get the final sum stored in 4 bits, with | |

// guaranteed 0s for the upper bits. | |

// aligned to ... | |

// | |

// 0 0 0 0 (b7+b6+b5+b4+b3+b2+b1+b0) | |

// | |

// While this is the general idea, it turns out that we only need to | |

// follow this exact process during step #2. For each of the other | |

// steps (there will be 5 for a 32 bit integer) we can shave a few | |

// cycles by taking advantage of some of the particular properties of | |

// each step. See the comments below. | |

constexpr uint32_t mask1 = 0x55555555; | |

constexpr uint32_t mask2 = 0x33333333; | |

constexpr uint32_t mask3 = 0x0F0F0F0F; | |

constexpr uint32_t mult1 = 0x01010101; | |

// Step 1: | |

// While we could compute (val & mask) + ((val >> 1) & mask), we can | |

// actually save one of the mask operations by subtracting instead of | |

// adding. Consider the following truth table... | |

// | |

// In | >>1 | & | - | | |

// X00 | XX0 | X00 | X00 | | |

// X01 | XX0 | X00 | X01 | | |

// X10 | XX1 | X01 | X01 | | |

// X11 | XX1 | X01 | X10 | | |

// | |

val = val - ((val >> 1) & mask1); | |

// Step 2: | |

// This is simply the operation described in the overview, adding a | |

// bunch of 2 bit partials to produce a 3 bit result, but which is | |

// aligned on 4 bit boundaries (where the MSB of each 4 bit nibble is | |

// guaranteed to be 0) | |

val = (val & mask2) + ((val >> 2) & mask2); | |

// Step 3: | |

// Again, we can save a mask operation here. This time, it is because | |

// the we know that all bit positions ((i * 4) + 3) are guaranteed to be | |

// zero. We can simply shift by 4 and add the partial sums. Any | |

// overflow will go into the ((i * 4) + 3) position and not interfere | |

// with any of the other partial sums. | |

val = (val + (val >> 4)) & mask3; | |

// Step 4 + 5: | |

// Finally, we can combine steps 4 and 5 using a multiply. We have 4 | |

// remaining partial sums, each of which is contained in 4 bits and | |

// aligned to 8 bit boundaries (with 4 bits of zero in-between each | |

// sum). Now, we have enough space that if we could perform all 4 sums | |

// at once, we know that the result would fit in (at most) 6 bits, which | |

// easily fits in our 8 bits of space. | |

// | |

// At its heart, multiplying is the equivalent of repeated shifting, and | |

// conditional adding. IOW - if I multiply by 0101b, it is basically | |

// the same as saying "shift X left by 0, then add that to shift X left | |

// by 2" (because bits 0 and bits 2 are the only bits set). | |

// | |

// So, we have, 0x0A0B0C0D, and want to compute (A+B+C+D). Multiplying | |

// our register by 0x01010101 basically does this for us. We are | |

// summing our register shifted left by 0 (A), 8 (B), 16 (C) and 24 (D), | |

// which puts A+B+C+D into the upper 8 bits of the register. The lower | |

// 24 bits are junk, but we know (because of the 0s which separate A-D) | |

// that this junk is not going to overflow into our result. All we need | |

// to do is right shift by 24, and we are done. | |

return (val * mult1) >> 24; | |

} | |

extern "C" int __popcountdi2(uint64_t val) { | |

// See notes above for how this works. The 64 bit version of this is | |

// identical to the 32 bit version, the only difference is that during | |

// the final stage, we use the multiply trick to combine steps 4, 5, and | |

// 6 to sum all 8 4-bit partial sums at once, fitting the result into a | |

// 7 bit value. | |

constexpr uint64_t mask1 = 0x5555555555555555; | |

constexpr uint64_t mask2 = 0x3333333333333333; | |

constexpr uint64_t mask3 = 0x0F0F0F0F0F0F0F0F; | |

constexpr uint64_t mult1 = 0x0101010101010101; | |

val = val - ((val >> 1) & mask1); | |

val = (val & mask2) + ((val >> 2) & mask2); | |

val = (val + (val >> 4)) & mask3; | |

return static_cast<int>((val * mult1) >> 56); | |

} |