blob: 579f8dfe84edd1ffc9f9347704e44ad8400e5150 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <stdint.h>
// When the compiler doesn't inline __builtin_popcount{,l}, it calls these.
extern "C" int __popcountsi2(uint32_t val);
extern "C" int __popcountdi2(uint64_t val);
extern "C" int __popcountsi2(uint32_t val) {
// Implement a log2(bits) popcount, with a few tricks to save some
// instructions.
//
// The general idea here is to simply add up the bits in the word in
// parallel, storing intermediate results in different bit positions in
// the word as we go, and eventually arriving at a sum which is the
// popcount of the word.
//
// For example, consider an 8 bit integer written as
//
// b7 b6 b5 b4 b3 b2 b1 b0
//
// Each of the bits in the word represent a partial sum that, when added
// together, produce our result. We can compute...
//
// (
// b7 b6 b5 b4 b3 b2 b1 b0 &
// 0 1 0 1 0 1 0 1 =
// 0 b6 0 b4 0 b2 0 b0
// ) + (
// b7 b6 b5 b4 b3 b2 b1 b0 >> 1 =
// 0 b7 b6 b5 b4 b3 b2 b1 &
// 0 1 0 1 0 1 0 1 =
// 0 b7 0 b5 0 b3 0 b1
// ) =
// b7+b6 b5+b4 b3+b2 b1+b0
//
// And now we have 4 partial sums, each of which takes two bits of
// storage instead of one. Repeating this process with appropriate
// masks and shifts will eventually give us the answer we are looking
// for. After the next step, we will have partial sums which require 3
// bits of storage, but aligned to 4 bit boundaries; so...
//
// 0 (b7+b6+b5+b4) 0 (b3+b2+b1+b0)
//
// After the last step, we will get the final sum stored in 4 bits, with
// guaranteed 0s for the upper bits.
// aligned to ...
//
// 0 0 0 0 (b7+b6+b5+b4+b3+b2+b1+b0)
//
// While this is the general idea, it turns out that we only need to
// follow this exact process during step #2. For each of the other
// steps (there will be 5 for a 32 bit integer) we can shave a few
// cycles by taking advantage of some of the particular properties of
// each step. See the comments below.
constexpr uint32_t mask1 = 0x55555555;
constexpr uint32_t mask2 = 0x33333333;
constexpr uint32_t mask3 = 0x0F0F0F0F;
constexpr uint32_t mult1 = 0x01010101;
// Step 1:
// While we could compute (val & mask) + ((val >> 1) & mask), we can
// actually save one of the mask operations by subtracting instead of
// adding. Consider the following truth table...
//
// In | >>1 | & | - |
// X00 | XX0 | X00 | X00 |
// X01 | XX0 | X00 | X01 |
// X10 | XX1 | X01 | X01 |
// X11 | XX1 | X01 | X10 |
//
val = val - ((val >> 1) & mask1);
// Step 2:
// This is simply the operation described in the overview, adding a
// bunch of 2 bit partials to produce a 3 bit result, but which is
// aligned on 4 bit boundaries (where the MSB of each 4 bit nibble is
// guaranteed to be 0)
val = (val & mask2) + ((val >> 2) & mask2);
// Step 3:
// Again, we can save a mask operation here. This time, it is because
// the we know that all bit positions ((i * 4) + 3) are guaranteed to be
// zero. We can simply shift by 4 and add the partial sums. Any
// overflow will go into the ((i * 4) + 3) position and not interfere
// with any of the other partial sums.
val = (val + (val >> 4)) & mask3;
// Step 4 + 5:
// Finally, we can combine steps 4 and 5 using a multiply. We have 4
// remaining partial sums, each of which is contained in 4 bits and
// aligned to 8 bit boundaries (with 4 bits of zero in-between each
// sum). Now, we have enough space that if we could perform all 4 sums
// at once, we know that the result would fit in (at most) 6 bits, which
// easily fits in our 8 bits of space.
//
// At its heart, multiplying is the equivalent of repeated shifting, and
// conditional adding. IOW - if I multiply by 0101b, it is basically
// the same as saying "shift X left by 0, then add that to shift X left
// by 2" (because bits 0 and bits 2 are the only bits set).
//
// So, we have, 0x0A0B0C0D, and want to compute (A+B+C+D). Multiplying
// our register by 0x01010101 basically does this for us. We are
// summing our register shifted left by 0 (A), 8 (B), 16 (C) and 24 (D),
// which puts A+B+C+D into the upper 8 bits of the register. The lower
// 24 bits are junk, but we know (because of the 0s which separate A-D)
// that this junk is not going to overflow into our result. All we need
// to do is right shift by 24, and we are done.
return (val * mult1) >> 24;
}
extern "C" int __popcountdi2(uint64_t val) {
// See notes above for how this works. The 64 bit version of this is
// identical to the 32 bit version, the only difference is that during
// the final stage, we use the multiply trick to combine steps 4, 5, and
// 6 to sum all 8 4-bit partial sums at once, fitting the result into a
// 7 bit value.
constexpr uint64_t mask1 = 0x5555555555555555;
constexpr uint64_t mask2 = 0x3333333333333333;
constexpr uint64_t mask3 = 0x0F0F0F0F0F0F0F0F;
constexpr uint64_t mult1 = 0x0101010101010101;
val = val - ((val >> 1) & mask1);
val = (val & mask2) + ((val >> 2) & mask2);
val = (val + (val >> 4)) & mask3;
return static_cast<int>((val * mult1) >> 56);
}