// Copyright 2019 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <stdint.h>

// When the compiler doesn't inline __builtin_popcount{,l}, it calls these.
extern "C" int __popcountsi2(uint32_t val);
extern "C" int __popcountdi2(uint64_t val);

| extern "C" int __popcountsi2(uint32_t val) { |
| // Implement a log2(bits) popcount, with a few tricks to save some |
| // instructions. |
| // |
| // The general idea here is to simply add up the bits in the word in |
| // parallel, storing intermediate results in different bit positions in |
| // the word as we go, and eventually arriving at a sum which is the |
| // popcount of the word. |
| // |
| // For example, consider an 8 bit integer written as |
| // |
| // b7 b6 b5 b4 b3 b2 b1 b0 |
| // |
  // Each bit in the word represents a partial sum; added together, these
  // partial sums produce our result. We can compute...
  //
  // (
  //   b7 b6 b5 b4 b3 b2 b1 b0   &
  //    0  1  0  1  0  1  0  1   =
  //    0 b6  0 b4  0 b2  0 b0
  // ) + (
  //   b7 b6 b5 b4 b3 b2 b1 b0   >> 1 =
  //    0 b7 b6 b5 b4 b3 b2 b1   &
  //    0  1  0  1  0  1  0  1   =
  //    0 b7  0 b5  0 b3  0 b1
  // ) =
  //   b7+b6 b5+b4 b3+b2 b1+b0
  //
  // And now we have 4 partial sums, each of which takes two bits of
  // storage instead of one. Repeating this process with appropriate
  // masks and shifts will eventually give us the answer we are looking
  // for. After the next step, we will have partial sums which require 3
  // bits of storage, but aligned to 4 bit boundaries; so...
  //
  // 0 (b7+b6+b5+b4) 0 (b3+b2+b1+b0)
  //
  // After the last step, we will have the final sum stored in the low 4
  // bits of the word, with guaranteed 0s for the upper bits...
  //
  // 0 0 0 0 (b7+b6+b5+b4+b3+b2+b1+b0)
  //
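  // As a concrete illustration of this scheme, tracing the byte 0b10110101
  // (popcount 5) through the three stages:
  //
  //   10 11 01 01  ->  01 10 01 01   (2-bit partial sums 1, 2, 1, 1)
  //                ->  0011 0010     (4-bit partial sums 3, 2)
  //                ->  0000 0101     (= 5)
  //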
  // While this is the general idea, it turns out that only step #2 below
  // needs to follow this exact process. For each of the other steps (there
  // are 5 in total for a 32 bit integer), we can shave a few cycles by
  // taking advantage of particular properties of each step. See the
  // comments below.
  constexpr uint32_t mask1 = 0x55555555;  // the low bit of every 2-bit pair
  constexpr uint32_t mask2 = 0x33333333;  // the low 2 bits of every nibble
  constexpr uint32_t mask3 = 0x0F0F0F0F;  // the low nibble of every byte
  constexpr uint32_t mult1 = 0x01010101;  // sums the 4 byte counts into the top byte

  // Step 1:
  // While we could compute (val & mask) + ((val >> 1) & mask), we can
  // actually save one of the mask operations by subtracting instead of
  // adding. Consider the following truth table for a single 2-bit pair
  // (X marks bits belonging to the neighboring pair)...
  //
  //  In  | >>1 | (>>1) & mask | In - ((>>1) & mask)
  //  X00 | XX0 |     X00      |        X00
  //  X01 | XX0 |     X00      |        X01
  //  X10 | XX1 |     X01      |        X01
  //  X11 | XX1 |     X01      |        X10
  //
  val = val - ((val >> 1) & mask1);
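  // For example, with val = 0xDEADBEEF (popcount 24), this step computes
  // 0xDEADBEEF - 0x45545555 = 0x9959699A, where each 2-bit pair now holds
  // the number of bits that were set in that pair.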

  // Step 2:
  // This is simply the operation described in the overview: adding pairs
  // of 2 bit partial sums to produce 3 bit results aligned on 4 bit
  // boundaries (where the MSB of each 4 bit nibble is guaranteed to be 0).
  val = (val & mask2) + ((val >> 2) & mask2);
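  // Continuing the 0xDEADBEEF example: this step produces 0x33233334, where
  // each nibble now holds the number of bits originally set in that nibble
  // (3, 3, 2, 3, 3, 3, 3, 4).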

  // Step 3:
  // Again, we can save a mask operation here. This time, it is because
  // we know that all bit positions ((i * 4) + 3) are guaranteed to be
  // zero. We can simply shift by 4 and add the partial sums. Any
  // overflow will go into the ((i * 4) + 3) position and not interfere
  // with any of the other partial sums.
  val = (val + (val >> 4)) & mask3;
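  // Continuing the 0xDEADBEEF example: this step produces 0x06050607, where
  // each byte now holds the number of bits originally set in that byte
  // (6, 5, 6, 7).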

  // Step 4 + 5:
  // Finally, we can combine steps 4 and 5 using a multiply. We have 4
  // remaining partial sums, each of which is contained in 4 bits and
  // aligned to 8 bit boundaries (with 4 bits of zero in-between each
  // sum). Now, we have enough space that if we could perform all 4 sums
  // at once, we know that the result would fit in (at most) 6 bits, which
  // easily fits in our 8 bits of space.
  //
  // At its heart, multiplying is the equivalent of repeated shifting and
  // conditional adding. IOW - if I multiply by 0101b, it is basically
  // the same as saying "shift X left by 0, then add that to shift X left
  // by 2" (because bits 0 and 2 are the only bits set).
  //
  // So, we have 0x0A0B0C0D and want to compute (A+B+C+D). Multiplying
  // our register by 0x01010101 basically does this for us. We are
  // summing our register shifted left by 0 (A), 8 (B), 16 (C) and 24 (D),
  // which puts A+B+C+D into the upper 8 bits of the register. The lower
  // 24 bits are junk, but we know (because of the 0s which separate A-D)
  // that this junk is not going to overflow into our result. All we need
  // to do is right shift by 24, and we are done.
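  // Finishing the 0xDEADBEEF example: 0x06050607 * 0x01010101 = 0x18120D07.
  // The top byte is 0x18 = 24 = 6 + 5 + 6 + 7, and the shift right by 24
  // discards the junk in the lower three bytes.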
  return static_cast<int>((val * mult1) >> 24);
}

| extern "C" int __popcountdi2(uint64_t val) { |
| // See notes above for how this works. The 64 bit version of this is |
| // identical to the 32 bit version, the only difference is that during |
| // the final stage, we use the multiply trick to combine steps 4, 5, and |
| // 6 to sum all 8 4-bit partial sums at once, fitting the result into a |
| // 7 bit value. |
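  //
  // For example, for val = 0xFFFFFFFFFFFFFFFF the first three steps leave
  // 0x0808080808080808 (8 set bits in every byte); multiplying by mult1
  // puts 8 * 8 = 64 = 0x40 in the top byte, and the shift right by 56
  // returns 64.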
  constexpr uint64_t mask1 = 0x5555555555555555;
  constexpr uint64_t mask2 = 0x3333333333333333;
  constexpr uint64_t mask3 = 0x0F0F0F0F0F0F0F0F;
  constexpr uint64_t mult1 = 0x0101010101010101;

  val = val - ((val >> 1) & mask1);
  val = (val & mask2) + ((val >> 2) & mask2);
  val = (val + (val >> 4)) & mask3;
  return static_cast<int>((val * mult1) >> 56);
}
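
// A few spot checks one could run from a standalone host-side program
// (illustrative only, not part of this file's build):
//
//   assert(__popcountsi2(0u) == 0);
//   assert(__popcountsi2(0xDEADBEEFu) == 24);
//   assert(__popcountsi2(0xFFFFFFFFu) == 32);
//   assert(__popcountdi2(0xFFFFFFFFFFFFFFFFull) == 64);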