| #ifndef JEMALLOC_INTERNAL_BIT_UTIL_H |
| #define JEMALLOC_INTERNAL_BIT_UTIL_H |
| |
| #include "jemalloc/internal/assert.h" |
| |
| /* Sanity check. */ |
| #if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \ |
| || !defined(JEMALLOC_INTERNAL_FFS) |
| # error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure |
| #endif |
| |
| /* |
| * Unlike the builtins and posix ffs functions, our ffs requires a non-zero |
| * input, and returns the position of the lowest bit set (as opposed to the |
| * posix versions, which return 1 larger than that position and use a return |
| * value of zero as a sentinel. This tends to simplify logic in callers, and |
| * allows for consistency with the builtins we build fls on top of. |
| */ |
| static inline unsigned |
| ffs_llu(unsigned long long x) { |
| util_assume(x != 0); |
| return JEMALLOC_INTERNAL_FFSLL(x) - 1; |
| } |
| |
| static inline unsigned |
| ffs_lu(unsigned long x) { |
| util_assume(x != 0); |
| return JEMALLOC_INTERNAL_FFSL(x) - 1; |
| } |
| |
| static inline unsigned |
| ffs_u(unsigned x) { |
| util_assume(x != 0); |
| return JEMALLOC_INTERNAL_FFS(x) - 1; |
| } |
| |
| #define DO_FLS_SLOW(x, suffix) do { \ |
| util_assume(x != 0); \ |
| x |= (x >> 1); \ |
| x |= (x >> 2); \ |
| x |= (x >> 4); \ |
| x |= (x >> 8); \ |
| x |= (x >> 16); \ |
| if (sizeof(x) > 4) { \ |
| /* \ |
| * If sizeof(x) is 4, then the expression "x >> 32" \ |
| * will generate compiler warnings even if the code \ |
| * never executes. This circumvents the warning, and \ |
| * gets compiled out in optimized builds. \ |
| */ \ |
| int constant_32 = sizeof(x) * 4; \ |
| x |= (x >> constant_32); \ |
| } \ |
| x++; \ |
| if (x == 0) { \ |
| return 8 * sizeof(x) - 1; \ |
| } \ |
| return ffs_##suffix(x) - 1; \ |
} while (0)
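
/*
 * Worked example of the smearing above, for x == 0x28 (0b101000): the
 * shifts propagate the high bit downward, giving x == 0x3f; x++ then
 * yields 0x40, whose lowest set bit is bit 6, so we return 6 - 1 == 5,
 * the index of the highest bit set in the original value.  If x was
 * all-ones, the increment wraps to 0 and we return the type width minus
 * one directly.
 */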
| |
| static inline unsigned |
| fls_llu_slow(unsigned long long x) { |
| DO_FLS_SLOW(x, llu); |
| } |
| |
| static inline unsigned |
| fls_lu_slow(unsigned long x) { |
| DO_FLS_SLOW(x, lu); |
| } |
| |
| static inline unsigned |
| fls_u_slow(unsigned x) { |
| DO_FLS_SLOW(x, u); |
| } |
| |
| #undef DO_FLS_SLOW |
| |
| #ifdef JEMALLOC_HAVE_BUILTIN_CLZ |
| static inline unsigned |
| fls_llu(unsigned long long x) { |
| util_assume(x != 0); |
| /* |
| * Note that the xor here is more naturally written as subtraction; the |
| * last bit set is the number of bits in the type minus the number of |
| * leading zero bits. But GCC implements that as: |
| * bsr edi, edi |
| * mov eax, 31 |
| * xor edi, 31 |
| * sub eax, edi |
| * If we write it as xor instead, then we get |
| * bsr eax, edi |
| * as desired. |
| */ |
| return (8 * sizeof(x) - 1) ^ __builtin_clzll(x); |
| } |
| |
| static inline unsigned |
| fls_lu(unsigned long x) { |
| util_assume(x != 0); |
| return (8 * sizeof(x) - 1) ^ __builtin_clzl(x); |
| } |
| |
| static inline unsigned |
| fls_u(unsigned x) { |
| util_assume(x != 0); |
| return (8 * sizeof(x) - 1) ^ __builtin_clz(x); |
| } |
| #elif defined(_MSC_VER) |
| |
| #if LG_SIZEOF_PTR == 3 |
| #define DO_BSR64(bit, x) _BitScanReverse64(&bit, x) |
| #else |
| /* |
| * This never actually runs; we're just dodging a compiler error for the |
| * never-taken branch where sizeof(void *) == 8. |
| */ |
| #define DO_BSR64(bit, x) bit = 0; unreachable() |
| #endif |
| |
| #define DO_FLS(x) do { \ |
| if (x == 0) { \ |
| return 8 * sizeof(x); \ |
| } \ |
| unsigned long bit; \ |
| if (sizeof(x) == 4) { \ |
| _BitScanReverse(&bit, (unsigned)x); \ |
| return (unsigned)bit; \ |
| } \ |
| if (sizeof(x) == 8 && sizeof(void *) == 8) { \ |
| DO_BSR64(bit, x); \ |
| return (unsigned)bit; \ |
| } \ |
| if (sizeof(x) == 8 && sizeof(void *) == 4) { \ |
| /* Dodge a compiler warning, as above. */ \ |
| int constant_32 = sizeof(x) * 4; \ |
| if (_BitScanReverse(&bit, \ |
| (unsigned)(x >> constant_32))) { \ |
| return 32 + (unsigned)bit; \ |
| } else { \ |
| _BitScanReverse(&bit, (unsigned)x); \ |
| return (unsigned)bit; \ |
| } \ |
| } \ |
| unreachable(); \ |
| } while (0) |
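
/*
 * Illustrative trace of the 64-bit-value, 32-bit-pointer path: for
 * x == 0x0000000300000001, the high-word scan finds bit 1 of 0x3 and we
 * return 32 + 1 == 33.  For x == 0x0000000080000001, the high word is
 * zero, so _BitScanReverse reports failure and the low-word scan
 * returns 31.
 */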
| |
| static inline unsigned |
| fls_llu(unsigned long long x) { |
| DO_FLS(x); |
| } |
| |
| static inline unsigned |
| fls_lu(unsigned long x) { |
| DO_FLS(x); |
| } |
| |
| static inline unsigned |
| fls_u(unsigned x) { |
| DO_FLS(x); |
| } |
| |
| #undef DO_FLS |
| #undef DO_BSR64 |
| #else |
| |
| static inline unsigned |
| fls_llu(unsigned long long x) { |
| return fls_llu_slow(x); |
| } |
| |
| static inline unsigned |
| fls_lu(unsigned long x) { |
| return fls_lu_slow(x); |
| } |
| |
| static inline unsigned |
| fls_u(unsigned x) { |
| return fls_u_slow(x); |
| } |
| #endif |
| |
| #if LG_SIZEOF_LONG_LONG > 3 |
| # error "Haven't implemented popcount for 16-byte ints." |
| #endif |
| |
#define DO_POPCOUNT(x, type) do {					\
	/*								\
	 * Algorithm from an old AMD optimization reference manual.	\
	 * We put a little more work than you might expect into the	\
	 * no-intrinsic case, since we only support the GCC-intrinsic	\
	 * spelling of popcount (for now).  Detecting whether the	\
	 * popcount builtin is actually usable in MSVC is nontrivial.	\
	 */								\
| \ |
| type bmul = (type)0x0101010101010101ULL; \ |
| \ |
| /* \ |
| * Replace each 2 bits with the sideways sum of the original \ |
| * values. 0x5 = 0b0101. \ |
| * \ |
| * You might expect this to be: \ |
| * x = (x & 0x55...) + ((x >> 1) & 0x55...). \ |
| * That costs an extra mask relative to this, though. \ |
| */ \ |
| x = x - ((x >> 1) & (0x55U * bmul)); \ |
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */\
| x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \ |
| /* \ |
| * Replace each 8 bits with their sideways sum. Note that we \ |
| * can't overflow within each 4-bit sum here, so we can skip \ |
| * the initial mask. \ |
| */ \ |
| x = (x + (x >> 4)) & (bmul * 0x0FU); \ |
| /* \ |
| * None of the partial sums in this multiplication (viewed in \ |
| * base-256) can overflow into the next digit. So the least \ |
| * significant byte of the product will be the least \ |
| * significant byte of the original value, the second least \ |
| * significant byte will be the sum of the two least \ |
| * significant bytes of the original value, and so on. \ |
| * Importantly, the high byte will be the byte-wise sum of all \ |
| * the bytes of the original value. \ |
| */ \ |
| x = x * bmul; \ |
| x >>= ((sizeof(x) - 1) * 8); \ |
| return (unsigned)x; \ |
} while (0)
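
/*
 * Worked example of the SWAR steps above for the byte 0x6e (0b01101110,
 * five bits set): the 2-bit step gives 0x59 (pair counts 1, 1, 2, 1),
 * the 4-bit step gives 0x23 (nibble counts 2 and 3), and the 8-bit step
 * gives 0x05.  The final multiply-and-shift sums the per-byte counts
 * into the top byte and extracts it.
 */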
| |
| static inline unsigned |
| popcount_u_slow(unsigned bitmap) { |
| DO_POPCOUNT(bitmap, unsigned); |
| } |
| |
| static inline unsigned |
| popcount_lu_slow(unsigned long bitmap) { |
| DO_POPCOUNT(bitmap, unsigned long); |
| } |
| |
| static inline unsigned |
| popcount_llu_slow(unsigned long long bitmap) { |
| DO_POPCOUNT(bitmap, unsigned long long); |
| } |
| |
| #undef DO_POPCOUNT |
| |
| static inline unsigned |
| popcount_u(unsigned bitmap) { |
| #ifdef JEMALLOC_INTERNAL_POPCOUNT |
| return JEMALLOC_INTERNAL_POPCOUNT(bitmap); |
| #else |
| return popcount_u_slow(bitmap); |
| #endif |
| } |
| |
| static inline unsigned |
| popcount_lu(unsigned long bitmap) { |
| #ifdef JEMALLOC_INTERNAL_POPCOUNTL |
| return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); |
| #else |
| return popcount_lu_slow(bitmap); |
| #endif |
| } |
| |
| static inline unsigned |
| popcount_llu(unsigned long long bitmap) { |
| #ifdef JEMALLOC_INTERNAL_POPCOUNTLL |
| return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap); |
| #else |
| return popcount_llu_slow(bitmap); |
| #endif |
| } |
| |
| /* |
| * Clears first unset bit in bitmap, and returns |
| * place of bit. bitmap *must not* be 0. |
| */ |
| |
| static inline size_t |
| cfs_lu(unsigned long* bitmap) { |
| util_assume(*bitmap != 0); |
| size_t bit = ffs_lu(*bitmap); |
| *bitmap ^= ZU(1) << bit; |
| return bit; |
| } |
| |
| static inline unsigned |
| ffs_zu(size_t x) { |
| #if LG_SIZEOF_PTR == LG_SIZEOF_INT |
| return ffs_u(x); |
| #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG |
| return ffs_lu(x); |
| #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG |
| return ffs_llu(x); |
| #else |
| #error No implementation for size_t ffs() |
| #endif |
| } |
| |
| static inline unsigned |
| fls_zu(size_t x) { |
| #if LG_SIZEOF_PTR == LG_SIZEOF_INT |
| return fls_u(x); |
| #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG |
| return fls_lu(x); |
| #elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG |
| return fls_llu(x); |
| #else |
| #error No implementation for size_t fls() |
| #endif |
| } |
| |
| |
| static inline unsigned |
| ffs_u64(uint64_t x) { |
| #if LG_SIZEOF_LONG == 3 |
| return ffs_lu(x); |
| #elif LG_SIZEOF_LONG_LONG == 3 |
| return ffs_llu(x); |
| #else |
| #error No implementation for 64-bit ffs() |
| #endif |
| } |
| |
| static inline unsigned |
| fls_u64(uint64_t x) { |
| #if LG_SIZEOF_LONG == 3 |
| return fls_lu(x); |
| #elif LG_SIZEOF_LONG_LONG == 3 |
| return fls_llu(x); |
| #else |
| #error No implementation for 64-bit fls() |
| #endif |
| } |
| |
static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}
| |
static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}
| |
| static inline uint64_t |
| pow2_ceil_u64(uint64_t x) { |
| if (unlikely(x <= 1)) { |
| return x; |
| } |
| size_t msb_on_index = fls_u64(x - 1); |
| /* |
| * Range-check; it's on the callers to ensure that the result of this |
| * call won't overflow. |
| */ |
| assert(msb_on_index < 63); |
| return 1ULL << (msb_on_index + 1); |
| } |
| |
| static inline uint32_t |
| pow2_ceil_u32(uint32_t x) { |
| if (unlikely(x <= 1)) { |
| return x; |
| } |
| size_t msb_on_index = fls_u32(x - 1); |
| /* As above. */ |
| assert(msb_on_index < 31); |
| return 1U << (msb_on_index + 1); |
| } |
| |
| /* Compute the smallest power of 2 that is >= x. */ |
| static inline size_t |
| pow2_ceil_zu(size_t x) { |
| #if (LG_SIZEOF_PTR == 3) |
| return pow2_ceil_u64(x); |
| #else |
| return pow2_ceil_u32(x); |
| #endif |
| } |
| |
| static inline unsigned |
| lg_floor(size_t x) { |
| util_assume(x != 0); |
| #if (LG_SIZEOF_PTR == 3) |
| return fls_u64(x); |
| #else |
| return fls_u32(x); |
| #endif |
| } |
| |
| static inline unsigned |
| lg_ceil(size_t x) { |
| return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1); |
| } |
| |
/* Compile-time versions of lg_floor and lg_ceil. */
| #define LG_FLOOR_1(x) 0 |
| #define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1)) |
| #define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2)) |
| #define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4)) |
| #define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8)) |
| #define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16)) |
| #define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32)) |
| #if LG_SIZEOF_PTR == 2 |
| # define LG_FLOOR(x) LG_FLOOR_32((x)) |
| #else |
| # define LG_FLOOR(x) LG_FLOOR_64((x)) |
| #endif |
| |
| #define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1)) |
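
/*
 * Unlike the inline functions above, these expand to integer constant
 * expressions, so they can appear in places like array bounds.  For
 * example (with a hypothetical buffer), "unsigned char
 * counts[LG_CEIL(4096) + 1]" declares 13 slots, since
 * LG_CEIL(4096) == 12.
 */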
| |
| #endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */ |