/* constant_fast_div.h - third_party/github.com/ridiculousfish/libdivide - Git at Google */

 /*
 * When dividing by a known compile time constant, the division can be replaced
 * by a multiply+shift operation. GCC will do this automatically,
 * *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
 *
 * So on an 8-bit system, 16-bit divides will *NOT* be optimised.
 *
 * The macros here manually apply the multiply+shift operation for 16-bit numbers.
 *
 * Testing on an AtMega2560, -O3 optimizations:
 *   Performance improvement of 85% to 90%+ speed up (division by non-powers of 2)
 *   Zero increase in RAM usage
 *   Average of 25 bytes Flash used per call site
 *     Be careful calling this in a loop with aggressive loop unrolling!
 *
 * Note: testing of the multiply+shift technique on 8-bit division showed a
 * slight slow down over native code on AtMega2560. So the 8 bit equivalent
 * macros have not been included
 */

 #pragma once
 #include "libdivide.h"
 #include "u16_ldparams.h"
 #include "s16_ldparams.h"

 // Two-level token pasting: CONCAT's arguments are macro-expanded first,
 // then CAT_HELPER joins the results into a single token with ##.
 #define CAT_HELPER(a, b) a ## b
 #define CONCAT(A, B) CAT_HELPER(A, B)

 // GCC will optimise division by a power of 2
 // So allow that.
 //
 // Evaluates to 1 when denom equals one of -2, -4, ... -16384, otherwise 0.
 // denom is parenthesised at every use so that an expression argument is
 // compared as a whole instead of being split by operator precedence.
 #define S16_ISPOW2_NEG(denom) \
  ((denom)==-2 || \
   (denom)==-4 || \
   (denom)==-8 || \
   (denom)==-16 || \
   (denom)==-32 || \
   (denom)==-64 || \
   (denom)==-128 || \
   (denom)==-256 || \
   (denom)==-512 || \
   (denom)==-1024 || \
   (denom)==-2048 || \
   (denom)==-4096 || \
   (denom)==-8192 || \
   (denom)==-16384)
 // Evaluates to 1 when denom equals one of 2, 4, ... 16384, otherwise 0.
 // denom is parenthesised at every use so that an expression argument is
 // compared as a whole instead of being split by operator precedence.
 #define S16_ISPOW2_POS(denom) \
  ((denom)==2 || \
   (denom)==4 || \
   (denom)==8 || \
   (denom)==16 || \
   (denom)==32 || \
   (denom)==64 || \
   (denom)==128 || \
   (denom)==256 || \
   (denom)==512 || \
   (denom)==1024 || \
   (denom)==2048 || \
   (denom)==4096 || \
   (denom)==8192 || \
   (denom)==16384)
 // U16_ISPOW2: true for the positive powers of two accepted by
 // S16_ISPOW2_POS, plus 32768 (which only fits the unsigned range).
 // S16_ISPOW2: true for either sign, as decided by S16_ISPOW2_POS and
 // S16_ISPOW2_NEG.
 #define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || (denom)==32768)
 #define S16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || S16_ISPOW2_NEG(denom))

 // Qualifier placed in front of libdivide calls: expands to nothing when
 // compiled as C and to `libdivide::` when compiled as C++.
 #if !defined(__cplusplus)
 #define LIB_DIV_NAMESPACE
 #else
 #define LIB_DIV_NAMESPACE libdivide::
 #endif

 /*
 * Wrapper for *unsigned* 16-bit DIVISION. The divisor must be a compile time
 * constant.
 * E.g. FAST_DIV16U(value, 100)
 *
 * d is token-pasted into the names U16LD_DENOM_<d>_MAGIC and
 * U16LD_DENOM_<d>_MORE, so it must expand to a token that can form part of
 * an identifier (a plain positive integer literal).
 *
 * Power-of-two divisors (see U16_ISPOW2) use the native divide; all other
 * divisors go through libdivide_u16_do_raw(). Only one arm of the
 * conditional is evaluated, so a is evaluated once.
 *
 * a and d are parenthesised in the native divide so that an expression
 * argument such as x + 1 is divided as a whole.
 */
 #define U16_MAGIC(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MAGIC)
 #define U16_MORE(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MORE)
 #define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_u16_do_raw((a), U16_MAGIC(d), U16_MORE(d)))

 /*
 * Wrapper for *signed* 16-bit DIVISION by a *POSITIVE* compile time constant.
 * E.g. FAST_DIV16(-value, 777)
 *
 * This only works for positive parameters :-(
 * A negative number results in a hyphen in the macro name, which is not allowed
 * (d is token-pasted into S16LD_DENOM_<d>_MAGIC and S16LD_DENOM_<d>_MORE).
 *
 * Power-of-two divisors (see S16_ISPOW2) use the native divide; all other
 * divisors go through libdivide_s16_do_raw(). Only one arm of the
 * conditional is evaluated, so a is evaluated once.
 *
 * a and d are parenthesised in the native divide so that an expression
 * argument such as x + 1 is divided as a whole.
 */
 #define S16_MAGIC(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MAGIC)
 #define S16_MORE(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MORE)
 #define FAST_DIV16(a, d) (S16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC(d), S16_MORE(d)))

 /*
 * Wrapper for *signed* 16-bit DIVISION by a *NEGATIVE* compile time constant.
 * E.g. FAST_DIV16_NEG(-value, 777) // <-- It's converted to negative. Really.
 *
 * The divisor is passed as its positive magnitude and negated here:
 * a negative number results in a hyphen in the macro name, which is not
 * allowed (d is token-pasted into S16LD_DENOM_MINUS_<d>_MAGIC and
 * S16LD_DENOM_MINUS_<d>_MORE).
 *
 * Power-of-two magnitudes (see S16_ISPOW2) use the native divide by -(d);
 * all other divisors go through libdivide_s16_do_raw(). Only one arm of the
 * conditional is evaluated, so a is evaluated once.
 *
 * a and d are parenthesised in the native divide so that an expression
 * argument such as x + 1 is divided as a whole.
 */
 #define S16_MAGIC_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MAGIC)
 #define S16_MORE_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MORE)
 #define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? (a)/-(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC_NEG(d), S16_MORE_NEG(d)))

 /*
 * Wrapper for *unsigned* 16-bit MODULUS. The divisor must be a compile time
 * constant.
 * E.g. FAST_MOD16U(value, 6)
 *
 * Computed as a - (a / d) * d, with the division done by FAST_DIV16U.
 * NOTE: a appears twice in the expansion, so it is evaluated twice; do not
 * pass an argument with side effects (e.g. i++).
 * a and d are parenthesised so that an expression argument such as x + 1
 * is used as a whole in the subtraction, the division and the multiply.
 */
 #define FAST_MOD16U(a, d) ((a) - (FAST_DIV16U((a), d) * (d)))
	/*
	* When dividing by a known compile time constant, the division can be replaced
	* by a multiply+shift operation. GCC will do this automatically,
	* BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER.
	*
	* So on an 8-bit system, 16-bit divides will NOT be optimised.
	*
	* The macros here manually apply the multiply+shift operation for 16-bit numbers.
	*
	* Testing on an AtMega2560, -O3 optimizations:
	* Performance improvement of 85% to 90%+ speed up (division by non-powers of 2)
	* Zero increase in RAM usage
	* Average of 25 bytes Flash used per call site
	* Be careful calling this in a loop with aggressive loop unrolling!
	*
	* Note: testing of the multiply+shift technique on 8-bit division showed a
	* slight slow down over native code on AtMega2560. So the 8 bit equivalent
	* macros have not been included
	*/

	#pragma once
	#include "libdivide.h"
	#include "u16_ldparams.h"
	#include "s16_ldparams.h"

	// Two-level token pasting: CONCAT's arguments are macro-expanded first,
	// then CAT_HELPER joins the results into a single token with ##.
	#define CAT_HELPER(a, b) a ## b
	#define CONCAT(A, B) CAT_HELPER(A, B)

	// GCC will optimise division by a power of 2
	// So allow that.
	//
	// Evaluates to 1 when denom equals one of -2, -4, ... -16384, otherwise 0.
	// denom is parenthesised at every use so that an expression argument is
	// compared as a whole instead of being split by operator precedence.
	#define S16_ISPOW2_NEG(denom) \
	((denom)==-2 || \
	(denom)==-4 || \
	(denom)==-8 || \
	(denom)==-16 || \
	(denom)==-32 || \
	(denom)==-64 || \
	(denom)==-128 || \
	(denom)==-256 || \
	(denom)==-512 || \
	(denom)==-1024 || \
	(denom)==-2048 || \
	(denom)==-4096 || \
	(denom)==-8192 || \
	(denom)==-16384)
	// Evaluates to 1 when denom equals one of 2, 4, ... 16384, otherwise 0.
	// denom is parenthesised at every use so that an expression argument is
	// compared as a whole instead of being split by operator precedence.
	#define S16_ISPOW2_POS(denom) \
	((denom)==2 || \
	(denom)==4 || \
	(denom)==8 || \
	(denom)==16 || \
	(denom)==32 || \
	(denom)==64 || \
	(denom)==128 || \
	(denom)==256 || \
	(denom)==512 || \
	(denom)==1024 || \
	(denom)==2048 || \
	(denom)==4096 || \
	(denom)==8192 || \
	(denom)==16384)
	// U16_ISPOW2: true for the positive powers of two accepted by
	// S16_ISPOW2_POS, plus 32768 (which only fits the unsigned range).
	// S16_ISPOW2: true for either sign, as decided by S16_ISPOW2_POS and
	// S16_ISPOW2_NEG.
	#define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || (denom)==32768)
	#define S16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || S16_ISPOW2_NEG(denom))

	// Qualifier placed in front of libdivide calls: expands to nothing when
	// compiled as C and to `libdivide::` when compiled as C++.
	#if !defined(__cplusplus)
	#define LIB_DIV_NAMESPACE
	#else
	#define LIB_DIV_NAMESPACE libdivide::
	#endif

	/*
	* Wrapper for unsigned 16-bit DIVISION. The divisor must be a compile time
	* constant.
	* E.g. FAST_DIV16U(value, 100)
	*
	* d is token-pasted into the names U16LD_DENOM_<d>_MAGIC and
	* U16LD_DENOM_<d>_MORE, so it must expand to a token that can form part of
	* an identifier (a plain positive integer literal).
	*
	* Power-of-two divisors (see U16_ISPOW2) use the native divide; all other
	* divisors go through libdivide_u16_do_raw(). Only one arm of the
	* conditional is evaluated, so a is evaluated once.
	*
	* a and d are parenthesised in the native divide so that an expression
	* argument such as x + 1 is divided as a whole.
	*/
	#define U16_MAGIC(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MAGIC)
	#define U16_MORE(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MORE)
	#define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_u16_do_raw((a), U16_MAGIC(d), U16_MORE(d)))

	/*
	* Wrapper for signed 16-bit DIVISION by a POSITIVE compile time constant.
	* E.g. FAST_DIV16(-value, 777)
	*
	* This only works for positive parameters :-(
	* A negative number results in a hyphen in the macro name, which is not allowed
	* (d is token-pasted into S16LD_DENOM_<d>_MAGIC and S16LD_DENOM_<d>_MORE).
	*
	* Power-of-two divisors (see S16_ISPOW2) use the native divide; all other
	* divisors go through libdivide_s16_do_raw(). Only one arm of the
	* conditional is evaluated, so a is evaluated once.
	*
	* a and d are parenthesised in the native divide so that an expression
	* argument such as x + 1 is divided as a whole.
	*/
	#define S16_MAGIC(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MAGIC)
	#define S16_MORE(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MORE)
	#define FAST_DIV16(a, d) (S16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC(d), S16_MORE(d)))

	/*
	* Wrapper for signed 16-bit DIVISION by a NEGATIVE compile time constant.
	* E.g. FAST_DIV16_NEG(-value, 777) // <-- It's converted to negative. Really.
	*
	* The divisor is passed as its positive magnitude and negated here:
	* a negative number results in a hyphen in the macro name, which is not
	* allowed (d is token-pasted into S16LD_DENOM_MINUS_<d>_MAGIC and
	* S16LD_DENOM_MINUS_<d>_MORE).
	*
	* Power-of-two magnitudes (see S16_ISPOW2) use the native divide by -(d);
	* all other divisors go through libdivide_s16_do_raw(). Only one arm of the
	* conditional is evaluated, so a is evaluated once.
	*
	* a and d are parenthesised in the native divide so that an expression
	* argument such as x + 1 is divided as a whole.
	*/
	#define S16_MAGIC_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MAGIC)
	#define S16_MORE_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MORE)
	#define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? (a)/-(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC_NEG(d), S16_MORE_NEG(d)))

	/*
	* Wrapper for unsigned 16-bit MODULUS. The divisor must be a compile time
	* constant.
	* E.g. FAST_MOD16U(value, 6)
	*
	* Computed as a - (a / d) * d, with the division done by FAST_DIV16U.
	* NOTE: a appears twice in the expansion, so it is evaluated twice; do not
	* pass an argument with side effects (e.g. i++).
	* a and d are parenthesised so that an expression argument such as x + 1
	* is used as a whole in the subtraction, the division and the multiply.
	*/
	#define FAST_MOD16U(a, d) ((a) - (FAST_DIV16U((a), d) * (d)))