bn_fast_s_mp_sqr.c - third_party/dropbear - Git at Google

 #include <tommath.h>
 #ifdef BN_FAST_S_MP_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
  * LibTomMath is a library that provides multiple-precision
  * integer arithmetic as well as number theoretic functionality.
  *
  * The library was designed directly after the MPI library by
  * Michael Fromberger but has been written from scratch with
  * additional optimizations in place.
  *
  * The library is free for all purposes without any express
  * guarantee it works.
  *
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */

 /* fast squaring
  *
  * This is the comba method where the columns of the product
  * are computed first then the carries are computed.  This
  * has the effect of making a very simple inner loop that
  * is executed the most
  *
  * W2 represents the outer products and W the inner.
  *
  * A further optimization is made because the inner
  * products are of the form "A * B * 2".  The *2 part does
  * not need to be computed until the end which is good
  * because 64-bit shifts are slow!
  *
  * Based on Algorithm 14.16 on pp.597 of HAC.
  *
  */
 /* the gist of squaring...

 you do like mult except the offset of the tmpx [one that starts closer to zero]
 can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
 (ty-tx) so that it never happens.  You double all those you add in the inner loop

 After that loop you do the squares and add them in.

 Remove W2 and don't memset W

 */

 /* Compute b = a * a using column-wise (comba) accumulation.
  *
  * Each output column ix is the sum of the digit products a[i]*a[j] with
  * i + j == ix.  The cross terms (i != j) occur twice, so only the i < j
  * half is accumulated and then doubled; even columns additionally get
  * the square term a[ix/2]^2.
  *
  * All columns are staged in the local array W and copied into b only
  * after every read of a->dp has finished.
  *
  * NOTE(review): W holds MP_WARRAY digits and nothing here checks
  * 2 * a->used against that bound -- presumably the caller guarantees
  * it; confirm before calling this routine directly.
  *
  * Returns MP_OKAY on success, or the error code from mp_grow if the
  * destination could not be enlarged.
  */
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
   int       olduse, res, pa, ix, iz;
   mp_digit  W[MP_WARRAY], *tmpx;
   mp_word   carry;

   /* grow the destination as required */
   pa = a->used + a->used;
   if (b->alloc < pa) {
     if ((res = mp_grow (b, pa)) != MP_OKAY) {
       return res;
     }
   }

   /* Produce exactly pa output digits.  The loop used to run to
    * ix == pa as well; that extra column was never copied to b (only
    * pa digits are written out below), yet computing it read
    * a->dp[a->used] for the square term and shifted a negative value.
    */
   carry = 0;
   for (ix = 0; ix < pa; ix++) {
       int      tx, ty, iy;
       mp_word  acc;
       mp_digit *tmpy;

       /* clear the column accumulator */
       acc = 0;

       /* get offsets into the two bignums */
       ty = MIN(a->used-1, ix);
       tx = ix - ty;

       /* setup temp aliases */
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;

       /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);

       /* now for squaring tx can never equal ty
        * we halve the distance since they approach at a rate of 2x
        * and we have to round because odd cases need to be executed
        */
       iy = MIN(iy, (ty-tx+1)>>1);

       /* accumulate the cross products a[tx+k] * a[ty-k] */
       for (iz = 0; iz < iy; iz++) {
          acc += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
       }

       /* double the cross products and add the carry from the
        * previous column */
       acc = acc + acc + carry;

       /* even columns have the square term in them */
       if ((ix&1) == 0) {
          acc += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
       }

       /* store the low digit of the column */
       W[ix] = (mp_digit)(acc & MP_MASK);

       /* make next carry */
       carry = acc >> ((mp_word)DIGIT_BIT);
   }

   /* setup dest */
   olduse  = b->used;
   b->used = pa;

   {
     mp_digit *tmpb;
     tmpb = b->dp;
     for (ix = 0; ix < pa; ix++) {
       *tmpb++ = W[ix] & MP_MASK;
     }

     /* clear digits left over from the previous value of b */
     for (; ix < olduse; ix++) {
       *tmpb++ = 0;
     }
   }
   mp_clamp (b);
   return MP_OKAY;
 }
 #endif
	#include <tommath.h>
	#ifdef BN_FAST_S_MP_SQR_C
	/* LibTomMath, multiple-precision integer library -- Tom St Denis
	*
	* LibTomMath is a library that provides multiple-precision
	* integer arithmetic as well as number theoretic functionality.
	*
	* The library was designed directly after the MPI library by
	* Michael Fromberger but has been written from scratch with
	* additional optimizations in place.
	*
	* The library is free for all purposes without any express
	* guarantee it works.
	*
	* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
	*/

	/* fast squaring
	*
	* This is the comba method where the columns of the product
	* are computed first then the carries are computed. This
	* has the effect of making a very simple inner loop that
	* is executed the most
	*
	* W2 represents the outer products and W the inner.
	*
	* A further optimization is made because the inner
	* products are of the form "A * B * 2". The *2 part does
	* not need to be computed until the end which is good
	* because 64-bit shifts are slow!
	*
	* Based on Algorithm 14.16 on pp.597 of HAC.
	*
	*/
	/* the gist of squaring...

	you do like mult except the offset of the tmpx [one that starts closer to zero]
	can't equal the offset of tmpy. So basically you set up iy like before then you min it with
	(ty-tx) so that it never happens. You double all those you add in the inner loop

	After that loop you do the squares and add them in.

	Remove W2 and don't memset W

	*/

	/* Compute b = a * a using column-wise (comba) accumulation.
	 *
	 * Each output column ix is the sum of the digit products a[i]*a[j] with
	 * i + j == ix.  The cross terms (i != j) occur twice, so only the i < j
	 * half is accumulated and then doubled; even columns additionally get
	 * the square term a[ix/2]^2.
	 *
	 * All columns are staged in the local array W and copied into b only
	 * after every read of a->dp has finished.
	 *
	 * NOTE(review): W holds MP_WARRAY digits and nothing here checks
	 * 2 * a->used against that bound -- presumably the caller guarantees
	 * it; confirm before calling this routine directly.
	 *
	 * Returns MP_OKAY on success, or the error code from mp_grow if the
	 * destination could not be enlarged.
	 */
	int fast_s_mp_sqr (mp_int * a, mp_int * b)
	{
		int olduse, res, pa, ix, iz;
		mp_digit W[MP_WARRAY], *tmpx;
		mp_word carry;

		/* grow the destination as required */
		pa = a->used + a->used;
		if (b->alloc < pa) {
			if ((res = mp_grow (b, pa)) != MP_OKAY) {
				return res;
			}
		}

		/* Produce exactly pa output digits.  The loop used to run to
		 * ix == pa as well; that extra column was never copied to b (only
		 * pa digits are written out below), yet computing it read
		 * a->dp[a->used] for the square term and shifted a negative value.
		 */
		carry = 0;
		for (ix = 0; ix < pa; ix++) {
			int tx, ty, iy;
			mp_word acc;
			mp_digit *tmpy;

			/* clear the column accumulator */
			acc = 0;

			/* get offsets into the two bignums */
			ty = MIN(a->used-1, ix);
			tx = ix - ty;

			/* setup temp aliases */
			tmpx = a->dp + tx;
			tmpy = a->dp + ty;

			/* this is the number of times the loop will iterate, essentially
			 * while (tx++ < a->used && ty-- >= 0) { ... }
			 */
			iy = MIN(a->used-tx, ty+1);

			/* now for squaring tx can never equal ty
			 * we halve the distance since they approach at a rate of 2x
			 * and we have to round because odd cases need to be executed
			 */
			iy = MIN(iy, (ty-tx+1)>>1);

			/* accumulate the cross products a[tx+k] * a[ty-k]; both
			 * operands are dereferenced digits widened to mp_word before
			 * the multiply */
			for (iz = 0; iz < iy; iz++) {
				acc += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
			}

			/* double the cross products and add the carry from the
			 * previous column */
			acc = acc + acc + carry;

			/* even columns have the square term in them */
			if ((ix&1) == 0) {
				acc += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
			}

			/* store the low digit of the column */
			W[ix] = (mp_digit)(acc & MP_MASK);

			/* make next carry */
			carry = acc >> ((mp_word)DIGIT_BIT);
		}

		/* setup dest */
		olduse = b->used;
		b->used = pa;

		{
			mp_digit *tmpb;
			tmpb = b->dp;
			for (ix = 0; ix < pa; ix++) {
				*tmpb++ = W[ix] & MP_MASK;
			}

			/* clear digits left over from the previous value of b */
			for (; ix < olduse; ix++) {
				*tmpb++ = 0;
			}
		}
		mp_clamp (b);
		return MP_OKAY;
	}
	#endif