Added an option to build internal LibTomMath with faster div routine

At the cost of about 1 kB of additional binary size, the internal LibTomMath can be configured to include faster div routine to speed up DH and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y in .config.
2008-06-06 10:11:17 +03:00 · 2008-06-06 10:11:17 +03:00 · ec0205a87a
commit ec0205a87a
parent b6ab429402
3 changed files with 294 additions and 0 deletions
--- a/src/tls/libtommath.c
+++ b/src/tls/libtommath.c
@ -26,6 +26,15 @@
 #define BN_S_MP_SQR_C
 #define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
 				 * would require other than mp_reduce */
 #ifdef LTM_FAST_DIV
 /* Use faster div at the cost of about 1 kB */
 #define BN_MP_MUL_D_C
 #else /* LTM_FAST_DIV */
 #define BN_MP_DIV_SMALL
 #define BN_MP_INIT_MULTI_C
 #define BN_MP_CLEAR_MULTI_C
 #define BN_MP_ABS_C
 #endif /* LTM_FAST_DIV */
 #ifdef LTM_FAST_EXPTMOD
 /* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
@ -124,8 +133,12 @@ static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);
 static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
 #ifdef BN_MP_INIT_MULTI_C
 static int mp_init_multi(mp_int *mp, ...);
 #endif
 #ifdef BN_MP_CLEAR_MULTI_C
 static void mp_clear_multi(mp_int *mp, ...);
 #endif
 static int mp_lshd(mp_int * a, int b);
 static void mp_set(mp_int * a, mp_digit b);
 static void mp_clamp(mp_int * a);
@ -147,7 +160,9 @@ static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
 static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
 static int mp_grow(mp_int * a, int size);
 static int mp_cmp_mag(mp_int * a, mp_int * b);
 #ifdef BN_MP_ABS_C
 static int mp_abs(mp_int * a, mp_int * b);
 #endif
 static int mp_sqr(mp_int * a, mp_int * b);
 static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
 static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
@ -161,6 +176,9 @@ static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int
 #ifdef BN_FAST_S_MP_SQR_C
 static int fast_s_mp_sqr (mp_int * a, mp_int * b);
 #endif /* BN_FAST_S_MP_SQR_C */
 #ifdef BN_MP_MUL_D_C
 static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
 #endif /* BN_MP_MUL_D_C */
@ -1263,6 +1281,7 @@ static int mp_grow (mp_int * a, int size)
 }
 #ifdef BN_MP_ABS_C
 /* b = |a| 
 *
 * Simple function copies the input and fixes the sign to positive
@ -1283,6 +1302,7 @@ static int mp_abs (mp_int * a, mp_int * b)
  return MP_OKAY;
 }
 #endif
 /* set to a digit */
@ -1409,6 +1429,7 @@ static int mp_mul_2d (mp_int * a, int b, mp_int * c)
 }
 #ifdef BN_MP_INIT_MULTI_C
 static int mp_init_multi(mp_int *mp, ...) 
 {
    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
@ -1444,8 +1465,10 @@ static int mp_init_multi(mp_int *mp, ...)
    va_end(args);
    return res;                /* Assumed ok, if error flagged above. */
 }
 #endif
 #ifdef BN_MP_CLEAR_MULTI_C
 static void mp_clear_multi(mp_int *mp, ...) 
 {
    mp_int* next_mp = mp;
@ -1457,6 +1480,7 @@ static void mp_clear_multi(mp_int *mp, ...)
    }
    va_end(args);
 }
 #endif
 /* shift left a certain amount of digits */
@ -1564,6 +1588,8 @@ static int mp_mod_2d (mp_int * a, int b, mp_int * c)
 }
 #ifdef BN_MP_DIV_SMALL
 /* slower bit-bang division... also smaller */
 static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
@ -1632,6 +1658,206 @@ LBL_ERR:
   return res;
 }
 #else
 /* integer signed division. 
 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
 * HAC pp.598 Algorithm 14.20
 *
 * Note that the description in HAC is horribly 
 * incomplete.  For example, it doesn't consider 
 * the case where digits are removed from 'x' in 
 * the inner loop.  It also doesn't consider the 
 * case that y has fewer than three digits, etc..
 *
 * The overall algorithm is as described as 
 * 14.20 from HAC but fixed to treat these cases.
 */
 static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
  mp_int  q, x, y, t1, t2;
  int     res, n, t, i, norm, neg;
  /* is divisor zero ? */
  if (mp_iszero (b) == 1) {
    return MP_VAL;
  }
  /* if a < b then q=0, r = a */
  if (mp_cmp_mag (a, b) == MP_LT) {
    if (d != NULL) {
      res = mp_copy (a, d);
    } else {
      res = MP_OKAY;
    }
    if (c != NULL) {
      mp_zero (c);
    }
    return res;
  }
  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
    return res;
  }
  q.used = a->used + 2;
  if ((res = mp_init (&t1)) != MP_OKAY) {
    goto LBL_Q;
  }
  if ((res = mp_init (&t2)) != MP_OKAY) {
    goto LBL_T1;
  }
  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
    goto LBL_T2;
  }
  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
    goto LBL_X;
  }
  /* fix the sign */
  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
  x.sign = y.sign = MP_ZPOS;
  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
  norm = mp_count_bits(&y) % DIGIT_BIT;
  if (norm < (int)(DIGIT_BIT-1)) {
     norm = (DIGIT_BIT-1) - norm;
     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
       goto LBL_Y;
     }
     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
       goto LBL_Y;
     }
  } else {
     norm = 0;
  }
  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
  n = x.used - 1;
  t = y.used - 1;
  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
    goto LBL_Y;
  }
  while (mp_cmp (&x, &y) != MP_LT) {
    ++(q.dp[n - t]);
    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
      goto LBL_Y;
    }
  }
  /* reset y by shifting it back down */
  mp_rshd (&y, n - t);
  /* step 3. for i from n down to (t + 1) */
  for (i = n; i >= (t + 1); i--) {
    if (i > x.used) {
      continue;
    }
    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
    if (x.dp[i] == y.dp[t]) {
      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
    } else {
      mp_word tmp;
      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
      tmp |= ((mp_word) x.dp[i - 1]);
      tmp /= ((mp_word) y.dp[t]);
      if (tmp > (mp_word) MP_MASK)
        tmp = MP_MASK;
      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
    }
    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
             xi * b**2 + xi-1 * b + xi-2 
       do q{i-t-1} -= 1; 
    */
    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
    do {
      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
      /* find left hand */
      mp_zero (&t1);
      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
      t1.dp[1] = y.dp[t];
      t1.used = 2;
      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
        goto LBL_Y;
      }
      /* find right hand */
      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
      t2.dp[2] = x.dp[i];
      t2.used = 3;
    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
      goto LBL_Y;
    }
    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
      goto LBL_Y;
    }
    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
      goto LBL_Y;
    }
    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
    if (x.sign == MP_NEG) {
      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
        goto LBL_Y;
      }
      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
        goto LBL_Y;
      }
      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
        goto LBL_Y;
      }
      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
    }
  }
  /* now q is the quotient and x is the remainder 
   * [which we have to normalize] 
   */
  /* get sign before writing to c */
  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
  if (c != NULL) {
    mp_clamp (&q);
    mp_exch (&q, c);
    c->sign = neg;
  }
  if (d != NULL) {
    mp_div_2d (&x, norm, &x, NULL);
    mp_exch (&x, d);
  }
  res = MP_OKAY;
 LBL_Y:mp_clear (&y);
 LBL_X:mp_clear (&x);
 LBL_T2:mp_clear (&t2);
 LBL_T1:mp_clear (&t1);
 LBL_Q:mp_clear (&q);
  return res;
 }
 #endif
 #ifdef MP_LOW_MEM
   #define TAB_SIZE 32
@ -3092,3 +3318,64 @@ static int fast_s_mp_sqr (mp_int * a, mp_int * b)
  return MP_OKAY;
 }
 #endif
 #ifdef BN_MP_MUL_D_C
 /* multiply by a digit */
 static int
 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
  mp_digit u, *tmpa, *tmpc;
  mp_word  r;
  int      ix, res, olduse;
  /* make sure c is big enough to hold a*b */
  if (c->alloc < a->used + 1) {
    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
      return res;
    }
  }
  /* get the original destinations used count */
  olduse = c->used;
  /* set the sign */
  c->sign = a->sign;
  /* alias for a->dp [source] */
  tmpa = a->dp;
  /* alias for c->dp [dest] */
  tmpc = c->dp;
  /* zero carry */
  u = 0;
  /* compute columns */
  for (ix = 0; ix < a->used; ix++) {
    /* compute product and carry sum for this term */
    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
    /* mask off higher bits to get a single digit */
    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
    /* send carry into next iteration */
    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
  }
  /* store final carry [if any] and increment ix offset  */
  *tmpc++ = u;
  ++ix;
  /* now zero digits above the top */
  while (ix++ < olduse) {
     *tmpc++ = 0;
  }
  /* set used count */
  c->used = a->used + 1;
  mp_clamp(c);
  return MP_OKAY;
 }
 #endif
--- a/wpa_supplicant/Makefile
+++ b/wpa_supplicant/Makefile
@ -627,6 +627,9 @@ endif
 ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
 CFLAGS += -DLTM_FAST_SQR
 endif
 ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV
 CFLAGS += -DLTM_FAST_DIV
 endif
 else
 LIBS += -ltommath
 LIBS_p += -ltommath
--- a/wpa_supplicant/defconfig
+++ b/wpa_supplicant/defconfig
@ -314,6 +314,10 @@ CONFIG_PEERKEY=y
 # LibTomMath can be configured to include faster sqr routine to speed up DH and
 # RSA.
 #CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y
 # At the cost of about 1 kB of additional binary size, the internal
 # LibTomMath can be configured to include faster div routine to speed up DH and
 # RSA.
 #CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y
 # Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
 # This is only for Windows builds and requires WMI-related header files and