Added an option to build internal LibTomMath with faster div routine

At the cost of about 1 kB of additional binary size, the internal LibTomMath can be configured to include faster div routine to speed up DH and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y in .config.
2008-06-06 10:11:17 +03:00 · 2008-06-06 10:11:17 +03:00 · ec0205a87a
commit ec0205a87a
parent b6ab429402
3 changed files with 294 additions and 0 deletions
--- a/src/tls/libtommath.c
+++ b/src/tls/libtommath.c
@ -26,6 +26,15 @@
 #define BN_S_MP_SQR_C
 #define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
 				 * would require other than mp_reduce */
+#ifdef LTM_FAST_DIV
+/* Use faster div at the cost of about 1 kB */
+#define BN_MP_MUL_D_C
+#else /* LTM_FAST_DIV */
+#define BN_MP_DIV_SMALL
+#define BN_MP_INIT_MULTI_C
+#define BN_MP_CLEAR_MULTI_C
+#define BN_MP_ABS_C
+#endif /* LTM_FAST_DIV */

 #ifdef LTM_FAST_EXPTMOD
 /* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
@ -124,8 +133,12 @@ static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);

 static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);

+#ifdef BN_MP_INIT_MULTI_C
 static int mp_init_multi(mp_int *mp, ...);
+#endif
+#ifdef BN_MP_CLEAR_MULTI_C
 static void mp_clear_multi(mp_int *mp, ...);
+#endif
 static int mp_lshd(mp_int * a, int b);
 static void mp_set(mp_int * a, mp_digit b);
 static void mp_clamp(mp_int * a);
@ -147,7 +160,9 @@ static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
 static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
 static int mp_grow(mp_int * a, int size);
 static int mp_cmp_mag(mp_int * a, mp_int * b);
+#ifdef BN_MP_ABS_C
 static int mp_abs(mp_int * a, mp_int * b);
+#endif
 static int mp_sqr(mp_int * a, mp_int * b);
 static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
 static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
@ -161,6 +176,9 @@ static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int
 #ifdef BN_FAST_S_MP_SQR_C
 static int fast_s_mp_sqr (mp_int * a, mp_int * b);
 #endif /* BN_FAST_S_MP_SQR_C */
+#ifdef BN_MP_MUL_D_C
+static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
+#endif /* BN_MP_MUL_D_C */



@ -1263,6 +1281,7 @@ static int mp_grow (mp_int * a, int size)
 }


+#ifdef BN_MP_ABS_C
 /* b = |a| 
 *
 * Simple function copies the input and fixes the sign to positive
@ -1283,6 +1302,7 @@ static int mp_abs (mp_int * a, mp_int * b)

  return MP_OKAY;
 }
+#endif


 /* set to a digit */
@ -1409,6 +1429,7 @@ static int mp_mul_2d (mp_int * a, int b, mp_int * c)
 }


+#ifdef BN_MP_INIT_MULTI_C
 static int mp_init_multi(mp_int *mp, ...) 
 {
    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
@ -1444,8 +1465,10 @@ static int mp_init_multi(mp_int *mp, ...)
    va_end(args);
    return res;                /* Assumed ok, if error flagged above. */
 }
+#endif


+#ifdef BN_MP_CLEAR_MULTI_C
 static void mp_clear_multi(mp_int *mp, ...) 
 {
    mp_int* next_mp = mp;
@ -1457,6 +1480,7 @@ static void mp_clear_multi(mp_int *mp, ...)
    }
    va_end(args);
 }
+#endif


 /* shift left a certain amount of digits */
@ -1564,6 +1588,8 @@ static int mp_mod_2d (mp_int * a, int b, mp_int * c)
 }


+#ifdef BN_MP_DIV_SMALL
+
 /* slower bit-bang division... also smaller */
 static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
@ -1632,6 +1658,206 @@ LBL_ERR:
   return res;
 }

+#else
+
+/* integer signed division. 
+ * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly 
+ * incomplete.  For example, it doesn't consider 
+ * the case where digits are removed from 'x' in 
+ * the inner loop.  It also doesn't consider the 
+ * case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 
+ * 14.20 from HAC but fixed to treat these cases.
+*/
+static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto LBL_Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto LBL_X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+  } else {
+     norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
+    goto LBL_Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.used) {
+      continue;
+    }
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
+     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
+             xi * b**2 + xi-1 * b + xi-2 
+     
+       do q{i-t-1} -= 1; 
+    */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder 
+   * [which we have to normalize] 
+   */
+  
+  /* get sign before writing to c */
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_div_2d (&x, norm, &x, NULL);
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
+  return res;
+}
+
+#endif
+

 #ifdef MP_LOW_MEM
   #define TAB_SIZE 32
@ -3092,3 +3318,64 @@ static int fast_s_mp_sqr (mp_int * a, mp_int * b)
  return MP_OKAY;
 }
 #endif
+
+
+#ifdef BN_MP_MUL_D_C
+/* multiply by a digit */
+static int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit u, *tmpa, *tmpc;
+  mp_word  r;
+  int      ix, res, olduse;
+
+  /* make sure c is big enough to hold a*b */
+  if (c->alloc < a->used + 1) {
+    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get the original destinations used count */
+  olduse = c->used;
+
+  /* set the sign */
+  c->sign = a->sign;
+
+  /* alias for a->dp [source] */
+  tmpa = a->dp;
+
+  /* alias for c->dp [dest] */
+  tmpc = c->dp;
+
+  /* zero carry */
+  u = 0;
+
+  /* compute columns */
+  for (ix = 0; ix < a->used; ix++) {
+    /* compute product and carry sum for this term */
+    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+
+    /* mask off higher bits to get a single digit */
+    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* send carry into next iteration */
+    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  }
+
+  /* store final carry [if any] and increment ix offset  */
+  *tmpc++ = u;
+  ++ix;
+
+  /* now zero digits above the top */
+  while (ix++ < olduse) {
+     *tmpc++ = 0;
+  }
+
+  /* set used count */
+  c->used = a->used + 1;
+  mp_clamp(c);
+
+  return MP_OKAY;
+}
+#endif
--- a/wpa_supplicant/Makefile
+++ b/wpa_supplicant/Makefile
@ -627,6 +627,9 @@ endif
 ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
 CFLAGS += -DLTM_FAST_SQR
 endif
+ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV
+CFLAGS += -DLTM_FAST_DIV
+endif
 else
 LIBS += -ltommath
 LIBS_p += -ltommath
--- a/wpa_supplicant/defconfig
+++ b/wpa_supplicant/defconfig
@ -314,6 +314,10 @@ CONFIG_PEERKEY=y
 # LibTomMath can be configured to include faster sqr routine to speed up DH and
 # RSA.
 #CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y
+# At the cost of about 1 kB of additional binary size, the internal
+# LibTomMath can be configured to include faster div routine to speed up DH and
+# RSA.
+#CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y

 # Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
 # This is only for Windows builds and requires WMI-related header files and