Added an option to build internal LibTomMath with faster div routine
At the cost of about 1 kB of additional binary size, the internal LibTomMath can be configured to include faster div routine to speed up DH and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y in .config.
This commit is contained in:
parent
b6ab429402
commit
ec0205a87a
3 changed files with 294 additions and 0 deletions
|
@ -26,6 +26,15 @@
|
|||
#define BN_S_MP_SQR_C
|
||||
#define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
|
||||
* would require other than mp_reduce */
|
||||
#ifdef LTM_FAST_DIV
|
||||
/* Use faster div at the cost of about 1 kB */
|
||||
#define BN_MP_MUL_D_C
|
||||
#else /* LTM_FAST_DIV */
|
||||
#define BN_MP_DIV_SMALL
|
||||
#define BN_MP_INIT_MULTI_C
|
||||
#define BN_MP_CLEAR_MULTI_C
|
||||
#define BN_MP_ABS_C
|
||||
#endif /* LTM_FAST_DIV */
|
||||
|
||||
#ifdef LTM_FAST_EXPTMOD
|
||||
/* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
|
||||
|
@ -124,8 +133,12 @@ static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);
|
|||
|
||||
static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
|
||||
|
||||
#ifdef BN_MP_INIT_MULTI_C
|
||||
static int mp_init_multi(mp_int *mp, ...);
|
||||
#endif
|
||||
#ifdef BN_MP_CLEAR_MULTI_C
|
||||
static void mp_clear_multi(mp_int *mp, ...);
|
||||
#endif
|
||||
static int mp_lshd(mp_int * a, int b);
|
||||
static void mp_set(mp_int * a, mp_digit b);
|
||||
static void mp_clamp(mp_int * a);
|
||||
|
@ -147,7 +160,9 @@ static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
|
|||
static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
|
||||
static int mp_grow(mp_int * a, int size);
|
||||
static int mp_cmp_mag(mp_int * a, mp_int * b);
|
||||
#ifdef BN_MP_ABS_C
|
||||
static int mp_abs(mp_int * a, mp_int * b);
|
||||
#endif
|
||||
static int mp_sqr(mp_int * a, mp_int * b);
|
||||
static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
|
||||
static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
|
||||
|
@ -161,6 +176,9 @@ static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int
|
|||
#ifdef BN_FAST_S_MP_SQR_C
|
||||
static int fast_s_mp_sqr (mp_int * a, mp_int * b);
|
||||
#endif /* BN_FAST_S_MP_SQR_C */
|
||||
#ifdef BN_MP_MUL_D_C
|
||||
static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
|
||||
#endif /* BN_MP_MUL_D_C */
|
||||
|
||||
|
||||
|
||||
|
@ -1263,6 +1281,7 @@ static int mp_grow (mp_int * a, int size)
|
|||
}
|
||||
|
||||
|
||||
#ifdef BN_MP_ABS_C
|
||||
/* b = |a|
|
||||
*
|
||||
* Simple function copies the input and fixes the sign to positive
|
||||
|
@ -1283,6 +1302,7 @@ static int mp_abs (mp_int * a, mp_int * b)
|
|||
|
||||
return MP_OKAY;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* set to a digit */
|
||||
|
@ -1409,6 +1429,7 @@ static int mp_mul_2d (mp_int * a, int b, mp_int * c)
|
|||
}
|
||||
|
||||
|
||||
#ifdef BN_MP_INIT_MULTI_C
|
||||
static int mp_init_multi(mp_int *mp, ...)
|
||||
{
|
||||
mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
|
||||
|
@ -1444,8 +1465,10 @@ static int mp_init_multi(mp_int *mp, ...)
|
|||
va_end(args);
|
||||
return res; /* Assumed ok, if error flagged above. */
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef BN_MP_CLEAR_MULTI_C
|
||||
static void mp_clear_multi(mp_int *mp, ...)
|
||||
{
|
||||
mp_int* next_mp = mp;
|
||||
|
@ -1457,6 +1480,7 @@ static void mp_clear_multi(mp_int *mp, ...)
|
|||
}
|
||||
va_end(args);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* shift left a certain amount of digits */
|
||||
|
@ -1564,6 +1588,8 @@ static int mp_mod_2d (mp_int * a, int b, mp_int * c)
|
|||
}
|
||||
|
||||
|
||||
#ifdef BN_MP_DIV_SMALL
|
||||
|
||||
/* slower bit-bang division... also smaller */
|
||||
static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
|
||||
{
|
||||
|
@ -1632,6 +1658,206 @@ LBL_ERR:
|
|||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* integer signed division.
|
||||
* c*b + d == a [e.g. a/b, c=quotient, d=remainder]
|
||||
* HAC pp.598 Algorithm 14.20
|
||||
*
|
||||
* Note that the description in HAC is horribly
|
||||
* incomplete. For example, it doesn't consider
|
||||
* the case where digits are removed from 'x' in
|
||||
* the inner loop. It also doesn't consider the
|
||||
* case that y has fewer than three digits, etc..
|
||||
*
|
||||
* The overall algorithm is as described as
|
||||
* 14.20 from HAC but fixed to treat these cases.
|
||||
*/
|
||||
static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
|
||||
{
|
||||
mp_int q, x, y, t1, t2;
|
||||
int res, n, t, i, norm, neg;
|
||||
|
||||
/* is divisor zero ? */
|
||||
if (mp_iszero (b) == 1) {
|
||||
return MP_VAL;
|
||||
}
|
||||
|
||||
/* if a < b then q=0, r = a */
|
||||
if (mp_cmp_mag (a, b) == MP_LT) {
|
||||
if (d != NULL) {
|
||||
res = mp_copy (a, d);
|
||||
} else {
|
||||
res = MP_OKAY;
|
||||
}
|
||||
if (c != NULL) {
|
||||
mp_zero (c);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
q.used = a->used + 2;
|
||||
|
||||
if ((res = mp_init (&t1)) != MP_OKAY) {
|
||||
goto LBL_Q;
|
||||
}
|
||||
|
||||
if ((res = mp_init (&t2)) != MP_OKAY) {
|
||||
goto LBL_T1;
|
||||
}
|
||||
|
||||
if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
|
||||
goto LBL_T2;
|
||||
}
|
||||
|
||||
if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
|
||||
goto LBL_X;
|
||||
}
|
||||
|
||||
/* fix the sign */
|
||||
neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
|
||||
x.sign = y.sign = MP_ZPOS;
|
||||
|
||||
/* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
|
||||
norm = mp_count_bits(&y) % DIGIT_BIT;
|
||||
if (norm < (int)(DIGIT_BIT-1)) {
|
||||
norm = (DIGIT_BIT-1) - norm;
|
||||
if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
} else {
|
||||
norm = 0;
|
||||
}
|
||||
|
||||
/* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
|
||||
n = x.used - 1;
|
||||
t = y.used - 1;
|
||||
|
||||
/* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
|
||||
if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
while (mp_cmp (&x, &y) != MP_LT) {
|
||||
++(q.dp[n - t]);
|
||||
if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
}
|
||||
|
||||
/* reset y by shifting it back down */
|
||||
mp_rshd (&y, n - t);
|
||||
|
||||
/* step 3. for i from n down to (t + 1) */
|
||||
for (i = n; i >= (t + 1); i--) {
|
||||
if (i > x.used) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* step 3.1 if xi == yt then set q{i-t-1} to b-1,
|
||||
* otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
|
||||
if (x.dp[i] == y.dp[t]) {
|
||||
q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
|
||||
} else {
|
||||
mp_word tmp;
|
||||
tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
|
||||
tmp |= ((mp_word) x.dp[i - 1]);
|
||||
tmp /= ((mp_word) y.dp[t]);
|
||||
if (tmp > (mp_word) MP_MASK)
|
||||
tmp = MP_MASK;
|
||||
q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
|
||||
}
|
||||
|
||||
/* while (q{i-t-1} * (yt * b + y{t-1})) >
|
||||
xi * b**2 + xi-1 * b + xi-2
|
||||
|
||||
do q{i-t-1} -= 1;
|
||||
*/
|
||||
q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
|
||||
do {
|
||||
q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
|
||||
|
||||
/* find left hand */
|
||||
mp_zero (&t1);
|
||||
t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
|
||||
t1.dp[1] = y.dp[t];
|
||||
t1.used = 2;
|
||||
if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
/* find right hand */
|
||||
t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
|
||||
t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
|
||||
t2.dp[2] = x.dp[i];
|
||||
t2.used = 3;
|
||||
} while (mp_cmp_mag(&t1, &t2) == MP_GT);
|
||||
|
||||
/* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
|
||||
if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
/* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
|
||||
if (x.sign == MP_NEG) {
|
||||
if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
|
||||
goto LBL_Y;
|
||||
}
|
||||
|
||||
q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
/* now q is the quotient and x is the remainder
|
||||
* [which we have to normalize]
|
||||
*/
|
||||
|
||||
/* get sign before writing to c */
|
||||
x.sign = x.used == 0 ? MP_ZPOS : a->sign;
|
||||
|
||||
if (c != NULL) {
|
||||
mp_clamp (&q);
|
||||
mp_exch (&q, c);
|
||||
c->sign = neg;
|
||||
}
|
||||
|
||||
if (d != NULL) {
|
||||
mp_div_2d (&x, norm, &x, NULL);
|
||||
mp_exch (&x, d);
|
||||
}
|
||||
|
||||
res = MP_OKAY;
|
||||
|
||||
LBL_Y:mp_clear (&y);
|
||||
LBL_X:mp_clear (&x);
|
||||
LBL_T2:mp_clear (&t2);
|
||||
LBL_T1:mp_clear (&t1);
|
||||
LBL_Q:mp_clear (&q);
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef MP_LOW_MEM
|
||||
#define TAB_SIZE 32
|
||||
|
@ -3092,3 +3318,64 @@ static int fast_s_mp_sqr (mp_int * a, mp_int * b)
|
|||
return MP_OKAY;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef BN_MP_MUL_D_C
|
||||
/* multiply by a digit */
|
||||
static int
|
||||
mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
|
||||
{
|
||||
mp_digit u, *tmpa, *tmpc;
|
||||
mp_word r;
|
||||
int ix, res, olduse;
|
||||
|
||||
/* make sure c is big enough to hold a*b */
|
||||
if (c->alloc < a->used + 1) {
|
||||
if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
/* get the original destinations used count */
|
||||
olduse = c->used;
|
||||
|
||||
/* set the sign */
|
||||
c->sign = a->sign;
|
||||
|
||||
/* alias for a->dp [source] */
|
||||
tmpa = a->dp;
|
||||
|
||||
/* alias for c->dp [dest] */
|
||||
tmpc = c->dp;
|
||||
|
||||
/* zero carry */
|
||||
u = 0;
|
||||
|
||||
/* compute columns */
|
||||
for (ix = 0; ix < a->used; ix++) {
|
||||
/* compute product and carry sum for this term */
|
||||
r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
|
||||
|
||||
/* mask off higher bits to get a single digit */
|
||||
*tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
|
||||
|
||||
/* send carry into next iteration */
|
||||
u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
|
||||
}
|
||||
|
||||
/* store final carry [if any] and increment ix offset */
|
||||
*tmpc++ = u;
|
||||
++ix;
|
||||
|
||||
/* now zero digits above the top */
|
||||
while (ix++ < olduse) {
|
||||
*tmpc++ = 0;
|
||||
}
|
||||
|
||||
/* set used count */
|
||||
c->used = a->used + 1;
|
||||
mp_clamp(c);
|
||||
|
||||
return MP_OKAY;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -627,6 +627,9 @@ endif
|
|||
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
|
||||
CFLAGS += -DLTM_FAST_SQR
|
||||
endif
|
||||
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV
|
||||
CFLAGS += -DLTM_FAST_DIV
|
||||
endif
|
||||
else
|
||||
LIBS += -ltommath
|
||||
LIBS_p += -ltommath
|
||||
|
|
|
@ -314,6 +314,10 @@ CONFIG_PEERKEY=y
|
|||
# LibTomMath can be configured to include faster sqr routine to speed up DH and
|
||||
# RSA.
|
||||
#CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y
|
||||
# At the cost of about 1 kB of additional binary size, the internal
|
||||
# LibTomMath can be configured to include faster div routine to speed up DH and
|
||||
# RSA.
|
||||
#CONFIG_INTERNAL_LIBTOMMATH_FAST_DIV=y
|
||||
|
||||
# Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
|
||||
# This is only for Windows builds and requires WMI-related header files and
|
||||
|
|
Loading…
Reference in a new issue