diff --git a/src/tls/libtommath.c b/src/tls/libtommath.c
index f5442ff86..b75cb9bb4 100644
--- a/src/tls/libtommath.c
+++ b/src/tls/libtommath.c
@@ -30,6 +30,15 @@
 #define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
 				 * would require other than mp_reduce */
 
+#ifdef LTM_FAST_EXPTMOD
+/* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
+#define BN_MP_EXPTMOD_FAST_C
+#define BN_MP_MONTGOMERY_SETUP_C
+#define BN_FAST_MP_MONTGOMERY_REDUCE_C
+#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+#define BN_MP_MUL_2_C
+#endif /* LTM_FAST_EXPTMOD */
+
 /* Current uses do not require support for negative exponent in exptmod, so we
  * can save about 1.5 kB in leaving out invmod. */
 #define LTM_NO_NEG_EXP
@@ -144,6 +153,9 @@ static int mp_2expt(mp_int * a, int b);
 static int mp_reduce_setup(mp_int * a, mp_int * b);
 static int mp_reduce(mp_int * x, mp_int * m, mp_int * mu);
 static int mp_init_size(mp_int * a, int size);
+#ifdef BN_MP_EXPTMOD_FAST_C
+static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
+#endif /* BN_MP_EXPTMOD_FAST_C */
 
 
 
@@ -2383,3 +2395,599 @@ static int s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   mp_clear (&t);
   return MP_OKAY;
 }
+
+
+#ifdef BN_MP_MONTGOMERY_SETUP_C
+/* setups the montgomery reduction stuff */
+static int
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
+{
+  mp_digit x, b;
+
+/* fast inversion mod 2**k
+ *
+ * Based on the fact that
+ *
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+ *                    =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*(1) - (1)     = 1
+ */
+  b = n->dp[0];
+
+  if ((b & 1) == 0) {
+    return MP_VAL;
+  }
+
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+
+  /* rho = -1/m mod b */
+  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+
+  return MP_OKAY;
+}
+#endif
+
+
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+/* computes xR**-1 == x (mod N) via Montgomery Reduction
+ *
+ * This is an optimized implementation of montgomery_reduce
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, olduse;
+  mp_word W[MP_WARRAY];
+
+  /* get old used count */
+  olduse = x->used;
+
+  /* grow a as required */
+  if (x->alloc < n->used + 1) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* first we have to get the digits of the input into
+   * an array of double precision words W[...]
+   */
+  {
+    register mp_word *_W;
+    register mp_digit *tmpx;
+
+    /* alias for the W[] array */
+    _W   = W;
+
+    /* alias for the digits of  x*/
+    tmpx = x->dp;
+
+    /* copy the digits of a into W[0..a->used-1] */
+    for (ix = 0; ix < x->used; ix++) {
+      *_W++ = *tmpx++;
+    }
+
+    /* zero the high words of W[a->used..m->used*2] */
+    for (; ix < n->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
+  }
+
+  /* now we proceed to zero successive digits
+   * from the least significant upwards
+   */
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
+     */
+    register mp_digit mu;
+    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+
+    /* a = a + mu * m * b**i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b**i is handled by offseting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the
+     * inner loop In this case we fix the carry from the previous
+     * column since the Montgomery reduction requires digits of the
+     * result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The
+     * carry fixups are done in order so after these loops the
+     * first m->used words of W[] have the carries fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpn;
+      register mp_word *_W;
+
+      /* alias for the digits of the modulus */
+      tmpn = n->dp;
+
+      /* Alias for the columns set by an offset of ix */
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < n->used; iy++) {
+          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+  /* now we have to propagate the carries and
+   * shift the words downward [all those least
+   * significant digits we zeroed].
+   */
+  {
+    register mp_digit *tmpx;
+    register mp_word *_W, *_W1;
+
+    /* nox fix rest of carries */
+
+    /* alias for current word */
+    _W1 = W + ix;
+
+    /* alias for next word, where the carry goes */
+    _W = W + ++ix;
+
+    for (; ix <= n->used * 2 + 1; ix++) {
+      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+    }
+
+    /* copy out, A = A/b**n
+     *
+     * The result is A/b**n but instead of converting from an
+     * array of mp_word to mp_digit than calling mp_rshd
+     * we just copy them in the right order
+     */
+
+    /* alias for destination word */
+    tmpx = x->dp;
+
+    /* alias for shifted double precision result */
+    _W = W + n->used;
+
+    for (ix = 0; ix < n->used + 1; ix++) {
+      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits
+     */
+    for (; ix < olduse; ix++) {
+      *tmpx++ = 0;
+    }
+  }
+
+  /* set the max used and clamp */
+  x->used = n->used + 1;
+  mp_clamp (x);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+  return MP_OKAY;
+}
+#endif
+
+
+#ifdef BN_MP_MUL_2_C
+/* b = a*2 */
+static int mp_mul_2(mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* grow to accomodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* alias for source */
+    tmpa = a->dp;
+    
+    /* alias for dest */
+    tmpb = b->dp;
+
+    /* carry */
+    r = 0;
+    for (x = 0; x < a->used; x++) {
+    
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+      
+      /* now shift up this digit, add in the carry [from the previous] */
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      /* add a MSB which is always 1 at this point */
+      *tmpb = 1;
+      ++(b->used);
+    }
+
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+#endif
+
+
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+/*
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+static int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  if (b->used > 1) {
+     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+        return res;
+     }
+  } else {
+     mp_set(a, 1);
+     bits = 1;
+  }
+
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+        return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+
+#ifdef BN_MP_EXPTMOD_FAST_C
+/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+ */
+
+static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[TAB_SIZE], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* use a pointer to the reduction algorithm.  This allows us to use
+   * one of many reduction algorithms without modding the guts of
+   * the code with if statements everywhere.
+   */
+  int     (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err;
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* determine and setup reduction code */
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_SETUP_C     
+     /* now setup montgomery  */
+     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else 
+#endif
+     {
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+        /* use slower baseline Montgomery method */
+        redux = mp_montgomery_reduce;
+#else
+        err = MP_VAL;
+        goto LBL_M;
+#endif
+     }
+  } else if (redmode == 1) {
+#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
+     /* setup DR reduction for moduli of the form B**k - b */
+     mp_dr_setup(P, &mp);
+     redux = mp_dr_reduce;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  } else {
+#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
+     /* setup DR reduction for moduli of the form 2**k - b */
+     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+     redux = mp_reduce_2k;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto LBL_M;
+  }
+
+  /* create M table
+   *
+
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+     /* now we need R mod m */
+     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+#else 
+     err = MP_VAL;
+     goto LBL_RES;
+#endif
+
+     /* now set M[1] to G * R mod m */
+     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  } else {
+     mp_set(&res, 1);
+     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+        goto LBL_RES;
+     }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto LBL_RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits so break */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int)DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode    = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode   = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* get next bit of the window */
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+     /* fixup result if Montgomery reduction is used
+      * recall that any value in a Montgomery system is
+      * actually multiplied by R mod n.  So we have
+      * to reduce one more time to cancel out the factor
+      * of R.
+      */
+     if ((err = redux(&res, P, mp)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  }
+
+  /* swap res with Y */
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+LBL_RES:mp_clear (&res);
+LBL_M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+#endif
diff --git a/wpa_supplicant/Makefile b/wpa_supplicant/Makefile
index f185b590e..d1c6a0883 100644
--- a/wpa_supplicant/Makefile
+++ b/wpa_supplicant/Makefile
@@ -621,6 +621,9 @@ CFLAGS += -DCONFIG_TLS_INTERNAL_CLIENT
 ifeq ($(CONFIG_CRYPTO), internal)
 ifdef CONFIG_INTERNAL_LIBTOMMATH
 CFLAGS += -DCONFIG_INTERNAL_LIBTOMMATH
+ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD
+CFLAGS += -DLTM_FAST_EXPTMOD
+endif
 else
 LIBS += -ltommath
 LIBS_p += -ltommath
diff --git a/wpa_supplicant/defconfig b/wpa_supplicant/defconfig
index 8b89ccb99..80e5261bb 100644
--- a/wpa_supplicant/defconfig
+++ b/wpa_supplicant/defconfig
@@ -307,6 +307,9 @@ CONFIG_PEERKEY=y
 #LIBS += -L$(LTM_PATH)
 #LIBS_p += -L$(LTM_PATH)
 #endif
+# Add a cost of about 2.5 kB of additional cost, the internal LibTomMath can be
+# configured to include fast exptmod routine to speed up DH and RSA.
+#CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y
 
 # Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
 # This is only for Windows builds and requires WMI-related header files and