1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Original Code is the Netscape security libraries.
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2000
19 * the Initial Developer. All Rights Reserved.
22 * Sheueling Chang Shantz <sheueling.chang@sun.com>,
23 * Stephen Fung <stephen.fung@sun.com>, and
24 * Douglas Stebila <douglas@stebila.ca> of Sun Laboratories.
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
39 /* $Id: mpmontg.c,v 1.20 2006/08/29 02:41:38 nelson%bolyard.com Exp $ */
41 /* This file implements modular exponentiation using Montgomery's
42 * method for modular reduction. This file implements the method
43 * described as "Improvement 1" in the paper "A Cryptographic Library for
44 * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.
45 * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"
46 * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,
47 * published by Springer Verlag.
50 #define MP_USING_CACHE_SAFE_MOD_EXP 1
55 #ifdef MP_USING_MONT_MULF
58 #include <stddef.h> /* ptrdiff_t */
60 /* if MP_CHAR_STORE_SLOW is defined, we */
61 /* need to know endianness of this platform. */
62 #ifdef MP_CHAR_STORE_SLOW
63 #if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)
64 #error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \
65 " if you define MP_CHAR_STORE_SLOW."
71 #define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */
73 #if defined(_WIN32_WCE)
74 #define ABORT res = MP_UNDEF; goto CLEANUP
79 /* computes T = REDC(T), 2^b == R */
80 mp_err
s_mp_redc(mp_int
*T
, mp_mont_modulus
*mmm
)
85 i
= MP_USED(T
) + MP_USED(&mmm
->N
) + 2;
86 MP_CHECKOK( s_mp_pad(T
, i
) );
87 for (i
= 0; i
< MP_USED(&mmm
->N
); ++i
) {
88 mp_digit m_i
= MP_DIGIT(T
, i
) * mmm
->n0prime
;
89 /* T += N * m_i * (MP_RADIX ** i); */
90 MP_CHECKOK( s_mp_mul_d_add_offset(&mmm
->N
, m_i
, T
, i
) );
95 s_mp_div_2d(T
, mmm
->b
);
97 if ((res
= s_mp_cmp(T
, &mmm
->N
)) >= 0) {
99 MP_CHECKOK( s_mp_sub(T
, &mmm
->N
) );
101 if ((res
= mp_cmp(T
, &mmm
->N
)) >= 0) {
112 #if !defined(MP_ASSEMBLY_MUL_MONT) && !defined(MP_MONT_USE_MP_MUL)
113 mp_err
s_mp_mul_mont(const mp_int
*a
, const mp_int
*b
, mp_int
*c
,
114 mp_mont_modulus
*mmm
)
120 mp_size useda
, usedb
;
122 ARGCHK(a
!= NULL
&& b
!= NULL
&& c
!= NULL
, MP_BADARG
);
124 if (MP_USED(a
) < MP_USED(b
)) {
125 const mp_int
*xch
= b
; /* switch a and b, to do fewer outer loops */
130 MP_USED(c
) = 1; MP_DIGIT(c
, 0) = 0;
131 ib
= MP_USED(a
) + MP_MAX(MP_USED(b
), MP_USED(&mmm
->N
)) + 2;
132 if((res
= s_mp_pad(c
, ib
)) != MP_OKAY
)
137 s_mpv_mul_d(MP_DIGITS(a
), useda
, *pb
++, MP_DIGITS(c
));
138 s_mp_setz(MP_DIGITS(c
) + useda
+ 1, ib
- (useda
+ 1));
139 m_i
= MP_DIGIT(c
, 0) * mmm
->n0prime
;
140 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, 0);
142 /* Outer loop: Digits of b */
144 for (ib
= 1; ib
< usedb
; ib
++) {
145 mp_digit b_i
= *pb
++;
147 /* Inner product: Digits of a */
149 s_mpv_mul_d_add_prop(MP_DIGITS(a
), useda
, b_i
, MP_DIGITS(c
) + ib
);
150 m_i
= MP_DIGIT(c
, ib
) * mmm
->n0prime
;
151 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, ib
);
153 if (usedb
< MP_USED(&mmm
->N
)) {
154 for (usedb
= MP_USED(&mmm
->N
); ib
< usedb
; ++ib
) {
155 m_i
= MP_DIGIT(c
, ib
) * mmm
->n0prime
;
156 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, ib
);
160 s_mp_div_2d(c
, mmm
->b
);
161 if (s_mp_cmp(c
, &mmm
->N
) >= 0) {
162 MP_CHECKOK( s_mp_sub(c
, &mmm
->N
) );
172 mp_err
s_mp_to_mont(const mp_int
*x
, mp_mont_modulus
*mmm
, mp_int
*xMont
)
176 /* xMont = x * R mod N where N is modulus */
177 MP_CHECKOK( mpl_lsh(x
, xMont
, mmm
->b
) ); /* xMont = x << b */
178 MP_CHECKOK( mp_div(xMont
, &mmm
->N
, 0, xMont
) ); /* mod N */
#ifdef MP_USING_MONT_MULF

/* the floating point multiply is already cache safe,
 * don't turn on cache safe unless we specifically
 * force it */
#ifndef MP_FORCE_CACHE_SAFE
#undef MP_USING_CACHE_SAFE_MOD_EXP
#endif

unsigned int mp_using_mont_mulf = 1;

/* computes montgomery square of the integer in mResult */
#define SQR \
  conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \
  mont_mulf_noconv(mResult, dm1, d16Tmp, \
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

/* computes montgomery product of x and the integer in mResult */
#define MUL(x) \
  conv_i32_to_d32(dm1, mResult, nLen); \
  mont_mulf_noconv(mResult, dm1, oddPowers[x], \
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

/* Do modular exponentiation using floating point multiply code.
 *
 * result = montBase ** exponent mod modulus (Montgomery domain in,
 * plain domain out via the final REDC).  Uses a fixed-width window of
 * window_bits with a precomputed table of the odd_ints odd powers of
 * montBase, all held in double-precision arrays for mont_mulf_noconv. */
mp_err mp_exptmod_f(const mp_int *   montBase,
		    const mp_int *   exponent,
		    const mp_int *   modulus,
		    mp_int *         result,
		    mp_mont_modulus *mmm,
		    int              nLen,
		    mp_size          bits_in_exponent,
		    mp_size          window_bits,
		    mp_size          odd_ints)
{
  mp_digit *mResult;
  double   *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp;
  double    dn0;
  mp_size   i;
  mp_err    res;
  int       expOff;
  int       dSize = 0, oddPowSize, dTmpSize;
  mp_int    accum1;
  double   *oddPowers[MAX_ODD_INTS];

  /* function for computing n0prime only works if n0 is odd */

  MP_DIGITS(&accum1) = 0;

  for (i = 0; i < MAX_ODD_INTS; ++i)
    oddPowers[i] = 0;

  MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );

  /* accum1 = montgomery residue of 1 */
  mp_set(&accum1, 1);
  MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
  MP_CHECKOK( s_mp_pad(&accum1, nLen) );

  /* carve one big allocation into the working double arrays */
  oddPowSize = 2 * nLen + 1;
  dTmpSize   = 2 * oddPowSize;
  dSize = sizeof(double) * (nLen * 4 + 1 +
			    ((odd_ints + 1) * oddPowSize) + dTmpSize);
  dBuf = (double *)malloc(dSize);
  if (!dBuf) {              /* fix: malloc result was previously unchecked */
    res = MP_MEM;
    goto CLEANUP;
  }
  dm1    = dBuf;            /* array of d32 */
  dn     = dBuf + nLen;     /* array of d32 */
  dSqr   = dn   + nLen;     /* array of d32 */
  d16Tmp = dSqr + nLen;     /* array of d16 */
  dTmp   = d16Tmp + oddPowSize;

  for (i = 0; i < odd_ints; ++i) {
    oddPowers[i] = dTmp;
    dTmp += oddPowSize;
  }
  mResult = (mp_digit *)(dTmp + dTmpSize);   /* size is nLen + 1 */

  /* Make dn and dn0 */
  conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);
  dn0 = (double)(mmm->n0prime & 0xffff);

  /* Make dSqr = montBase**2, and oddPowers[0] = montBase */
  conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen);
  mont_mulf_noconv(mResult, dm1, oddPowers[0],
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
  conv_i32_to_d32(dSqr, mResult, nLen);

  /* oddPowers[i] = montBase ** (2*i + 1), built by repeated mult by dSqr */
  for (i = 1; i < odd_ints; ++i) {
    mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1],
		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
    conv_i32_to_d16(oddPowers[i], mResult, nLen);
  }

  s_mp_copy(MP_DIGITS(&accum1), mResult, nLen);   /* from, to, len */

  /* scan the exponent from the most significant window down; each window
   * costs window_bits squarings plus at most one table multiply. */
  for (expOff = bits_in_exponent - window_bits; expOff >= 0;
					       expOff -= window_bits) {
    mp_size smallExp;
    MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
    smallExp = (mp_size)res;

    /* the multiply is slid to the position of the lowest set bit of the
     * window so that only odd powers need to be stored. */
    if (window_bits == 1) {
      if (!smallExp) {
	SQR;
      } else if (smallExp & 1) {
	SQR; MUL(0);
      } else {
	ABORT;
      }
    } else if (window_bits == 4) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else if (window_bits == 5) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else if (smallExp & 0x10) {
	SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else if (window_bits == 6) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else if (smallExp & 0x10) {
	SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
      } else if (smallExp & 0x20) {
	SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else {
      ABORT;
    }
  }

  s_mp_copy(mResult, MP_DIGITS(&accum1), nLen);   /* from, to, len */

  res = s_mp_redc(&accum1, mmm);
  mp_exch(&accum1, result);

CLEANUP:
  mp_clear(&accum1);
  if (dBuf) {
    if (dSize)
      memset(dBuf, 0, dSize);   /* scrub intermediate values before free */
    free(dBuf);
  }

  return res;
}
#endif
/* Montgomery square/multiply helpers for the integer (non-mulf) path.
 * Both leave their result already reduced via REDC; they assume the
 * enclosing function declares mmm, oddPowers, res, and a CLEANUP label. */
#undef SQR
#undef MUL
#define SQR(a,b) \
  MP_CHECKOK( mp_sqr(a, b) );\
  MP_CHECKOK( s_mp_redc(b, mmm) )

#if defined(MP_MONT_USE_MP_MUL)
#define MUL(x,a,b) \
  MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \
  MP_CHECKOK( s_mp_redc(b, mmm) )
#else
#define MUL(x,a,b) \
  MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) )
#endif

#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
375 /* Do modular exponentiation using integer multiply code. */
376 mp_err
mp_exptmod_i(const mp_int
* montBase
,
377 const mp_int
* exponent
,
378 const mp_int
* modulus
,
380 mp_mont_modulus
*mmm
,
382 mp_size bits_in_exponent
,
386 mp_int
*pa1
, *pa2
, *ptmp
;
390 mp_int accum1
, accum2
, power2
, oddPowers
[MAX_ODD_INTS
];
392 /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
393 /* oddPowers[i] = base ** (2*i + 1); */
395 MP_DIGITS(&accum1
) = 0;
396 MP_DIGITS(&accum2
) = 0;
397 MP_DIGITS(&power2
) = 0;
398 for (i
= 0; i
< MAX_ODD_INTS
; ++i
) {
399 MP_DIGITS(oddPowers
+ i
) = 0;
402 MP_CHECKOK( mp_init_size(&accum1
, 3 * nLen
+ 2) );
403 MP_CHECKOK( mp_init_size(&accum2
, 3 * nLen
+ 2) );
405 MP_CHECKOK( mp_init_copy(&oddPowers
[0], montBase
) );
407 mp_init_size(&power2
, nLen
+ 2 * MP_USED(montBase
) + 2);
408 MP_CHECKOK( mp_sqr(montBase
, &power2
) ); /* power2 = montBase ** 2 */
409 MP_CHECKOK( s_mp_redc(&power2
, mmm
) );
411 for (i
= 1; i
< odd_ints
; ++i
) {
412 mp_init_size(oddPowers
+ i
, nLen
+ 2 * MP_USED(&power2
) + 2);
413 MP_CHECKOK( mp_mul(oddPowers
+ (i
- 1), &power2
, oddPowers
+ i
) );
414 MP_CHECKOK( s_mp_redc(oddPowers
+ i
, mmm
) );
417 /* set accumulator to montgomery residue of 1 */
419 MP_CHECKOK( s_mp_to_mont(&accum1
, mmm
, &accum1
) );
423 for (expOff
= bits_in_exponent
- window_bits
; expOff
>= 0; expOff
-= window_bits
) {
425 MP_CHECKOK( mpl_get_bits(exponent
, expOff
, window_bits
) );
426 smallExp
= (mp_size
)res
;
428 if (window_bits
== 1) {
430 SQR(pa1
,pa2
); SWAPPA
;
431 } else if (smallExp
& 1) {
432 SQR(pa1
,pa2
); MUL(0,pa2
,pa1
);
436 } else if (window_bits
== 4) {
438 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
439 } else if (smallExp
& 1) {
440 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
441 MUL(smallExp
/2, pa1
,pa2
); SWAPPA
;
442 } else if (smallExp
& 2) {
443 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
444 MUL(smallExp
/4,pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
445 } else if (smallExp
& 4) {
446 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/8,pa1
,pa2
);
447 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
448 } else if (smallExp
& 8) {
449 SQR(pa1
,pa2
); MUL(smallExp
/16,pa2
,pa1
); SQR(pa1
,pa2
);
450 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
454 } else if (window_bits
== 5) {
456 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
457 SQR(pa1
,pa2
); SWAPPA
;
458 } else if (smallExp
& 1) {
459 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
460 SQR(pa1
,pa2
); MUL(smallExp
/2,pa2
,pa1
);
461 } else if (smallExp
& 2) {
462 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
463 MUL(smallExp
/4,pa1
,pa2
); SQR(pa2
,pa1
);
464 } else if (smallExp
& 4) {
465 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
466 MUL(smallExp
/8,pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
467 } else if (smallExp
& 8) {
468 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/16,pa1
,pa2
);
469 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
470 } else if (smallExp
& 0x10) {
471 SQR(pa1
,pa2
); MUL(smallExp
/32,pa2
,pa1
); SQR(pa1
,pa2
);
472 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
476 } else if (window_bits
== 6) {
478 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
479 SQR(pa1
,pa2
); SQR(pa2
,pa1
);
480 } else if (smallExp
& 1) {
481 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
482 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/2,pa1
,pa2
); SWAPPA
;
483 } else if (smallExp
& 2) {
484 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
485 SQR(pa1
,pa2
); MUL(smallExp
/4,pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
486 } else if (smallExp
& 4) {
487 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
488 MUL(smallExp
/8,pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
489 } else if (smallExp
& 8) {
490 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
491 MUL(smallExp
/16,pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
492 SQR(pa1
,pa2
); SWAPPA
;
493 } else if (smallExp
& 0x10) {
494 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/32,pa1
,pa2
);
495 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
496 } else if (smallExp
& 0x20) {
497 SQR(pa1
,pa2
); MUL(smallExp
/64,pa2
,pa1
); SQR(pa1
,pa2
);
498 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
507 res
= s_mp_redc(pa1
, mmm
);
508 mp_exch(pa1
, result
);
514 for (i
= 0; i
< odd_ints
; ++i
) {
515 mp_clear(oddPowers
+ i
);
522 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
523 unsigned int mp_using_cache_safe_exp
= 1;
526 mp_err
mp_set_safe_modexp(int value
)
528 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
529 mp_using_cache_safe_exp
= value
;
539 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
540 #define WEAVE_WORD_SIZE 4
542 #ifndef MP_CHAR_STORE_SLOW
544 * mpi_to_weave takes an array of bignums, a matrix in which each bignum
545 * occupies all the columns of a row, and transposes it into a matrix in
546 * which each bignum occupies a column of every row. The first row of the
547 * input matrix becomes the first column of the output matrix. The n'th
548 * row of input becomes the n'th column of output. The input data is said
549 * to be "interleaved" or "woven" into the output matrix.
551 * The array of bignums is left in this woven form. Each time a single
552 * bignum value is needed, it is recreated by fetching the n'th column,
553 * forming a single row which is the new bignum.
555 * The purpose of this interleaving is to make it impossible to determine which
556 * of the bignums is being used in any one operation by examining the pattern
559 * The weaving function does not transpose the entire input matrix in one call.
560 * It transposes 4 rows of mp_ints into their respective columns of output.
562 * There are two different implementations of the weaving and unweaving code
563 * in this file. One uses byte loads and stores. The second uses loads and
564 * stores of mp_weave_word size values. The weaved forms of these two
565 * implementations differ. Consequently, each one has its own explanation.
567 * Here is the explanation for the byte-at-a-time implementation.
569 * This implementation treats each mp_int bignum as an array of bytes,
570 * rather than as an array of mp_digits. It stores those bytes as a
571 * column of bytes in the output matrix. It doesn't care if the machine
572 * uses big-endian or little-endian byte ordering within mp_digits.
573 * The first byte of the mp_digit array becomes the first byte in the output
574 * column, regardless of whether that byte is the MSB or LSB of the mp_digit.
576 * "bignums" is an array of mp_ints.
577 * It points to four rows, four mp_ints, a subset of a larger array of mp_ints.
579 * "weaved" is the weaved output matrix.
580 * The first byte of bignums[0] is stored in weaved[0].
582 * "nBignums" is the total number of bignums in the array of which "bignums"
585 * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array.
586 * mp_ints that use less than nDigits digits are logically padded with zeros
587 * while being stored in the weaved array.
589 mp_err
mpi_to_weave(const mp_int
*bignums
,
590 unsigned char *weaved
,
591 mp_size nDigits
, /* in each mp_int of input */
592 mp_size nBignums
) /* in the entire source array */
595 unsigned char * endDest
= weaved
+ (nDigits
* nBignums
* sizeof(mp_digit
));
597 for (i
=0; i
< WEAVE_WORD_SIZE
; i
++) {
598 mp_size used
= MP_USED(&bignums
[i
]);
599 unsigned char *pSrc
= (unsigned char *)MP_DIGITS(&bignums
[i
]);
600 unsigned char *endSrc
= pSrc
+ (used
* sizeof(mp_digit
));
601 unsigned char *pDest
= weaved
+ i
;
603 ARGCHK(MP_SIGN(&bignums
[i
]) == MP_ZPOS
, MP_BADARG
);
604 ARGCHK(used
<= nDigits
, MP_BADARG
);
606 for (; pSrc
< endSrc
; pSrc
++) {
610 while (pDest
< endDest
) {
619 /* Reverse the operation above for one mp_int.
620 * Reconstruct one mp_int from its column in the weaved array.
621 * "pSrc" points to the offset into the weave array of the bignum we
622 * are going to reconstruct.
624 mp_err
weave_to_mpi(mp_int
*a
, /* output, result */
625 const unsigned char *pSrc
, /* input, byte matrix */
626 mp_size nDigits
, /* per mp_int output */
627 mp_size nBignums
) /* bignums in weaved matrix */
629 unsigned char *pDest
= (unsigned char *)MP_DIGITS(a
);
630 unsigned char *endDest
= pDest
+ (nDigits
* sizeof(mp_digit
));
632 MP_SIGN(a
) = MP_ZPOS
;
633 MP_USED(a
) = nDigits
;
635 for (; pDest
< endDest
; pSrc
+= nBignums
, pDest
++) {
644 /* Need a primitive that we know is 32 bits long... */
645 /* this is true on all modern processors we know of today*/
646 typedef unsigned int mp_weave_word
;
649 * on some platforms character stores into memory is very expensive since they
650 * generate a read/modify/write operation on the bus. On those platforms
651 * we need to do integer writes to the bus. Because of some unrolled code,
652 * in this current code the size of mp_weave_word must be four. The code that
653 * makes this assumption explicitly is called out. (on some platforms a write
654 * of 4 bytes still requires a single read-modify-write operation.
656 * This function takes the identical parameters as the function above,
657 * however it lays out the final array differently. Where the previous function
658 * treats the mpi_int as a byte array, this function treats it as an array of
659 * mp_digits where each digit is stored in big endian order.
661 * since we need to interleave on a byte by byte basis, we need to collect
662 * several mpi structures together into a single uint32 before we write. We
663 * also need to make sure the uint32 is arranged so that the first value of
664 * the first array winds up in b[0]. This means construction of that uint32
665 * is endian specific (even though the layout of the mp_digits in the array
666 * is always big endian).
668 * The final data is stored as follows :
670 * Our same logical array p array, m is sizeof(mp_digit),
671 * N is still count and n is now b_size. If we define p[i].digit[j]0 as the
672 * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as
673 * the next most significant byte of p[i].digit[j], ... and p[i].digit[j]m-1
674 * is the least significant byte.
675 * Our array would look like:
676 * p[0].digit[0]0 p[1].digit[0]0 ... p[N-2].digit[0]0 p[N-1].digit[0]0
677 * p[0].digit[0]1 p[1].digit[0]1 ... p[N-2].digit[0]1 p[N-1].digit[0]1
679 * p[0].digit[0]m-1 p[1].digit[0]m-1 ... p[N-2].digit[0]m-1 p[N-1].digit[0]m-1
680 * p[0].digit[1]0 p[1].digit[1]0 ... p[N-2].digit[1]0 p[N-1].digit[1]0
683 * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
684 * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1
687 mp_err
mpi_to_weave(const mp_int
*a
, unsigned char *b
,
688 mp_size b_size
, mp_size count
)
699 mp_weave_word
*weaved
= (mp_weave_word
*)b
;
701 count
= count
/sizeof(mp_weave_word
);
703 /* this code pretty much depends on this ! */
705 assert(WEAVE_WORD_SIZE
== 4);
706 assert(sizeof(mp_weave_word
) == 4);
709 digitsa0
= MP_DIGITS(&a
[0]);
710 digitsa1
= MP_DIGITS(&a
[1]);
711 digitsa2
= MP_DIGITS(&a
[2]);
712 digitsa3
= MP_DIGITS(&a
[3]);
713 useda0
= MP_USED(&a
[0]);
714 useda1
= MP_USED(&a
[1]);
715 useda2
= MP_USED(&a
[2]);
716 useda3
= MP_USED(&a
[3]);
718 ARGCHK(MP_SIGN(&a
[0]) == MP_ZPOS
, MP_BADARG
);
719 ARGCHK(MP_SIGN(&a
[1]) == MP_ZPOS
, MP_BADARG
);
720 ARGCHK(MP_SIGN(&a
[2]) == MP_ZPOS
, MP_BADARG
);
721 ARGCHK(MP_SIGN(&a
[3]) == MP_ZPOS
, MP_BADARG
);
722 ARGCHK(useda0
<= b_size
, MP_BADARG
);
723 ARGCHK(useda1
<= b_size
, MP_BADARG
);
724 ARGCHK(useda2
<= b_size
, MP_BADARG
);
725 ARGCHK(useda3
<= b_size
, MP_BADARG
);
727 #define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)
729 for (i
=0; i
< b_size
; i
++) {
730 mp_digit d0
= SAFE_FETCH(digitsa0
,useda0
,i
);
731 mp_digit d1
= SAFE_FETCH(digitsa1
,useda1
,i
);
732 mp_digit d2
= SAFE_FETCH(digitsa2
,useda2
,i
);
733 mp_digit d3
= SAFE_FETCH(digitsa3
,useda3
,i
);
734 register mp_weave_word acc
;
737 * ONE_STEP takes the MSB of each of our current digits and places that
738 * byte in the appropriate position for writing to the weaved array.
743 * When the data is written it would always wind up:
749 * Once we've written the MSB, we shift the whole digit up left one
750 * byte, putting the Next Most Significant Byte in the MSB position,
751 * so we we repeat the next one step that byte will be written.
752 * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE
755 #ifdef MP_IS_LITTLE_ENDIAN
756 #define MPI_WEAVE_ONE_STEP \
757 acc = (d0 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d0 <<= 8; /*b0*/ \
758 acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
759 acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
760 acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
761 *weaved = acc; weaved += count;
763 #define MPI_WEAVE_ONE_STEP \
764 acc = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
765 acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
766 acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
767 acc |= (d3 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d3 <<= 8; /*b3*/ \
768 *weaved = acc; weaved += count;
770 switch (sizeof(mp_digit
)) {
816 /* reverse the operation above for one entry.
817 * b points to the offset into the weave array of the power we are
819 mp_err
weave_to_mpi(mp_int
*a
, const unsigned char *b
,
820 mp_size b_size
, mp_size count
)
822 mp_digit
*pb
= MP_DIGITS(a
);
823 mp_digit
*end
= &pb
[b_size
];
825 MP_SIGN(a
) = MP_ZPOS
;
828 for (; pb
< end
; pb
++) {
829 register mp_digit digit
;
831 digit
= *b
<< 8; b
+= count
;
832 #define MPI_UNWEAVE_ONE_STEP digit |= *b; b += count; digit = digit << 8;
833 switch (sizeof(mp_digit
)) {
871 digit
|= *b
; b
+= count
;
/* Helpers for the cache-safe path.  MUL fetches the required power out of
 * the weaved table (constant memory-access pattern across all powers)
 * before multiplying; MUL_NOWEAVE multiplies by an mp_int directly. */
#undef SQR
#undef MUL
#define SQR(a,b) \
  MP_CHECKOK( mp_sqr(a, b) );\
  MP_CHECKOK( s_mp_redc(b, mmm) )

#if defined(MP_MONT_USE_MP_MUL)
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( mp_mul(a, x, b) ); \
  MP_CHECKOK( s_mp_redc(b, mmm) )
#else
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )
#endif

#define MUL(x,a,b) \
  MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
  MUL_NOWEAVE(&tmp,a,b)

#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
#define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y)))
901 /* Do modular exponentiation using integer multiply code. */
902 mp_err
mp_exptmod_safe_i(const mp_int
* montBase
,
903 const mp_int
* exponent
,
904 const mp_int
* modulus
,
906 mp_mont_modulus
*mmm
,
908 mp_size bits_in_exponent
,
912 mp_int
*pa1
, *pa2
, *ptmp
;
914 mp_size first_window
;
917 mp_int accum1
, accum2
, accum
[WEAVE_WORD_SIZE
];
919 unsigned char *powersArray
;
920 unsigned char *powers
;
922 MP_DIGITS(&accum1
) = 0;
923 MP_DIGITS(&accum2
) = 0;
924 MP_DIGITS(&accum
[0]) = 0;
925 MP_DIGITS(&accum
[1]) = 0;
926 MP_DIGITS(&accum
[2]) = 0;
927 MP_DIGITS(&accum
[3]) = 0;
930 powersArray
= (unsigned char *)malloc(num_powers
*(nLen
*sizeof(mp_digit
)+1));
931 if (powersArray
== NULL
) {
936 /* powers[i] = base ** (i); */
937 powers
= (unsigned char *)MP_ALIGN(powersArray
,num_powers
);
939 /* grab the first window value. This allows us to preload accumulator1
940 * and save a conversion, some squares and a multiple*/
941 MP_CHECKOK( mpl_get_bits(exponent
,
942 bits_in_exponent
-window_bits
, window_bits
) );
943 first_window
= (mp_size
)res
;
945 MP_CHECKOK( mp_init_size(&accum1
, 3 * nLen
+ 2) );
946 MP_CHECKOK( mp_init_size(&accum2
, 3 * nLen
+ 2) );
947 MP_CHECKOK( mp_init_size(&tmp
, 3 * nLen
+ 2) );
949 /* build the first WEAVE_WORD powers inline */
950 /* if WEAVE_WORD_SIZE is not 4, this code will have to change */
951 if (num_powers
> 2) {
952 MP_CHECKOK( mp_init_size(&accum
[0], 3 * nLen
+ 2) );
953 MP_CHECKOK( mp_init_size(&accum
[1], 3 * nLen
+ 2) );
954 MP_CHECKOK( mp_init_size(&accum
[2], 3 * nLen
+ 2) );
955 MP_CHECKOK( mp_init_size(&accum
[3], 3 * nLen
+ 2) );
956 mp_set(&accum
[0], 1);
957 MP_CHECKOK( s_mp_to_mont(&accum
[0], mmm
, &accum
[0]) );
958 MP_CHECKOK( mp_copy(montBase
, &accum
[1]) );
959 SQR(montBase
, &accum
[2]);
960 MUL_NOWEAVE(montBase
, &accum
[2], &accum
[3]);
961 MP_CHECKOK( mpi_to_weave(accum
, powers
, nLen
, num_powers
) );
962 if (first_window
< 4) {
963 MP_CHECKOK( mp_copy(&accum
[first_window
], &accum1
) );
964 first_window
= num_powers
;
967 if (first_window
== 0) {
969 MP_CHECKOK( s_mp_to_mont(&accum1
, mmm
, &accum1
) );
971 /* assert first_window == 1? */
972 MP_CHECKOK( mp_copy(montBase
, &accum1
) );
977 * calculate all the powers in the powers array.
978 * this adds 2**(k-1)-2 square operations over just calculating the
979 * odd powers where k is the window size in the two other mp_modexpt
980 * implementations in this file. We will get some of that
981 * back by not needing the first 'k' squares and one multiply for the
983 for (i
= WEAVE_WORD_SIZE
; i
< num_powers
; i
++) {
984 int acc_index
= i
& (WEAVE_WORD_SIZE
-1); /* i % WEAVE_WORD_SIZE */
986 MUL_NOWEAVE(montBase
, &accum
[acc_index
-1] , &accum
[acc_index
]);
987 /* we've filled the array do our 'per array' processing */
988 if (acc_index
== (WEAVE_WORD_SIZE
-1)) {
989 MP_CHECKOK( mpi_to_weave(accum
, powers
+ i
- (WEAVE_WORD_SIZE
-1),
992 if (first_window
<= i
) {
993 MP_CHECKOK( mp_copy(&accum
[first_window
& (WEAVE_WORD_SIZE
-1)],
995 first_window
= num_powers
;
999 /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source
1000 * and target are the same so we need to copy.. After that, the
1001 * value is overwritten, so we need to fetch it from the stored
1003 if (i
> 2* WEAVE_WORD_SIZE
) {
1004 MP_CHECKOK(weave_to_mpi(&accum2
, powers
+i
/2, nLen
, num_powers
));
1005 SQR(&accum2
, &accum
[acc_index
]);
1007 int half_power_index
= (i
/2) & (WEAVE_WORD_SIZE
-1);
1008 if (half_power_index
== acc_index
) {
1009 /* copy is cheaper than weave_to_mpi */
1010 MP_CHECKOK(mp_copy(&accum
[half_power_index
], &accum2
));
1011 SQR(&accum2
,&accum
[acc_index
]);
1013 SQR(&accum
[half_power_index
],&accum
[acc_index
]);
1018 /* if the accum1 isn't set, Then there is something wrong with our logic
1019 * above and is an internal programming error.
1022 assert(MP_USED(&accum1
) != 0);
1025 /* set accumulator to montgomery residue of 1 */
1029 for (expOff
= bits_in_exponent
- window_bits
*2; expOff
>= 0; expOff
-= window_bits
) {
1031 MP_CHECKOK( mpl_get_bits(exponent
, expOff
, window_bits
) );
1032 smallExp
= (mp_size
)res
;
1034 /* handle unroll the loops */
1035 switch (window_bits
) {
1038 SQR(pa1
,pa2
); SWAPPA
;
1039 } else if (smallExp
& 1) {
1040 SQR(pa1
,pa2
); MUL_NOWEAVE(montBase
,pa2
,pa1
);
1046 SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1049 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1050 MUL(smallExp
, pa1
,pa2
); SWAPPA
;
1053 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1054 SQR(pa1
,pa2
); MUL(smallExp
,pa2
,pa1
);
1057 ABORT
; /* could do a loop? */
1061 res
= s_mp_redc(pa1
, mmm
);
1062 mp_exch(pa1
, result
);
1067 mp_clear(&accum
[0]);
1068 mp_clear(&accum
[1]);
1069 mp_clear(&accum
[2]);
1070 mp_clear(&accum
[3]);
1072 /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */
1080 mp_err
mp_exptmod(const mp_int
*inBase
, const mp_int
*exponent
,
1081 const mp_int
*modulus
, mp_int
*result
)
1084 mp_size bits_in_exponent
, i
, window_bits
, odd_ints
;
1087 mp_int montBase
, goodBase
;
1088 mp_mont_modulus mmm
;
1089 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1090 static unsigned int max_window_bits
;
1093 /* function for computing n0prime only works if n0 is odd */
1094 if (!mp_isodd(modulus
))
1095 return s_mp_exptmod(inBase
, exponent
, modulus
, result
);
1097 MP_DIGITS(&montBase
) = 0;
1098 MP_DIGITS(&goodBase
) = 0;
1100 if (mp_cmp(inBase
, modulus
) < 0) {
1103 MP_CHECKOK( mp_init(&goodBase
) );
1105 MP_CHECKOK( mp_mod(inBase
, modulus
, &goodBase
) );
1108 nLen
= MP_USED(modulus
);
1109 MP_CHECKOK( mp_init_size(&montBase
, 2 * nLen
+ 2) );
1111 mmm
.N
= *modulus
; /* a copy of the mp_int struct */
1112 i
= mpl_significant_bits(modulus
);
1113 i
+= MP_DIGIT_BIT
- 1;
1114 mmm
.b
= i
- i
% MP_DIGIT_BIT
;
1116 /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX
1117 ** where n0 = least significant mp_digit of N, the modulus.
1119 mmm
.n0prime
= 0 - s_mp_invmod_radix( MP_DIGIT(modulus
, 0) );
1121 MP_CHECKOK( s_mp_to_mont(base
, &mmm
, &montBase
) );
1123 bits_in_exponent
= mpl_significant_bits(exponent
);
1124 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1125 if (mp_using_cache_safe_exp
) {
1126 if (bits_in_exponent
> 780)
1128 else if (bits_in_exponent
> 256)
1130 else if (bits_in_exponent
> 20)
1132 /* RSA public key exponents are typically under 20 bits (common values
1133 * are: 3, 17, 65537) and a 4-bit window is inefficient
1139 if (bits_in_exponent
> 480)
1141 else if (bits_in_exponent
> 160)
1143 else if (bits_in_exponent
> 20)
1145 /* RSA public key exponents are typically under 20 bits (common values
1146 * are: 3, 17, 65537) and a 4-bit window is inefficient
1151 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1153 * clamp the window size based on
1154 * the cache line size.
1156 if (!max_window_bits
) {
1157 unsigned long cache_size
= s_mpi_getProcessorLineSize();
1158 /* processor has no cache, use 'fast' code always */
1159 if (cache_size
== 0) {
1160 mp_using_cache_safe_exp
= 0;
1162 if ((cache_size
== 0) || (cache_size
>= 64)) {
1163 max_window_bits
= 6;
1164 } else if (cache_size
>= 32) {
1165 max_window_bits
= 5;
1166 } else if (cache_size
>= 16) {
1167 max_window_bits
= 4;
1168 } else max_window_bits
= 1; /* should this be an assert? */
1171 /* clamp the window size down before we calculate bits_in_exponent */
1172 if (mp_using_cache_safe_exp
) {
1173 if (window_bits
> max_window_bits
) {
1174 window_bits
= max_window_bits
;
1179 odd_ints
= 1 << (window_bits
- 1);
1180 i
= bits_in_exponent
% window_bits
;
1182 bits_in_exponent
+= window_bits
- i
;
1185 #ifdef MP_USING_MONT_MULF
1186 if (mp_using_mont_mulf
) {
1187 MP_CHECKOK( s_mp_pad(&montBase
, nLen
) );
1188 res
= mp_exptmod_f(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1189 bits_in_exponent
, window_bits
, odd_ints
);
1192 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1193 if (mp_using_cache_safe_exp
) {
1194 res
= mp_exptmod_safe_i(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1195 bits_in_exponent
, window_bits
, 1 << window_bits
);
1198 res
= mp_exptmod_i(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1199 bits_in_exponent
, window_bits
, odd_ints
);
1202 mp_clear(&montBase
);
1203 mp_clear(&goodBase
);
1204 /* Don't mp_clear mmm.N because it is merely a copy of modulus.
1207 memset(&mmm
, 0, sizeof mmm
);