1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Original Code is the Netscape security libraries.
16 * The Initial Developer of the Original Code is
17 * Netscape Communications Corporation.
18 * Portions created by the Initial Developer are Copyright (C) 2000
19 * the Initial Developer. All Rights Reserved.
22 * Sheueling Chang Shantz <sheueling.chang@sun.com>,
23 * Stephen Fung <stephen.fung@sun.com>, and
24 * Douglas Stebila <douglas@stebila.ca> of Sun Laboratories.
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
39 /* $Id: mpmontg.c,v 1.20 2006/08/29 02:41:38 nelson%bolyard.com Exp $ */
41 /* This file implements modular exponentiation using Montgomery's
42 * method for modular reduction. This file implements the method
43 * described as "Improvement 1" in the paper "A Cryptographic Library for
44 * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.
45 * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"
46 * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,
47 * published by Springer Verlag.
50 #define MP_USING_CACHE_SAFE_MOD_EXP 1
55 #ifdef MP_USING_MONT_MULF
58 #include <stddef.h> /* ptrdiff_t */
60 /* if MP_CHAR_STORE_SLOW is defined, we */
61 /* need to know endianness of this platform. */
62 #ifdef MP_CHAR_STORE_SLOW
63 #if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)
64 #error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \
65 " if you define MP_CHAR_STORE_SLOW."
71 #define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */
73 #if defined(_WIN32_WCE)
74 #define ABORT res = MP_UNDEF; goto CLEANUP
79 /* computes T = REDC(T), 2^b == R */
80 mp_err
s_mp_redc(mp_int
*T
, mp_mont_modulus
*mmm
)
85 i
= MP_USED(T
) + MP_USED(&mmm
->N
) + 2;
86 MP_CHECKOK( s_mp_pad(T
, i
) );
87 for (i
= 0; i
< MP_USED(&mmm
->N
); ++i
) {
88 mp_digit m_i
= MP_DIGIT(T
, i
) * mmm
->n0prime
;
89 /* T += N * m_i * (MP_RADIX ** i); */
90 MP_CHECKOK( s_mp_mul_d_add_offset(&mmm
->N
, m_i
, T
, i
) );
95 s_mp_div_2d(T
, mmm
->b
);
97 if ((res
= s_mp_cmp(T
, &mmm
->N
)) >= 0) {
99 MP_CHECKOK( s_mp_sub(T
, &mmm
->N
) );
101 if ((res
= mp_cmp(T
, &mmm
->N
)) >= 0) {
112 #if !defined(MP_ASSEMBLY_MUL_MONT) && !defined(MP_MONT_USE_MP_MUL)
113 mp_err
s_mp_mul_mont(const mp_int
*a
, const mp_int
*b
, mp_int
*c
,
114 mp_mont_modulus
*mmm
)
120 mp_size useda
, usedb
;
122 ARGCHK(a
!= NULL
&& b
!= NULL
&& c
!= NULL
, MP_BADARG
);
124 if (MP_USED(a
) < MP_USED(b
)) {
125 const mp_int
*xch
= b
; /* switch a and b, to do fewer outer loops */
130 MP_USED(c
) = 1; MP_DIGIT(c
, 0) = 0;
131 ib
= MP_USED(a
) + MP_MAX(MP_USED(b
), MP_USED(&mmm
->N
)) + 2;
132 if((res
= s_mp_pad(c
, ib
)) != MP_OKAY
)
137 s_mpv_mul_d(MP_DIGITS(a
), useda
, *pb
++, MP_DIGITS(c
));
138 s_mp_setz(MP_DIGITS(c
) + useda
+ 1, ib
- (useda
+ 1));
139 m_i
= MP_DIGIT(c
, 0) * mmm
->n0prime
;
140 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, 0);
142 /* Outer loop: Digits of b */
144 for (ib
= 1; ib
< usedb
; ib
++) {
145 mp_digit b_i
= *pb
++;
147 /* Inner product: Digits of a */
149 s_mpv_mul_d_add_prop(MP_DIGITS(a
), useda
, b_i
, MP_DIGITS(c
) + ib
);
150 m_i
= MP_DIGIT(c
, ib
) * mmm
->n0prime
;
151 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, ib
);
153 if (usedb
< MP_USED(&mmm
->N
)) {
154 for (usedb
= MP_USED(&mmm
->N
); ib
< usedb
; ++ib
) {
155 m_i
= MP_DIGIT(c
, ib
) * mmm
->n0prime
;
156 s_mp_mul_d_add_offset(&mmm
->N
, m_i
, c
, ib
);
160 s_mp_div_2d(c
, mmm
->b
);
161 if (s_mp_cmp(c
, &mmm
->N
) >= 0) {
162 MP_CHECKOK( s_mp_sub(c
, &mmm
->N
) );
172 mp_err
s_mp_to_mont(const mp_int
*x
, mp_mont_modulus
*mmm
, mp_int
*xMont
)
176 /* xMont = x * R mod N where N is modulus */
177 MP_CHECKOK( mpl_lsh(x
, xMont
, mmm
->b
) ); /* xMont = x << b */
178 MP_CHECKOK( mp_div(xMont
, &mmm
->N
, 0, xMont
) ); /* mod N */
#ifdef MP_USING_MONT_MULF

/* the floating point multiply is already cache safe,
 * don't turn on cache safe unless we specifically
 * force it */
#ifndef MP_FORCE_CACHE_SAFE
#undef MP_USING_CACHE_SAFE_MOD_EXP
#endif

unsigned int mp_using_mont_mulf = 1;

/* computes montgomery square of the integer in mResult */
#define SQR \
  conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \
  mont_mulf_noconv(mResult, dm1, d16Tmp, \
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

/* computes montgomery product of x and the integer in mResult */
#define MUL(x) \
  conv_i32_to_d32(dm1, mResult, nLen); \
  mont_mulf_noconv(mResult, dm1, oddPowers[x], \
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

/* Do modular exponentiation using floating point multiply code.
 *
 * result = montBase ** exponent mod modulus (Montgomery domain in,
 * plain domain out via the final REDC).  Uses a fixed-width window of
 * window_bits with a precomputed table of the odd_ints odd powers of
 * montBase, all held in double-precision arrays for mont_mulf_noconv. */
mp_err mp_exptmod_f(const mp_int *   montBase,
		    const mp_int *   exponent,
		    const mp_int *   modulus,
		    mp_int *         result,
		    mp_mont_modulus *mmm,
		    int              nLen,
		    mp_size          bits_in_exponent,
		    mp_size          window_bits,
		    mp_size          odd_ints)
{
  mp_digit *mResult;
  double   *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp;
  double    dn0;
  mp_size   i;
  mp_err    res;
  int       expOff;
  int       dSize = 0, oddPowSize, dTmpSize;
  mp_int    accum1;
  double   *oddPowers[MAX_ODD_INTS];

  /* function for computing n0prime only works if n0 is odd */

  MP_DIGITS(&accum1) = 0;

  for (i = 0; i < MAX_ODD_INTS; ++i)
    oddPowers[i] = 0;

  MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );

  /* accum1 = montgomery residue of 1 */
  mp_set(&accum1, 1);
  MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
  MP_CHECKOK( s_mp_pad(&accum1, nLen) );

  /* carve one big allocation into the working double arrays */
  oddPowSize = 2 * nLen + 1;
  dTmpSize   = 2 * oddPowSize;
  dSize = sizeof(double) * (nLen * 4 + 1 +
			    ((odd_ints + 1) * oddPowSize) + dTmpSize);
  dBuf = (double *)malloc(dSize);
  if (!dBuf) {              /* fix: malloc result was previously unchecked */
    res = MP_MEM;
    goto CLEANUP;
  }
  dm1    = dBuf;            /* array of d32 */
  dn     = dBuf + nLen;     /* array of d32 */
  dSqr   = dn   + nLen;     /* array of d32 */
  d16Tmp = dSqr + nLen;     /* array of d16 */
  dTmp   = d16Tmp + oddPowSize;

  for (i = 0; i < odd_ints; ++i) {
    oddPowers[i] = dTmp;
    dTmp += oddPowSize;
  }
  mResult = (mp_digit *)(dTmp + dTmpSize);   /* size is nLen + 1 */

  /* Make dn and dn0 */
  conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);
  dn0 = (double)(mmm->n0prime & 0xffff);

  /* Make dSqr = montBase**2, and oddPowers[0] = montBase */
  conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen);
  mont_mulf_noconv(mResult, dm1, oddPowers[0],
		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
  conv_i32_to_d32(dSqr, mResult, nLen);

  /* oddPowers[i] = montBase ** (2*i + 1), built by repeated mult by dSqr */
  for (i = 1; i < odd_ints; ++i) {
    mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1],
		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
    conv_i32_to_d16(oddPowers[i], mResult, nLen);
  }

  s_mp_copy(MP_DIGITS(&accum1), mResult, nLen);   /* from, to, len */

  /* scan the exponent from the most significant window down; each window
   * costs window_bits squarings plus at most one table multiply. */
  for (expOff = bits_in_exponent - window_bits; expOff >= 0;
					       expOff -= window_bits) {
    mp_size smallExp;
    MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
    smallExp = (mp_size)res;

    /* the multiply is slid to the position of the lowest set bit of the
     * window so that only odd powers need to be stored. */
    if (window_bits == 1) {
      if (!smallExp) {
	SQR;
      } else if (smallExp & 1) {
	SQR; MUL(0);
      } else {
	ABORT;
      }
    } else if (window_bits == 4) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else if (window_bits == 5) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else if (smallExp & 0x10) {
	SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else if (window_bits == 6) {
      if (!smallExp) {
	SQR; SQR; SQR; SQR; SQR; SQR;
      } else if (smallExp & 1) {
	SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
      } else if (smallExp & 2) {
	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
      } else if (smallExp & 4) {
	SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
      } else if (smallExp & 8) {
	SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
      } else if (smallExp & 0x10) {
	SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
      } else if (smallExp & 0x20) {
	SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR;
      } else {
	ABORT;
      }
    } else {
      ABORT;
    }
  }

  s_mp_copy(mResult, MP_DIGITS(&accum1), nLen);   /* from, to, len */

  res = s_mp_redc(&accum1, mmm);
  mp_exch(&accum1, result);

CLEANUP:
  mp_clear(&accum1);
  if (dBuf) {
    if (dSize)
      memset(dBuf, 0, dSize);   /* scrub intermediate values before free */
    free(dBuf);
  }

  return res;
}
#endif
/* Montgomery square/multiply helpers for the integer (non-mulf) path.
 * Both leave their result already reduced via REDC; they assume the
 * enclosing function declares mmm, oddPowers, res, and a CLEANUP label. */
#undef SQR
#undef MUL
#define SQR(a,b) \
  MP_CHECKOK( mp_sqr(a, b) );\
  MP_CHECKOK( s_mp_redc(b, mmm) )

#if defined(MP_MONT_USE_MP_MUL)
#define MUL(x,a,b) \
  MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \
  MP_CHECKOK( s_mp_redc(b, mmm) )
#else
#define MUL(x,a,b) \
  MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) )
#endif

#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
375 /* Do modular exponentiation using integer multiply code. */
376 mp_err
mp_exptmod_i(const mp_int
* montBase
,
377 const mp_int
* exponent
,
378 const mp_int
* modulus
,
380 mp_mont_modulus
*mmm
,
382 mp_size bits_in_exponent
,
386 mp_int
*pa1
, *pa2
, *ptmp
;
390 mp_int accum1
, accum2
, power2
, oddPowers
[MAX_ODD_INTS
];
392 /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
393 /* oddPowers[i] = base ** (2*i + 1); */
395 MP_DIGITS(&accum1
) = 0;
396 MP_DIGITS(&accum2
) = 0;
397 MP_DIGITS(&power2
) = 0;
398 for (i
= 0; i
< MAX_ODD_INTS
; ++i
) {
399 MP_DIGITS(oddPowers
+ i
) = 0;
402 MP_CHECKOK( mp_init_size(&accum1
, 3 * nLen
+ 2) );
403 MP_CHECKOK( mp_init_size(&accum2
, 3 * nLen
+ 2) );
405 MP_CHECKOK( mp_init_copy(&oddPowers
[0], montBase
) );
407 mp_init_size(&power2
, nLen
+ 2 * MP_USED(montBase
) + 2);
408 MP_CHECKOK( mp_sqr(montBase
, &power2
) ); /* power2 = montBase ** 2 */
409 MP_CHECKOK( s_mp_redc(&power2
, mmm
) );
411 for (i
= 1; i
< odd_ints
; ++i
) {
412 mp_init_size(oddPowers
+ i
, nLen
+ 2 * MP_USED(&power2
) + 2);
413 MP_CHECKOK( mp_mul(oddPowers
+ (i
- 1), &power2
, oddPowers
+ i
) );
414 MP_CHECKOK( s_mp_redc(oddPowers
+ i
, mmm
) );
417 /* set accumulator to montgomery residue of 1 */
419 MP_CHECKOK( s_mp_to_mont(&accum1
, mmm
, &accum1
) );
423 for (expOff
= bits_in_exponent
- window_bits
; expOff
>= 0; expOff
-= window_bits
) {
425 MP_CHECKOK( mpl_get_bits(exponent
, expOff
, window_bits
) );
426 smallExp
= (mp_size
)res
;
428 if (window_bits
== 1) {
430 SQR(pa1
,pa2
); SWAPPA
;
431 } else if (smallExp
& 1) {
432 SQR(pa1
,pa2
); MUL(0,pa2
,pa1
);
436 } else if (window_bits
== 4) {
438 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
439 } else if (smallExp
& 1) {
440 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
441 MUL(smallExp
/2, pa1
,pa2
); SWAPPA
;
442 } else if (smallExp
& 2) {
443 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
444 MUL(smallExp
/4,pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
445 } else if (smallExp
& 4) {
446 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/8,pa1
,pa2
);
447 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
448 } else if (smallExp
& 8) {
449 SQR(pa1
,pa2
); MUL(smallExp
/16,pa2
,pa1
); SQR(pa1
,pa2
);
450 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
454 } else if (window_bits
== 5) {
456 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
457 SQR(pa1
,pa2
); SWAPPA
;
458 } else if (smallExp
& 1) {
459 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
460 SQR(pa1
,pa2
); MUL(smallExp
/2,pa2
,pa1
);
461 } else if (smallExp
& 2) {
462 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
463 MUL(smallExp
/4,pa1
,pa2
); SQR(pa2
,pa1
);
464 } else if (smallExp
& 4) {
465 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
466 MUL(smallExp
/8,pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
467 } else if (smallExp
& 8) {
468 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/16,pa1
,pa2
);
469 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
470 } else if (smallExp
& 0x10) {
471 SQR(pa1
,pa2
); MUL(smallExp
/32,pa2
,pa1
); SQR(pa1
,pa2
);
472 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
476 } else if (window_bits
== 6) {
478 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
479 SQR(pa1
,pa2
); SQR(pa2
,pa1
);
480 } else if (smallExp
& 1) {
481 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
482 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/2,pa1
,pa2
); SWAPPA
;
483 } else if (smallExp
& 2) {
484 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
485 SQR(pa1
,pa2
); MUL(smallExp
/4,pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
486 } else if (smallExp
& 4) {
487 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
488 MUL(smallExp
/8,pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
489 } else if (smallExp
& 8) {
490 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
);
491 MUL(smallExp
/16,pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
492 SQR(pa1
,pa2
); SWAPPA
;
493 } else if (smallExp
& 0x10) {
494 SQR(pa1
,pa2
); SQR(pa2
,pa1
); MUL(smallExp
/32,pa1
,pa2
);
495 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
496 } else if (smallExp
& 0x20) {
497 SQR(pa1
,pa2
); MUL(smallExp
/64,pa2
,pa1
); SQR(pa1
,pa2
);
498 SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SWAPPA
;
507 res
= s_mp_redc(pa1
, mmm
);
508 mp_exch(pa1
, result
);
514 for (i
= 0; i
< odd_ints
; ++i
) {
515 mp_clear(oddPowers
+ i
);
522 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
523 unsigned int mp_using_cache_safe_exp
= 1;
526 mp_err
mp_set_safe_modexp(int value
)
528 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
529 mp_using_cache_safe_exp
= value
;
539 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
540 #define WEAVE_WORD_SIZE 4
542 #ifndef MP_CHAR_STORE_SLOW
544 * mpi_to_weave takes an array of bignums, a matrix in which each bignum
545 * occupies all the columns of a row, and transposes it into a matrix in
546 * which each bignum occupies a column of every row. The first row of the
547 * input matrix becomes the first column of the output matrix. The n'th
548 * row of input becomes the n'th column of output. The input data is said
549 * to be "interleaved" or "woven" into the output matrix.
551 * The array of bignums is left in this woven form. Each time a single
552 * bignum value is needed, it is recreated by fetching the n'th column,
553 * forming a single row which is the new bignum.
555 * The purpose of this interleaving is to make it impossible to determine which
556 * of the bignums is being used in any one operation by examining the pattern
559 * The weaving function does not transpose the entire input matrix in one call.
560 * It transposes 4 rows of mp_ints into their respective columns of output.
562 * There are two different implementations of the weaving and unweaving code
563 * in this file. One uses byte loads and stores. The second uses loads and
564 * stores of mp_weave_word size values. The weaved forms of these two
565 * implementations differ. Consequently, each one has its own explanation.
567 * Here is the explanation for the byte-at-a-time implementation.
569 * This implementation treats each mp_int bignum as an array of bytes,
570 * rather than as an array of mp_digits. It stores those bytes as a
571 * column of bytes in the output matrix. It doesn't care if the machine
572 * uses big-endian or little-endian byte ordering within mp_digits.
573 * The first byte of the mp_digit array becomes the first byte in the output
574 * column, regardless of whether that byte is the MSB or LSB of the mp_digit.
576 * "bignums" is an array of mp_ints.
577 * It points to four rows, four mp_ints, a subset of a larger array of mp_ints.
579 * "weaved" is the weaved output matrix.
580 * The first byte of bignums[0] is stored in weaved[0].
582 * "nBignums" is the total number of bignums in the array of which "bignums"
585 * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array.
586 * mp_ints that use less than nDigits digits are logically padded with zeros
587 * while being stored in the weaved array.
589 mp_err
mpi_to_weave(const mp_int
*bignums
,
590 unsigned char *weaved
,
591 mp_size nDigits
, /* in each mp_int of input */
592 mp_size nBignums
) /* in the entire source array */
595 unsigned char * endDest
= weaved
+ (nDigits
* nBignums
* sizeof(mp_digit
));
597 for (i
=0; i
< WEAVE_WORD_SIZE
; i
++) {
598 mp_size used
= MP_USED(&bignums
[i
]);
599 unsigned char *pSrc
= (unsigned char *)MP_DIGITS(&bignums
[i
]);
600 unsigned char *endSrc
= pSrc
+ (used
* sizeof(mp_digit
));
601 unsigned char *pDest
= weaved
+ i
;
603 ARGCHK(MP_SIGN(&bignums
[i
]) == MP_ZPOS
, MP_BADARG
);
604 ARGCHK(used
<= nDigits
, MP_BADARG
);
606 for (; pSrc
< endSrc
; pSrc
++) {
610 while (pDest
< endDest
) {
619 /* Reverse the operation above for one mp_int.
620 * Reconstruct one mp_int from its column in the weaved array.
621 * "pSrc" points to the offset into the weave array of the bignum we
622 * are going to reconstruct.
624 mp_err
weave_to_mpi(mp_int
*a
, /* output, result */
625 const unsigned char *pSrc
, /* input, byte matrix */
626 mp_size nDigits
, /* per mp_int output */
627 mp_size nBignums
) /* bignums in weaved matrix */
629 unsigned char *pDest
= (unsigned char *)MP_DIGITS(a
);
630 unsigned char *endDest
= pDest
+ (nDigits
* sizeof(mp_digit
));
632 MP_SIGN(a
) = MP_ZPOS
;
633 MP_USED(a
) = nDigits
;
635 for (; pDest
< endDest
; pSrc
+= nBignums
, pDest
++) {
644 /* Need a primitive that we know is 32 bits long... */
645 /* this is true on all modern processors we know of today*/
646 typedef unsigned int mp_weave_word
;
649 * on some platforms character stores into memory is very expensive since they
650 * generate a read/modify/write operation on the bus. On those platforms
651 * we need to do integer writes to the bus. Because of some unrolled code,
652 * in this current code the size of mp_weave_word must be four. The code that
653 * makes this assumption explicitly is called out. (on some platforms a write
654 * of 4 bytes still requires a single read-modify-write operation.
656 * This function takes the identical parameters as the function above,
657 * however it lays out the final array differently. Where the previous function
658 * treats the mpi_int as a byte array, this function treats it as an array of
659 * mp_digits where each digit is stored in big endian order.
661 * since we need to interleave on a byte by byte basis, we need to collect
662 * several mpi structures together into a single uint32 before we write. We
663 * also need to make sure the uint32 is arranged so that the first value of
664 * the first array winds up in b[0]. This means construction of that uint32
665 * is endian specific (even though the layout of the mp_digits in the array
666 * is always big endian).
668 * The final data is stored as follows :
670 * Our same logical array p array, m is sizeof(mp_digit),
671 * N is still count and n is now b_size. If we define p[i].digit[j]0 as the
672 * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as
673 * the next most significant byte of p[i].digit[j], ... and p[i].digit[j]m-1
674 * is the least significant byte.
675 * Our array would look like:
676 * p[0].digit[0]0 p[1].digit[0]0 ... p[N-2].digit[0]0 p[N-1].digit[0]0
677 * p[0].digit[0]1 p[1].digit[0]1 ... p[N-2].digit[0]1 p[N-1].digit[0]1
679 * p[0].digit[0]m-1 p[1].digit[0]m-1 ... p[N-2].digit[0]m-1 p[N-1].digit[0]m-1
680 * p[0].digit[1]0 p[1].digit[1]0 ... p[N-2].digit[1]0 p[N-1].digit[1]0
683 * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
684 * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1
687 mp_err
mpi_to_weave(const mp_int
*a
, unsigned char *b
,
688 mp_size b_size
, mp_size count
)
699 mp_weave_word
*weaved
= (mp_weave_word
*)b
;
701 count
= count
/sizeof(mp_weave_word
);
703 /* this code pretty much depends on this ! */
705 assert(WEAVE_WORD_SIZE
== 4);
706 assert(sizeof(mp_weave_word
) == 4);
709 digitsa0
= MP_DIGITS(&a
[0]);
710 digitsa1
= MP_DIGITS(&a
[1]);
711 digitsa2
= MP_DIGITS(&a
[2]);
712 digitsa3
= MP_DIGITS(&a
[3]);
713 useda0
= MP_USED(&a
[0]);
714 useda1
= MP_USED(&a
[1]);
715 useda2
= MP_USED(&a
[2]);
716 useda3
= MP_USED(&a
[3]);
718 ARGCHK(MP_SIGN(&a
[0]) == MP_ZPOS
, MP_BADARG
);
719 ARGCHK(MP_SIGN(&a
[1]) == MP_ZPOS
, MP_BADARG
);
720 ARGCHK(MP_SIGN(&a
[2]) == MP_ZPOS
, MP_BADARG
);
721 ARGCHK(MP_SIGN(&a
[3]) == MP_ZPOS
, MP_BADARG
);
722 ARGCHK(useda0
<= b_size
, MP_BADARG
);
723 ARGCHK(useda1
<= b_size
, MP_BADARG
);
724 ARGCHK(useda2
<= b_size
, MP_BADARG
);
725 ARGCHK(useda3
<= b_size
, MP_BADARG
);
727 #define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)
729 for (i
=0; i
< b_size
; i
++) {
730 mp_digit d0
= SAFE_FETCH(digitsa0
,useda0
,i
);
731 mp_digit d1
= SAFE_FETCH(digitsa1
,useda1
,i
);
732 mp_digit d2
= SAFE_FETCH(digitsa2
,useda2
,i
);
733 mp_digit d3
= SAFE_FETCH(digitsa3
,useda3
,i
);
734 register mp_weave_word acc
;
737 * ONE_STEP takes the MSB of each of our current digits and places that
738 * byte in the appropriate position for writing to the weaved array.
743 * When the data is written it would always wind up:
749 * Once we've written the MSB, we shift the whole digit up left one
750 * byte, putting the Next Most Significant Byte in the MSB position,
751 * so we we repeat the next one step that byte will be written.
752 * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE
755 #ifdef MP_IS_LITTLE_ENDIAN
756 #define MPI_WEAVE_ONE_STEP \
757 acc = (d0 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d0 <<= 8; /*b0*/ \
758 acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
759 acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
760 acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
761 *weaved = acc; weaved += count;
763 #define MPI_WEAVE_ONE_STEP \
764 acc = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
765 acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
766 acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
767 acc |= (d3 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d3 <<= 8; /*b3*/ \
768 *weaved = acc; weaved += count;
770 switch (sizeof(mp_digit
)) {
816 /* reverse the operation above for one entry.
817 * b points to the offset into the weave array of the power we are
819 mp_err
weave_to_mpi(mp_int
*a
, const unsigned char *b
,
820 mp_size b_size
, mp_size count
)
822 mp_digit
*pb
= MP_DIGITS(a
);
823 mp_digit
*end
= &pb
[b_size
];
825 MP_SIGN(a
) = MP_ZPOS
;
828 for (; pb
< end
; pb
++) {
829 register mp_digit digit
;
831 digit
= *b
<< 8; b
+= count
;
832 #define MPI_UNWEAVE_ONE_STEP digit |= *b; b += count; digit = digit << 8;
833 switch (sizeof(mp_digit
)) {
871 digit
|= *b
; b
+= count
;
/* Helpers for the cache-safe path.  MUL fetches the required power out of
 * the weaved table (constant memory-access pattern across all powers)
 * before multiplying; MUL_NOWEAVE multiplies by an mp_int directly. */
#undef SQR
#undef MUL
#define SQR(a,b) \
  MP_CHECKOK( mp_sqr(a, b) );\
  MP_CHECKOK( s_mp_redc(b, mmm) )

#if defined(MP_MONT_USE_MP_MUL)
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( mp_mul(a, x, b) ); \
  MP_CHECKOK( s_mp_redc(b, mmm) )
#else
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )
#endif

#define MUL(x,a,b) \
  MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
  MUL_NOWEAVE(&tmp,a,b)

#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
#define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y)))
901 /* Do modular exponentiation using integer multiply code. */
902 mp_err
mp_exptmod_safe_i(const mp_int
* montBase
,
903 const mp_int
* exponent
,
904 const mp_int
* modulus
,
906 mp_mont_modulus
*mmm
,
908 mp_size bits_in_exponent
,
912 mp_int
*pa1
, *pa2
, *ptmp
;
914 mp_size first_window
;
917 mp_int accum1
, accum2
, accum
[WEAVE_WORD_SIZE
];
919 unsigned char *powersArray
;
920 unsigned char *powers
;
922 MP_DIGITS(&accum1
) = 0;
923 MP_DIGITS(&accum2
) = 0;
924 MP_DIGITS(&accum
[0]) = 0;
925 MP_DIGITS(&accum
[1]) = 0;
926 MP_DIGITS(&accum
[2]) = 0;
927 MP_DIGITS(&accum
[3]) = 0;
930 powersArray
= (unsigned char *)malloc(num_powers
*(nLen
*sizeof(mp_digit
)+1));
931 if (powersArray
== NULL
) {
936 /* powers[i] = base ** (i); */
937 powers
= (unsigned char *)MP_ALIGN(powersArray
,num_powers
);
939 /* grab the first window value. This allows us to preload accumulator1
940 * and save a conversion, some squares and a multiple*/
941 MP_CHECKOK( mpl_get_bits(exponent
,
942 bits_in_exponent
-window_bits
, window_bits
) );
943 first_window
= (mp_size
)res
;
945 MP_CHECKOK( mp_init_size(&accum1
, 3 * nLen
+ 2) );
946 MP_CHECKOK( mp_init_size(&accum2
, 3 * nLen
+ 2) );
947 MP_CHECKOK( mp_init_size(&tmp
, 3 * nLen
+ 2) );
949 /* build the first WEAVE_WORD powers inline */
950 /* if WEAVE_WORD_SIZE is not 4, this code will have to change */
951 if (num_powers
> 2) {
952 MP_CHECKOK( mp_init_size(&accum
[0], 3 * nLen
+ 2) );
953 MP_CHECKOK( mp_init_size(&accum
[1], 3 * nLen
+ 2) );
954 MP_CHECKOK( mp_init_size(&accum
[2], 3 * nLen
+ 2) );
955 MP_CHECKOK( mp_init_size(&accum
[3], 3 * nLen
+ 2) );
956 mp_set(&accum
[0], 1);
957 MP_CHECKOK( s_mp_to_mont(&accum
[0], mmm
, &accum
[0]) );
958 MP_CHECKOK( mp_copy(montBase
, &accum
[1]) );
959 SQR(montBase
, &accum
[2]);
960 MUL_NOWEAVE(montBase
, &accum
[2], &accum
[3]);
961 MP_CHECKOK( mpi_to_weave(accum
, powers
, nLen
, num_powers
) );
962 if (first_window
< 4) {
963 MP_CHECKOK( mp_copy(&accum
[first_window
], &accum1
) );
964 first_window
= num_powers
;
967 if (first_window
== 0) {
969 MP_CHECKOK( s_mp_to_mont(&accum1
, mmm
, &accum1
) );
971 /* assert first_window == 1? */
972 MP_CHECKOK( mp_copy(montBase
, &accum1
) );
977 * calculate all the powers in the powers array.
978 * this adds 2**(k-1)-2 square operations over just calculating the
979 * odd powers where k is the window size in the two other mp_modexpt
980 * implementations in this file. We will get some of that
981 * back by not needing the first 'k' squares and one multiply for the
983 for (i
= WEAVE_WORD_SIZE
; i
< num_powers
; i
++) {
984 int acc_index
= i
& (WEAVE_WORD_SIZE
-1); /* i % WEAVE_WORD_SIZE */
986 MUL_NOWEAVE(montBase
, &accum
[acc_index
-1] , &accum
[acc_index
]);
987 /* we've filled the array do our 'per array' processing */
988 if (acc_index
== (WEAVE_WORD_SIZE
-1)) {
989 MP_CHECKOK( mpi_to_weave(accum
, powers
+ i
- (WEAVE_WORD_SIZE
-1),
992 if (first_window
<= i
) {
993 MP_CHECKOK( mp_copy(&accum
[first_window
& (WEAVE_WORD_SIZE
-1)],
995 first_window
= num_powers
;
999 /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source
1000 * and target are the same so we need to copy.. After that, the
1001 * value is overwritten, so we need to fetch it from the stored
1003 if (i
> 2* WEAVE_WORD_SIZE
) {
1004 MP_CHECKOK(weave_to_mpi(&accum2
, powers
+i
/2, nLen
, num_powers
));
1005 SQR(&accum2
, &accum
[acc_index
]);
1007 int half_power_index
= (i
/2) & (WEAVE_WORD_SIZE
-1);
1008 if (half_power_index
== acc_index
) {
1009 /* copy is cheaper than weave_to_mpi */
1010 MP_CHECKOK(mp_copy(&accum
[half_power_index
], &accum2
));
1011 SQR(&accum2
,&accum
[acc_index
]);
1013 SQR(&accum
[half_power_index
],&accum
[acc_index
]);
1018 /* if the accum1 isn't set, Then there is something wrong with our logic
1019 * above and is an internal programming error.
1022 assert(MP_USED(&accum1
) != 0);
1025 /* set accumulator to montgomery residue of 1 */
1029 for (expOff
= bits_in_exponent
- window_bits
*2; expOff
>= 0; expOff
-= window_bits
) {
1031 MP_CHECKOK( mpl_get_bits(exponent
, expOff
, window_bits
) );
1032 smallExp
= (mp_size
)res
;
1034 /* handle unroll the loops */
1035 switch (window_bits
) {
1038 SQR(pa1
,pa2
); SWAPPA
;
1039 } else if (smallExp
& 1) {
1040 SQR(pa1
,pa2
); MUL_NOWEAVE(montBase
,pa2
,pa1
);
1046 SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1049 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1050 MUL(smallExp
, pa1
,pa2
); SWAPPA
;
1053 SQR(pa1
,pa2
); SQR(pa2
,pa1
); SQR(pa1
,pa2
); SQR(pa2
,pa1
);
1054 SQR(pa1
,pa2
); MUL(smallExp
,pa2
,pa1
);
1057 ABORT
; /* could do a loop? */
1061 res
= s_mp_redc(pa1
, mmm
);
1062 mp_exch(pa1
, result
);
1067 mp_clear(&accum
[0]);
1068 mp_clear(&accum
[1]);
1069 mp_clear(&accum
[2]);
1070 mp_clear(&accum
[3]);
1072 /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */
1080 mp_err
mp_exptmod(const mp_int
*inBase
, const mp_int
*exponent
,
1081 const mp_int
*modulus
, mp_int
*result
)
1084 mp_size bits_in_exponent
, i
, window_bits
, odd_ints
;
1087 mp_int montBase
, goodBase
;
1088 mp_mont_modulus mmm
;
1089 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1090 static unsigned int max_window_bits
;
1093 /* function for computing n0prime only works if n0 is odd */
1094 if (!mp_isodd(modulus
))
1095 return s_mp_exptmod(inBase
, exponent
, modulus
, result
);
1097 MP_DIGITS(&montBase
) = 0;
1098 MP_DIGITS(&goodBase
) = 0;
1100 if (mp_cmp(inBase
, modulus
) < 0) {
1103 MP_CHECKOK( mp_init(&goodBase
) );
1105 MP_CHECKOK( mp_mod(inBase
, modulus
, &goodBase
) );
1108 nLen
= MP_USED(modulus
);
1109 MP_CHECKOK( mp_init_size(&montBase
, 2 * nLen
+ 2) );
1111 mmm
.N
= *modulus
; /* a copy of the mp_int struct */
1112 i
= mpl_significant_bits(modulus
);
1113 i
+= MP_DIGIT_BIT
- 1;
1114 mmm
.b
= i
- i
% MP_DIGIT_BIT
;
1116 /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX
1117 ** where n0 = least significant mp_digit of N, the modulus.
1119 mmm
.n0prime
= 0 - s_mp_invmod_radix( MP_DIGIT(modulus
, 0) );
1121 MP_CHECKOK( s_mp_to_mont(base
, &mmm
, &montBase
) );
1123 bits_in_exponent
= mpl_significant_bits(exponent
);
1124 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1125 if (mp_using_cache_safe_exp
) {
1126 if (bits_in_exponent
> 780)
1128 else if (bits_in_exponent
> 256)
1130 else if (bits_in_exponent
> 20)
1132 /* RSA public key exponents are typically under 20 bits (common values
1133 * are: 3, 17, 65537) and a 4-bit window is inefficient
1139 if (bits_in_exponent
> 480)
1141 else if (bits_in_exponent
> 160)
1143 else if (bits_in_exponent
> 20)
1145 /* RSA public key exponents are typically under 20 bits (common values
1146 * are: 3, 17, 65537) and a 4-bit window is inefficient
1151 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1153 * clamp the window size based on
1154 * the cache line size.
1156 if (!max_window_bits
) {
1157 unsigned long cache_size
= s_mpi_getProcessorLineSize();
1158 /* processor has no cache, use 'fast' code always */
1159 if (cache_size
== 0) {
1160 mp_using_cache_safe_exp
= 0;
1162 if ((cache_size
== 0) || (cache_size
>= 64)) {
1163 max_window_bits
= 6;
1164 } else if (cache_size
>= 32) {
1165 max_window_bits
= 5;
1166 } else if (cache_size
>= 16) {
1167 max_window_bits
= 4;
1168 } else max_window_bits
= 1; /* should this be an assert? */
1171 /* clamp the window size down before we calculate bits_in_exponent */
1172 if (mp_using_cache_safe_exp
) {
1173 if (window_bits
> max_window_bits
) {
1174 window_bits
= max_window_bits
;
1179 odd_ints
= 1 << (window_bits
- 1);
1180 i
= bits_in_exponent
% window_bits
;
1182 bits_in_exponent
+= window_bits
- i
;
1185 #ifdef MP_USING_MONT_MULF
1186 if (mp_using_mont_mulf
) {
1187 MP_CHECKOK( s_mp_pad(&montBase
, nLen
) );
1188 res
= mp_exptmod_f(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1189 bits_in_exponent
, window_bits
, odd_ints
);
1192 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
1193 if (mp_using_cache_safe_exp
) {
1194 res
= mp_exptmod_safe_i(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1195 bits_in_exponent
, window_bits
, 1 << window_bits
);
1198 res
= mp_exptmod_i(&montBase
, exponent
, modulus
, result
, &mmm
, nLen
,
1199 bits_in_exponent
, window_bits
, odd_ints
);
1202 mp_clear(&montBase
);
1203 mp_clear(&goodBase
);
1204 /* Don't mp_clear mmm.N because it is merely a copy of modulus.
1207 memset(&mmm
, 0, sizeof mmm
);