Added YUV routines needed for v4l driver, and in the future possibly
[wine/gsoc-2012-control.git] / dlls / rsaenh / mpi.c
blobb053b7317daac1bf11ac6106d162963692d27b76
1 /*
2 * dlls/rsaenh/mpi.c
3 * Multi Precision Integer functions
5 * Copyright 2004 Michael Jung
6 * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 * This file contains code from the LibTomCrypt cryptographic
25 * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
26 * is in the public domain. The code in this file is tailored to
27 * special requirements. Take a look at http://libtomcrypt.org for the
28 * original version.
31 #include <stdarg.h>
32 #include "tomcrypt.h"
34 /* computes the modular inverse via binary extended euclidean algorithm,
35 * that is c = 1/a mod b
37 * Based on slow invmod except this is optimized for the case where b is
38 * odd as per HAC Note 14.64 on pp. 610
40 int
41 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
43 mp_int x, y, u, v, B, D;
44 int res, neg;
46 /* 2. [modified] b must be odd */
47 if (mp_iseven (b) == 1) {
48 return MP_VAL;
51 /* init all our temps */
52 if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
53 return res;
56 /* x == modulus, y == value to invert */
57 if ((res = mp_copy (b, &x)) != MP_OKAY) {
58 goto __ERR;
61 /* we need y = |a| */
62 if ((res = mp_abs (a, &y)) != MP_OKAY) {
63 goto __ERR;
66 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
67 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
68 goto __ERR;
70 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
71 goto __ERR;
73 mp_set (&D, 1);
75 top:
76 /* 4. while u is even do */
77 while (mp_iseven (&u) == 1) {
78 /* 4.1 u = u/2 */
79 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
80 goto __ERR;
82 /* 4.2 if B is odd then */
83 if (mp_isodd (&B) == 1) {
84 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
85 goto __ERR;
88 /* B = B/2 */
89 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
90 goto __ERR;
94 /* 5. while v is even do */
95 while (mp_iseven (&v) == 1) {
96 /* 5.1 v = v/2 */
97 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
98 goto __ERR;
100 /* 5.2 if D is odd then */
101 if (mp_isodd (&D) == 1) {
102 /* D = (D-x)/2 */
103 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
104 goto __ERR;
107 /* D = D/2 */
108 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
109 goto __ERR;
113 /* 6. if u >= v then */
114 if (mp_cmp (&u, &v) != MP_LT) {
115 /* u = u - v, B = B - D */
116 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
117 goto __ERR;
120 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
121 goto __ERR;
123 } else {
124 /* v - v - u, D = D - B */
125 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
126 goto __ERR;
129 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
130 goto __ERR;
134 /* if not zero goto step 4 */
135 if (mp_iszero (&u) == 0) {
136 goto top;
139 /* now a = C, b = D, gcd == g*v */
141 /* if v != 1 then there is no inverse */
142 if (mp_cmp_d (&v, 1) != MP_EQ) {
143 res = MP_VAL;
144 goto __ERR;
147 /* b is now the inverse */
148 neg = a->sign;
149 while (D.sign == MP_NEG) {
150 if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
151 goto __ERR;
154 mp_exch (&D, c);
155 c->sign = neg;
156 res = MP_OKAY;
158 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
159 return res;
162 /* computes xR**-1 == x (mod N) via Montgomery Reduction
164 * This is an optimized implementation of montgomery_reduce
165 * which uses the comba method to quickly calculate the columns of the
166 * reduction.
168 * Based on Algorithm 14.32 on pp.601 of HAC.
171 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
173 int ix, res, olduse;
174 mp_word W[MP_WARRAY];
176 /* get old used count */
177 olduse = x->used;
179 /* grow a as required */
180 if (x->alloc < n->used + 1) {
181 if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
182 return res;
186 /* first we have to get the digits of the input into
187 * an array of double precision words W[...]
190 register mp_word *_W;
191 register mp_digit *tmpx;
193 /* alias for the W[] array */
194 _W = W;
196 /* alias for the digits of x*/
197 tmpx = x->dp;
199 /* copy the digits of a into W[0..a->used-1] */
200 for (ix = 0; ix < x->used; ix++) {
201 *_W++ = *tmpx++;
204 /* zero the high words of W[a->used..m->used*2] */
205 for (; ix < n->used * 2 + 1; ix++) {
206 *_W++ = 0;
210 /* now we proceed to zero successive digits
211 * from the least significant upwards
213 for (ix = 0; ix < n->used; ix++) {
214 /* mu = ai * m' mod b
216 * We avoid a double precision multiplication (which isn't required)
217 * by casting the value down to a mp_digit. Note this requires
218 * that W[ix-1] have the carry cleared (see after the inner loop)
220 register mp_digit mu;
221 mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
223 /* a = a + mu * m * b**i
225 * This is computed in place and on the fly. The multiplication
226 * by b**i is handled by offseting which columns the results
227 * are added to.
229 * Note the comba method normally doesn't handle carries in the
230 * inner loop In this case we fix the carry from the previous
231 * column since the Montgomery reduction requires digits of the
232 * result (so far) [see above] to work. This is
233 * handled by fixing up one carry after the inner loop. The
234 * carry fixups are done in order so after these loops the
235 * first m->used words of W[] have the carries fixed
238 register int iy;
239 register mp_digit *tmpn;
240 register mp_word *_W;
242 /* alias for the digits of the modulus */
243 tmpn = n->dp;
245 /* Alias for the columns set by an offset of ix */
246 _W = W + ix;
248 /* inner loop */
249 for (iy = 0; iy < n->used; iy++) {
250 *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
254 /* now fix carry for next digit, W[ix+1] */
255 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
258 /* now we have to propagate the carries and
259 * shift the words downward [all those least
260 * significant digits we zeroed].
263 register mp_digit *tmpx;
264 register mp_word *_W, *_W1;
266 /* nox fix rest of carries */
268 /* alias for current word */
269 _W1 = W + ix;
271 /* alias for next word, where the carry goes */
272 _W = W + ++ix;
274 for (; ix <= n->used * 2 + 1; ix++) {
275 *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
278 /* copy out, A = A/b**n
280 * The result is A/b**n but instead of converting from an
281 * array of mp_word to mp_digit than calling mp_rshd
282 * we just copy them in the right order
285 /* alias for destination word */
286 tmpx = x->dp;
288 /* alias for shifted double precision result */
289 _W = W + n->used;
291 for (ix = 0; ix < n->used + 1; ix++) {
292 *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
295 /* zero oldused digits, if the input a was larger than
296 * m->used+1 we'll have to clear the digits
298 for (; ix < olduse; ix++) {
299 *tmpx++ = 0;
303 /* set the max used and clamp */
304 x->used = n->used + 1;
305 mp_clamp (x);
307 /* if A >= m then A = A - m */
308 if (mp_cmp_mag (x, n) != MP_LT) {
309 return s_mp_sub (x, n, x);
311 return MP_OKAY;
314 /* Fast (comba) multiplier
316 * This is the fast column-array [comba] multiplier. It is
317 * designed to compute the columns of the product first
318 * then handle the carries afterwards. This has the effect
319 * of making the nested loops that compute the columns very
320 * simple and schedulable on super-scalar processors.
322 * This has been modified to produce a variable number of
323 * digits of output so if say only a half-product is required
324 * you don't have to compute the upper half (a feature
325 * required for fast Barrett reduction).
327 * Based on Algorithm 14.12 on pp.595 of HAC.
331 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
333 int olduse, res, pa, ix, iz;
334 mp_digit W[MP_WARRAY];
335 register mp_word _W;
337 /* grow the destination as required */
338 if (c->alloc < digs) {
339 if ((res = mp_grow (c, digs)) != MP_OKAY) {
340 return res;
344 /* number of output digits to produce */
345 pa = MIN(digs, a->used + b->used);
347 /* clear the carry */
348 _W = 0;
349 for (ix = 0; ix <= pa; ix++) {
350 int tx, ty;
351 int iy;
352 mp_digit *tmpx, *tmpy;
354 /* get offsets into the two bignums */
355 ty = MIN(b->used-1, ix);
356 tx = ix - ty;
358 /* setup temp aliases */
359 tmpx = a->dp + tx;
360 tmpy = b->dp + ty;
362 /* this is the number of times the loop will iterrate, essentially its
363 while (tx++ < a->used && ty-- >= 0) { ... }
365 iy = MIN(a->used-tx, ty+1);
367 /* execute loop */
368 for (iz = 0; iz < iy; ++iz) {
369 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
372 /* store term */
373 W[ix] = ((mp_digit)_W) & MP_MASK;
375 /* make next carry */
376 _W = _W >> ((mp_word)DIGIT_BIT);
379 /* setup dest */
380 olduse = c->used;
381 c->used = digs;
384 register mp_digit *tmpc;
385 tmpc = c->dp;
386 for (ix = 0; ix < digs; ix++) {
387 /* now extract the previous digit [below the carry] */
388 *tmpc++ = W[ix];
391 /* clear unused digits [that existed in the old copy of c] */
392 for (; ix < olduse; ix++) {
393 *tmpc++ = 0;
396 mp_clamp (c);
397 return MP_OKAY;
400 /* this is a modified version of fast_s_mul_digs that only produces
401 * output digits *above* digs. See the comments for fast_s_mul_digs
402 * to see how it works.
404 * This is used in the Barrett reduction since for one of the multiplications
405 * only the higher digits were needed. This essentially halves the work.
407 * Based on Algorithm 14.12 on pp.595 of HAC.
410 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
412 int olduse, res, pa, ix, iz;
413 mp_digit W[MP_WARRAY];
414 mp_word _W;
416 /* grow the destination as required */
417 pa = a->used + b->used;
418 if (c->alloc < pa) {
419 if ((res = mp_grow (c, pa)) != MP_OKAY) {
420 return res;
424 /* number of output digits to produce */
425 pa = a->used + b->used;
426 _W = 0;
427 for (ix = digs; ix <= pa; ix++) {
428 int tx, ty, iy;
429 mp_digit *tmpx, *tmpy;
431 /* get offsets into the two bignums */
432 ty = MIN(b->used-1, ix);
433 tx = ix - ty;
435 /* setup temp aliases */
436 tmpx = a->dp + tx;
437 tmpy = b->dp + ty;
439 /* this is the number of times the loop will iterrate, essentially its
440 while (tx++ < a->used && ty-- >= 0) { ... }
442 iy = MIN(a->used-tx, ty+1);
444 /* execute loop */
445 for (iz = 0; iz < iy; iz++) {
446 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
449 /* store term */
450 W[ix] = ((mp_digit)_W) & MP_MASK;
452 /* make next carry */
453 _W = _W >> ((mp_word)DIGIT_BIT);
456 /* setup dest */
457 olduse = c->used;
458 c->used = pa;
461 register mp_digit *tmpc;
463 tmpc = c->dp + digs;
464 for (ix = digs; ix <= pa; ix++) {
465 /* now extract the previous digit [below the carry] */
466 *tmpc++ = W[ix];
469 /* clear unused digits [that existed in the old copy of c] */
470 for (; ix < olduse; ix++) {
471 *tmpc++ = 0;
474 mp_clamp (c);
475 return MP_OKAY;
478 /* fast squaring
480 * This is the comba method where the columns of the product
481 * are computed first then the carries are computed. This
482 * has the effect of making a very simple inner loop that
483 * is executed the most
485 * W2 represents the outer products and W the inner.
487 * A further optimizations is made because the inner
488 * products are of the form "A * B * 2". The *2 part does
489 * not need to be computed until the end which is good
490 * because 64-bit shifts are slow!
492 * Based on Algorithm 14.16 on pp.597 of HAC.
495 /* the jist of squaring...
497 you do like mult except the offset of the tmpx [one that starts closer to zero]
498 can't equal the offset of tmpy. So basically you set up iy like before then you min it with
499 (ty-tx) so that it never happens. You double all those you add in the inner loop
501 After that loop you do the squares and add them in.
503 Remove W2 and don't memset W
507 int fast_s_mp_sqr (mp_int * a, mp_int * b)
509 int olduse, res, pa, ix, iz;
510 mp_digit W[MP_WARRAY], *tmpx;
511 mp_word W1;
513 /* grow the destination as required */
514 pa = a->used + a->used;
515 if (b->alloc < pa) {
516 if ((res = mp_grow (b, pa)) != MP_OKAY) {
517 return res;
521 /* number of output digits to produce */
522 W1 = 0;
523 for (ix = 0; ix <= pa; ix++) {
524 int tx, ty, iy;
525 mp_word _W;
526 mp_digit *tmpy;
528 /* clear counter */
529 _W = 0;
531 /* get offsets into the two bignums */
532 ty = MIN(a->used-1, ix);
533 tx = ix - ty;
535 /* setup temp aliases */
536 tmpx = a->dp + tx;
537 tmpy = a->dp + ty;
539 /* this is the number of times the loop will iterrate, essentially its
540 while (tx++ < a->used && ty-- >= 0) { ... }
542 iy = MIN(a->used-tx, ty+1);
544 /* now for squaring tx can never equal ty
545 * we halve the distance since they approach at a rate of 2x
546 * and we have to round because odd cases need to be executed
548 iy = MIN(iy, (ty-tx+1)>>1);
550 /* execute loop */
551 for (iz = 0; iz < iy; iz++) {
552 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
555 /* double the inner product and add carry */
556 _W = _W + _W + W1;
558 /* even columns have the square term in them */
559 if ((ix&1) == 0) {
560 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
563 /* store it */
564 W[ix] = _W;
566 /* make next carry */
567 W1 = _W >> ((mp_word)DIGIT_BIT);
570 /* setup dest */
571 olduse = b->used;
572 b->used = a->used+a->used;
575 mp_digit *tmpb;
576 tmpb = b->dp;
577 for (ix = 0; ix < pa; ix++) {
578 *tmpb++ = W[ix] & MP_MASK;
581 /* clear unused digits [that existed in the old copy of c] */
582 for (; ix < olduse; ix++) {
583 *tmpb++ = 0;
586 mp_clamp (b);
587 return MP_OKAY;
590 /* computes a = 2**b
592 * Simple algorithm which zeroes the int, grows it then just sets one bit
593 * as required.
596 mp_2expt (mp_int * a, int b)
598 int res;
600 /* zero a as per default */
601 mp_zero (a);
603 /* grow a to accommodate the single bit */
604 if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
605 return res;
608 /* set the used count of where the bit will go */
609 a->used = b / DIGIT_BIT + 1;
611 /* put the single bit in its place */
612 a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
614 return MP_OKAY;
617 /* b = |a|
619 * Simple function copies the input and fixes the sign to positive
622 mp_abs (mp_int * a, mp_int * b)
624 int res;
626 /* copy a to b */
627 if (a != b) {
628 if ((res = mp_copy (a, b)) != MP_OKAY) {
629 return res;
633 /* force the sign of b to positive */
634 b->sign = MP_ZPOS;
636 return MP_OKAY;
639 /* high level addition (handles signs) */
640 int mp_add (mp_int * a, mp_int * b, mp_int * c)
642 int sa, sb, res;
644 /* get sign of both inputs */
645 sa = a->sign;
646 sb = b->sign;
648 /* handle two cases, not four */
649 if (sa == sb) {
650 /* both positive or both negative */
651 /* add their magnitudes, copy the sign */
652 c->sign = sa;
653 res = s_mp_add (a, b, c);
654 } else {
655 /* one positive, the other negative */
656 /* subtract the one with the greater magnitude from */
657 /* the one of the lesser magnitude. The result gets */
658 /* the sign of the one with the greater magnitude. */
659 if (mp_cmp_mag (a, b) == MP_LT) {
660 c->sign = sb;
661 res = s_mp_sub (b, a, c);
662 } else {
663 c->sign = sa;
664 res = s_mp_sub (a, b, c);
667 return res;
671 /* single digit addition */
673 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
675 int res, ix, oldused;
676 mp_digit *tmpa, *tmpc, mu;
678 /* grow c as required */
679 if (c->alloc < a->used + 1) {
680 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
681 return res;
685 /* if a is negative and |a| >= b, call c = |a| - b */
686 if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
687 /* temporarily fix sign of a */
688 a->sign = MP_ZPOS;
690 /* c = |a| - b */
691 res = mp_sub_d(a, b, c);
693 /* fix sign */
694 a->sign = c->sign = MP_NEG;
696 return res;
699 /* old number of used digits in c */
700 oldused = c->used;
702 /* sign always positive */
703 c->sign = MP_ZPOS;
705 /* source alias */
706 tmpa = a->dp;
708 /* destination alias */
709 tmpc = c->dp;
711 /* if a is positive */
712 if (a->sign == MP_ZPOS) {
713 /* add digit, after this we're propagating
714 * the carry.
716 *tmpc = *tmpa++ + b;
717 mu = *tmpc >> DIGIT_BIT;
718 *tmpc++ &= MP_MASK;
720 /* now handle rest of the digits */
721 for (ix = 1; ix < a->used; ix++) {
722 *tmpc = *tmpa++ + mu;
723 mu = *tmpc >> DIGIT_BIT;
724 *tmpc++ &= MP_MASK;
726 /* set final carry */
727 ix++;
728 *tmpc++ = mu;
730 /* setup size */
731 c->used = a->used + 1;
732 } else {
733 /* a was negative and |a| < b */
734 c->used = 1;
736 /* the result is a single digit */
737 if (a->used == 1) {
738 *tmpc++ = b - a->dp[0];
739 } else {
740 *tmpc++ = b;
743 /* setup count so the clearing of oldused
744 * can fall through correctly
746 ix = 1;
749 /* now zero to oldused */
750 while (ix++ < oldused) {
751 *tmpc++ = 0;
753 mp_clamp(c);
755 return MP_OKAY;
758 /* trim unused digits
760 * This is used to ensure that leading zero digits are
761 * trimed and the leading "used" digit will be non-zero
762 * Typically very fast. Also fixes the sign if there
763 * are no more leading digits
765 void
766 mp_clamp (mp_int * a)
768 /* decrease used while the most significant digit is
769 * zero.
771 while (a->used > 0 && a->dp[a->used - 1] == 0) {
772 --(a->used);
775 /* reset the sign flag if used == 0 */
776 if (a->used == 0) {
777 a->sign = MP_ZPOS;
781 /* clear one (frees) */
782 void
783 mp_clear (mp_int * a)
785 int i;
787 /* only do anything if a hasn't been freed previously */
788 if (a->dp != NULL) {
789 /* first zero the digits */
790 for (i = 0; i < a->used; i++) {
791 a->dp[i] = 0;
794 /* free ram */
795 free(a->dp);
797 /* reset members to make debugging easier */
798 a->dp = NULL;
799 a->alloc = a->used = 0;
800 a->sign = MP_ZPOS;
805 void mp_clear_multi(mp_int *mp, ...)
807 mp_int* next_mp = mp;
808 va_list args;
809 va_start(args, mp);
810 while (next_mp != NULL) {
811 mp_clear(next_mp);
812 next_mp = va_arg(args, mp_int*);
814 va_end(args);
817 /* compare two ints (signed)*/
819 mp_cmp (mp_int * a, mp_int * b)
821 /* compare based on sign */
822 if (a->sign != b->sign) {
823 if (a->sign == MP_NEG) {
824 return MP_LT;
825 } else {
826 return MP_GT;
830 /* compare digits */
831 if (a->sign == MP_NEG) {
832 /* if negative compare opposite direction */
833 return mp_cmp_mag(b, a);
834 } else {
835 return mp_cmp_mag(a, b);
839 /* compare a digit */
840 int mp_cmp_d(mp_int * a, mp_digit b)
842 /* compare based on sign */
843 if (a->sign == MP_NEG) {
844 return MP_LT;
847 /* compare based on magnitude */
848 if (a->used > 1) {
849 return MP_GT;
852 /* compare the only digit of a to b */
853 if (a->dp[0] > b) {
854 return MP_GT;
855 } else if (a->dp[0] < b) {
856 return MP_LT;
857 } else {
858 return MP_EQ;
862 /* compare maginitude of two ints (unsigned) */
863 int mp_cmp_mag (mp_int * a, mp_int * b)
865 int n;
866 mp_digit *tmpa, *tmpb;
868 /* compare based on # of non-zero digits */
869 if (a->used > b->used) {
870 return MP_GT;
873 if (a->used < b->used) {
874 return MP_LT;
877 /* alias for a */
878 tmpa = a->dp + (a->used - 1);
880 /* alias for b */
881 tmpb = b->dp + (a->used - 1);
883 /* compare based on digits */
884 for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
885 if (*tmpa > *tmpb) {
886 return MP_GT;
889 if (*tmpa < *tmpb) {
890 return MP_LT;
893 return MP_EQ;
896 static const int lnz[16] = {
897 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
900 /* Counts the number of lsbs which are zero before the first zero bit */
901 int mp_cnt_lsb(mp_int *a)
903 int x;
904 mp_digit q, qq;
906 /* easy out */
907 if (mp_iszero(a) == 1) {
908 return 0;
911 /* scan lower digits until non-zero */
912 for (x = 0; x < a->used && a->dp[x] == 0; x++);
913 q = a->dp[x];
914 x *= DIGIT_BIT;
916 /* now scan this digit until a 1 is found */
917 if ((q & 1) == 0) {
918 do {
919 qq = q & 15;
920 x += lnz[qq];
921 q >>= 4;
922 } while (qq == 0);
924 return x;
927 /* copy, b = a */
929 mp_copy (const mp_int * a, mp_int * b)
931 int res, n;
933 /* if dst == src do nothing */
934 if (a == b) {
935 return MP_OKAY;
938 /* grow dest */
939 if (b->alloc < a->used) {
940 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
941 return res;
945 /* zero b and copy the parameters over */
947 register mp_digit *tmpa, *tmpb;
949 /* pointer aliases */
951 /* source */
952 tmpa = a->dp;
954 /* destination */
955 tmpb = b->dp;
957 /* copy all the digits */
958 for (n = 0; n < a->used; n++) {
959 *tmpb++ = *tmpa++;
962 /* clear high digits */
963 for (; n < b->used; n++) {
964 *tmpb++ = 0;
968 /* copy used count and sign */
969 b->used = a->used;
970 b->sign = a->sign;
971 return MP_OKAY;
974 /* returns the number of bits in an int */
976 mp_count_bits (mp_int * a)
978 int r;
979 mp_digit q;
981 /* shortcut */
982 if (a->used == 0) {
983 return 0;
986 /* get number of digits and add that */
987 r = (a->used - 1) * DIGIT_BIT;
989 /* take the last digit and count the bits in it */
990 q = a->dp[a->used - 1];
991 while (q > ((mp_digit) 0)) {
992 ++r;
993 q >>= ((mp_digit) 1);
995 return r;
998 /* integer signed division.
999 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1000 * HAC pp.598 Algorithm 14.20
1002 * Note that the description in HAC is horribly
1003 * incomplete. For example, it doesn't consider
1004 * the case where digits are removed from 'x' in
1005 * the inner loop. It also doesn't consider the
1006 * case that y has fewer than three digits, etc..
1008 * The overall algorithm is as described as
1009 * 14.20 from HAC but fixed to treat these cases.
1011 int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1013 mp_int q, x, y, t1, t2;
1014 int res, n, t, i, norm, neg;
1016 /* is divisor zero ? */
1017 if (mp_iszero (b) == 1) {
1018 return MP_VAL;
1021 /* if a < b then q=0, r = a */
1022 if (mp_cmp_mag (a, b) == MP_LT) {
1023 if (d != NULL) {
1024 res = mp_copy (a, d);
1025 } else {
1026 res = MP_OKAY;
1028 if (c != NULL) {
1029 mp_zero (c);
1031 return res;
1034 if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1035 return res;
1037 q.used = a->used + 2;
1039 if ((res = mp_init (&t1)) != MP_OKAY) {
1040 goto __Q;
1043 if ((res = mp_init (&t2)) != MP_OKAY) {
1044 goto __T1;
1047 if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1048 goto __T2;
1051 if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1052 goto __X;
1055 /* fix the sign */
1056 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1057 x.sign = y.sign = MP_ZPOS;
1059 /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1060 norm = mp_count_bits(&y) % DIGIT_BIT;
1061 if (norm < (int)(DIGIT_BIT-1)) {
1062 norm = (DIGIT_BIT-1) - norm;
1063 if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1064 goto __Y;
1066 if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1067 goto __Y;
1069 } else {
1070 norm = 0;
1073 /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1074 n = x.used - 1;
1075 t = y.used - 1;
1077 /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1078 if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1079 goto __Y;
1082 while (mp_cmp (&x, &y) != MP_LT) {
1083 ++(q.dp[n - t]);
1084 if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1085 goto __Y;
1089 /* reset y by shifting it back down */
1090 mp_rshd (&y, n - t);
1092 /* step 3. for i from n down to (t + 1) */
1093 for (i = n; i >= (t + 1); i--) {
1094 if (i > x.used) {
1095 continue;
1098 /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1099 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1100 if (x.dp[i] == y.dp[t]) {
1101 q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1102 } else {
1103 mp_word tmp;
1104 tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1105 tmp |= ((mp_word) x.dp[i - 1]);
1106 tmp /= ((mp_word) y.dp[t]);
1107 if (tmp > (mp_word) MP_MASK)
1108 tmp = MP_MASK;
1109 q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1112 /* while (q{i-t-1} * (yt * b + y{t-1})) >
1113 xi * b**2 + xi-1 * b + xi-2
1115 do q{i-t-1} -= 1;
1117 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1118 do {
1119 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1121 /* find left hand */
1122 mp_zero (&t1);
1123 t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1124 t1.dp[1] = y.dp[t];
1125 t1.used = 2;
1126 if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1127 goto __Y;
1130 /* find right hand */
1131 t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1132 t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1133 t2.dp[2] = x.dp[i];
1134 t2.used = 3;
1135 } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1137 /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1138 if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1139 goto __Y;
1142 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1143 goto __Y;
1146 if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1147 goto __Y;
1150 /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1151 if (x.sign == MP_NEG) {
1152 if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1153 goto __Y;
1155 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1156 goto __Y;
1158 if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1159 goto __Y;
1162 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1166 /* now q is the quotient and x is the remainder
1167 * [which we have to normalize]
1170 /* get sign before writing to c */
1171 x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1173 if (c != NULL) {
1174 mp_clamp (&q);
1175 mp_exch (&q, c);
1176 c->sign = neg;
1179 if (d != NULL) {
1180 mp_div_2d (&x, norm, &x, NULL);
1181 mp_exch (&x, d);
1184 res = MP_OKAY;
1186 __Y:mp_clear (&y);
1187 __X:mp_clear (&x);
1188 __T2:mp_clear (&t2);
1189 __T1:mp_clear (&t1);
1190 __Q:mp_clear (&q);
1191 return res;
1194 /* b = a/2 */
1195 int mp_div_2(mp_int * a, mp_int * b)
1197 int x, res, oldused;
1199 /* copy */
1200 if (b->alloc < a->used) {
1201 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1202 return res;
1206 oldused = b->used;
1207 b->used = a->used;
1209 register mp_digit r, rr, *tmpa, *tmpb;
1211 /* source alias */
1212 tmpa = a->dp + b->used - 1;
1214 /* dest alias */
1215 tmpb = b->dp + b->used - 1;
1217 /* carry */
1218 r = 0;
1219 for (x = b->used - 1; x >= 0; x--) {
1220 /* get the carry for the next iteration */
1221 rr = *tmpa & 1;
1223 /* shift the current digit, add in carry and store */
1224 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1226 /* forward carry to next iteration */
1227 r = rr;
1230 /* zero excess digits */
1231 tmpb = b->dp + b->used;
1232 for (x = b->used; x < oldused; x++) {
1233 *tmpb++ = 0;
1236 b->sign = a->sign;
1237 mp_clamp (b);
1238 return MP_OKAY;
1241 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1242 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
1244 mp_digit D, r, rr;
1245 int x, res;
1246 mp_int t;
1249 /* if the shift count is <= 0 then we do no work */
1250 if (b <= 0) {
1251 res = mp_copy (a, c);
1252 if (d != NULL) {
1253 mp_zero (d);
1255 return res;
1258 if ((res = mp_init (&t)) != MP_OKAY) {
1259 return res;
1262 /* get the remainder */
1263 if (d != NULL) {
1264 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1265 mp_clear (&t);
1266 return res;
1270 /* copy */
1271 if ((res = mp_copy (a, c)) != MP_OKAY) {
1272 mp_clear (&t);
1273 return res;
1276 /* shift by as many digits in the bit count */
1277 if (b >= (int)DIGIT_BIT) {
1278 mp_rshd (c, b / DIGIT_BIT);
1281 /* shift any bit count < DIGIT_BIT */
1282 D = (mp_digit) (b % DIGIT_BIT);
1283 if (D != 0) {
1284 register mp_digit *tmpc, mask, shift;
1286 /* mask */
1287 mask = (((mp_digit)1) << D) - 1;
1289 /* shift for lsb */
1290 shift = DIGIT_BIT - D;
1292 /* alias */
1293 tmpc = c->dp + (c->used - 1);
1295 /* carry */
1296 r = 0;
1297 for (x = c->used - 1; x >= 0; x--) {
1298 /* get the lower bits of this word in a temp */
1299 rr = *tmpc & mask;
1301 /* shift the current word and mix in the carry bits from the previous word */
1302 *tmpc = (*tmpc >> D) | (r << shift);
1303 --tmpc;
1305 /* set the carry to the carry bits of the current word found above */
1306 r = rr;
1309 mp_clamp (c);
1310 if (d != NULL) {
1311 mp_exch (&t, d);
1313 mp_clear (&t);
1314 return MP_OKAY;
1317 static int s_is_power_of_two(mp_digit b, int *p)
1319 int x;
1321 for (x = 1; x < DIGIT_BIT; x++) {
1322 if (b == (((mp_digit)1)<<x)) {
1323 *p = x;
1324 return 1;
1327 return 0;
1330 /* single digit division (based on routine from MPI) */
1331 int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
1333 mp_int q;
1334 mp_word w;
1335 mp_digit t;
1336 int res, ix;
1338 /* cannot divide by zero */
1339 if (b == 0) {
1340 return MP_VAL;
1343 /* quick outs */
1344 if (b == 1 || mp_iszero(a) == 1) {
1345 if (d != NULL) {
1346 *d = 0;
1348 if (c != NULL) {
1349 return mp_copy(a, c);
1351 return MP_OKAY;
1354 /* power of two ? */
1355 if (s_is_power_of_two(b, &ix) == 1) {
1356 if (d != NULL) {
1357 *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
1359 if (c != NULL) {
1360 return mp_div_2d(a, ix, c, NULL);
1362 return MP_OKAY;
1365 /* no easy answer [c'est la vie]. Just division */
1366 if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1367 return res;
1370 q.used = a->used;
1371 q.sign = a->sign;
1372 w = 0;
1373 for (ix = a->used - 1; ix >= 0; ix--) {
1374 w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1376 if (w >= b) {
1377 t = (mp_digit)(w / b);
1378 w -= ((mp_word)t) * ((mp_word)b);
1379 } else {
1380 t = 0;
1382 q.dp[ix] = (mp_digit)t;
1385 if (d != NULL) {
1386 *d = (mp_digit)w;
1389 if (c != NULL) {
1390 mp_clamp(&q);
1391 mp_exch(&q, c);
1393 mp_clear(&q);
1395 return res;
1398 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
1400 * Based on algorithm from the paper
1402 * "Generating Efficient Primes for Discrete Log Cryptosystems"
1403 * Chae Hoon Lim, Pil Loong Lee,
1404 * POSTECH Information Research Laboratories
1406 * The modulus must be of a special format [see manual]
1408 * Has been modified to use algorithm 7.10 from the LTM book instead
1410 * Input x must be in the range 0 <= x <= (n-1)**2
1413 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
1415 int err, i, m;
1416 mp_word r;
1417 mp_digit mu, *tmpx1, *tmpx2;
1419 /* m = digits in modulus */
1420 m = n->used;
1422 /* ensure that "x" has at least 2m digits */
1423 if (x->alloc < m + m) {
1424 if ((err = mp_grow (x, m + m)) != MP_OKAY) {
1425 return err;
1429 /* top of loop, this is where the code resumes if
1430 * another reduction pass is required.
1432 top:
1433 /* aliases for digits */
1434 /* alias for lower half of x */
1435 tmpx1 = x->dp;
1437 /* alias for upper half of x, or x/B**m */
1438 tmpx2 = x->dp + m;
1440 /* set carry to zero */
1441 mu = 0;
1443 /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
1444 for (i = 0; i < m; i++) {
1445 r = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
1446 *tmpx1++ = (mp_digit)(r & MP_MASK);
1447 mu = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
1450 /* set final carry */
1451 *tmpx1++ = mu;
1453 /* zero words above m */
1454 for (i = m + 1; i < x->used; i++) {
1455 *tmpx1++ = 0;
1458 /* clamp, sub and return */
1459 mp_clamp (x);
1461 /* if x >= n then subtract and reduce again
1462 * Each successive "recursion" makes the input smaller and smaller.
1464 if (mp_cmp_mag (x, n) != MP_LT) {
1465 s_mp_sub(x, n, x);
1466 goto top;
1468 return MP_OKAY;
1471 /* determines the setup value */
1472 void mp_dr_setup(mp_int *a, mp_digit *d)
1474 /* the casts are required if DIGIT_BIT is one less than
1475 * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
1477 *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
1478 ((mp_word)a->dp[0]));
1481 /* swap the elements of two integers, for cases where you can't simply swap the
1482 * mp_int pointers around
1484 void
1485 mp_exch (mp_int * a, mp_int * b)
1487 mp_int t;
1489 t = *a;
1490 *a = *b;
1491 *b = t;
1494 /* this is a shell function that calls either the normal or Montgomery
1495 * exptmod functions. Originally the call to the montgomery code was
1496 * embedded in the normal function but that wasted a lot of stack space
1497 * for nothing (since 99% of the time the Montgomery code would be called)
1499 int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
1501 int dr;
1503 /* modulus P must be positive */
1504 if (P->sign == MP_NEG) {
1505 return MP_VAL;
1508 /* if exponent X is negative we have to recurse */
1509 if (X->sign == MP_NEG) {
1510 mp_int tmpG, tmpX;
1511 int err;
1513 /* first compute 1/G mod P */
1514 if ((err = mp_init(&tmpG)) != MP_OKAY) {
1515 return err;
1517 if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
1518 mp_clear(&tmpG);
1519 return err;
1522 /* now get |X| */
1523 if ((err = mp_init(&tmpX)) != MP_OKAY) {
1524 mp_clear(&tmpG);
1525 return err;
1527 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
1528 mp_clear_multi(&tmpG, &tmpX, NULL);
1529 return err;
1532 /* and now compute (1/G)**|X| instead of G**X [X < 0] */
1533 err = mp_exptmod(&tmpG, &tmpX, P, Y);
1534 mp_clear_multi(&tmpG, &tmpX, NULL);
1535 return err;
1538 dr = 0;
1540 /* if the modulus is odd or dr != 0 use the fast method */
1541 if (mp_isodd (P) == 1 || dr != 0) {
1542 return mp_exptmod_fast (G, X, P, Y, dr);
1543 } else {
1544 /* otherwise use the generic Barrett reduction technique */
1545 return s_mp_exptmod (G, X, P, Y);
1549 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
1551 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
1552 * The value of k changes based on the size of the exponent.
1554 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
1558 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
1560 mp_int M[256], res;
1561 mp_digit buf, mp;
1562 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1564 /* use a pointer to the reduction algorithm. This allows us to use
1565 * one of many reduction algorithms without modding the guts of
1566 * the code with if statements everywhere.
1568 int (*redux)(mp_int*,mp_int*,mp_digit);
1570 /* find window size */
1571 x = mp_count_bits (X);
1572 if (x <= 7) {
1573 winsize = 2;
1574 } else if (x <= 36) {
1575 winsize = 3;
1576 } else if (x <= 140) {
1577 winsize = 4;
1578 } else if (x <= 450) {
1579 winsize = 5;
1580 } else if (x <= 1303) {
1581 winsize = 6;
1582 } else if (x <= 3529) {
1583 winsize = 7;
1584 } else {
1585 winsize = 8;
1588 /* init M array */
1589 /* init first cell */
1590 if ((err = mp_init(&M[1])) != MP_OKAY) {
1591 return err;
1594 /* now init the second half of the array */
1595 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1596 if ((err = mp_init(&M[x])) != MP_OKAY) {
1597 for (y = 1<<(winsize-1); y < x; y++) {
1598 mp_clear (&M[y]);
1600 mp_clear(&M[1]);
1601 return err;
1605 /* determine and setup reduction code */
1606 if (redmode == 0) {
1607 /* now setup montgomery */
1608 if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
1609 goto __M;
1612 /* automatically pick the comba one if available (saves quite a few calls/ifs) */
1613 if (((P->used * 2 + 1) < MP_WARRAY) &&
1614 P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1615 redux = fast_mp_montgomery_reduce;
1616 } else {
1617 /* use slower baseline Montgomery method */
1618 redux = mp_montgomery_reduce;
1620 } else if (redmode == 1) {
1621 /* setup DR reduction for moduli of the form B**k - b */
1622 mp_dr_setup(P, &mp);
1623 redux = mp_dr_reduce;
1624 } else {
1625 /* setup DR reduction for moduli of the form 2**k - b */
1626 if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
1627 goto __M;
1629 redux = mp_reduce_2k;
1632 /* setup result */
1633 if ((err = mp_init (&res)) != MP_OKAY) {
1634 goto __M;
1637 /* create M table
1641 * The first half of the table is not computed though accept for M[0] and M[1]
1644 if (redmode == 0) {
1645 /* now we need R mod m */
1646 if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
1647 goto __RES;
1650 /* now set M[1] to G * R mod m */
1651 if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
1652 goto __RES;
1654 } else {
1655 mp_set(&res, 1);
1656 if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
1657 goto __RES;
1661 /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
1662 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1663 goto __RES;
1666 for (x = 0; x < (winsize - 1); x++) {
1667 if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
1668 goto __RES;
1670 if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
1671 goto __RES;
1675 /* create upper table */
1676 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1677 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1678 goto __RES;
1680 if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
1681 goto __RES;
1685 /* set initial mode and bit cnt */
1686 mode = 0;
1687 bitcnt = 1;
1688 buf = 0;
1689 digidx = X->used - 1;
1690 bitcpy = 0;
1691 bitbuf = 0;
1693 for (;;) {
1694 /* grab next digit as required */
1695 if (--bitcnt == 0) {
1696 /* if digidx == -1 we are out of digits so break */
1697 if (digidx == -1) {
1698 break;
1700 /* read next digit and reset bitcnt */
1701 buf = X->dp[digidx--];
1702 bitcnt = (int)DIGIT_BIT;
1705 /* grab the next msb from the exponent */
1706 y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
1707 buf <<= (mp_digit)1;
1709 /* if the bit is zero and mode == 0 then we ignore it
1710 * These represent the leading zero bits before the first 1 bit
1711 * in the exponent. Technically this opt is not required but it
1712 * does lower the # of trivial squaring/reductions used
1714 if (mode == 0 && y == 0) {
1715 continue;
1718 /* if the bit is zero and mode == 1 then we square */
1719 if (mode == 1 && y == 0) {
1720 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1721 goto __RES;
1723 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1724 goto __RES;
1726 continue;
1729 /* else we add it to the window */
1730 bitbuf |= (y << (winsize - ++bitcpy));
1731 mode = 2;
1733 if (bitcpy == winsize) {
1734 /* ok window is filled so square as required and multiply */
1735 /* square first */
1736 for (x = 0; x < winsize; x++) {
1737 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1738 goto __RES;
1740 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1741 goto __RES;
1745 /* then multiply */
1746 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
1747 goto __RES;
1749 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1750 goto __RES;
1753 /* empty window and reset */
1754 bitcpy = 0;
1755 bitbuf = 0;
1756 mode = 1;
1760 /* if bits remain then square/multiply */
1761 if (mode == 2 && bitcpy > 0) {
1762 /* square then multiply if the bit is set */
1763 for (x = 0; x < bitcpy; x++) {
1764 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1765 goto __RES;
1767 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1768 goto __RES;
1771 /* get next bit of the window */
1772 bitbuf <<= 1;
1773 if ((bitbuf & (1 << winsize)) != 0) {
1774 /* then multiply */
1775 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
1776 goto __RES;
1778 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1779 goto __RES;
1785 if (redmode == 0) {
1786 /* fixup result if Montgomery reduction is used
1787 * recall that any value in a Montgomery system is
1788 * actually multiplied by R mod n. So we have
1789 * to reduce one more time to cancel out the factor
1790 * of R.
1792 if ((err = redux(&res, P, mp)) != MP_OKAY) {
1793 goto __RES;
1797 /* swap res with Y */
1798 mp_exch (&res, Y);
1799 err = MP_OKAY;
1800 __RES:mp_clear (&res);
1801 __M:
1802 mp_clear(&M[1]);
1803 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1804 mp_clear (&M[x]);
1806 return err;
1809 /* Greatest Common Divisor using the binary method */
1810 int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
1812 mp_int u, v;
1813 int k, u_lsb, v_lsb, res;
1815 /* either zero than gcd is the largest */
1816 if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
1817 return mp_abs (b, c);
1819 if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
1820 return mp_abs (a, c);
1823 /* optimized. At this point if a == 0 then
1824 * b must equal zero too
1826 if (mp_iszero (a) == 1) {
1827 mp_zero(c);
1828 return MP_OKAY;
1831 /* get copies of a and b we can modify */
1832 if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
1833 return res;
1836 if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
1837 goto __U;
1840 /* must be positive for the remainder of the algorithm */
1841 u.sign = v.sign = MP_ZPOS;
1843 /* B1. Find the common power of two for u and v */
1844 u_lsb = mp_cnt_lsb(&u);
1845 v_lsb = mp_cnt_lsb(&v);
1846 k = MIN(u_lsb, v_lsb);
1848 if (k > 0) {
1849 /* divide the power of two out */
1850 if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
1851 goto __V;
1854 if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
1855 goto __V;
1859 /* divide any remaining factors of two out */
1860 if (u_lsb != k) {
1861 if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
1862 goto __V;
1866 if (v_lsb != k) {
1867 if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
1868 goto __V;
1872 while (mp_iszero(&v) == 0) {
1873 /* make sure v is the largest */
1874 if (mp_cmp_mag(&u, &v) == MP_GT) {
1875 /* swap u and v to make sure v is >= u */
1876 mp_exch(&u, &v);
1879 /* subtract smallest from largest */
1880 if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
1881 goto __V;
1884 /* Divide out all factors of two */
1885 if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
1886 goto __V;
1890 /* multiply by 2**k which we divided out at the beginning */
1891 if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
1892 goto __V;
1894 c->sign = MP_ZPOS;
1895 res = MP_OKAY;
1896 __V:mp_clear (&u);
1897 __U:mp_clear (&v);
1898 return res;
1901 /* get the lower 32-bits of an mp_int */
1902 unsigned long mp_get_int(mp_int * a)
1904 int i;
1905 unsigned long res;
1907 if (a->used == 0) {
1908 return 0;
1911 /* get number of digits of the lsb we have to read */
1912 i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
1914 /* get most significant digit of result */
1915 res = DIGIT(a,i);
1917 while (--i >= 0) {
1918 res = (res << DIGIT_BIT) | DIGIT(a,i);
1921 /* force result to 32-bits always so it is consistent on non 32-bit platforms */
1922 return res & 0xFFFFFFFFUL;
1925 /* grow as required */
1926 int mp_grow (mp_int * a, int size)
1928 int i;
1929 mp_digit *tmp;
1931 /* if the alloc size is smaller alloc more ram */
1932 if (a->alloc < size) {
1933 /* ensure there are always at least MP_PREC digits extra on top */
1934 size += (MP_PREC * 2) - (size % MP_PREC);
1936 /* reallocate the array a->dp
1938 * We store the return in a temporary variable
1939 * in case the operation failed we don't want
1940 * to overwrite the dp member of a.
1942 tmp = realloc (a->dp, sizeof (mp_digit) * size);
1943 if (tmp == NULL) {
1944 /* reallocation failed but "a" is still valid [can be freed] */
1945 return MP_MEM;
1948 /* reallocation succeeded so set a->dp */
1949 a->dp = tmp;
1951 /* zero excess digits */
1952 i = a->alloc;
1953 a->alloc = size;
1954 for (; i < a->alloc; i++) {
1955 a->dp[i] = 0;
1958 return MP_OKAY;
1961 /* init a new mp_int */
1962 int mp_init (mp_int * a)
1964 int i;
1966 /* allocate memory required and clear it */
1967 a->dp = malloc (sizeof (mp_digit) * MP_PREC);
1968 if (a->dp == NULL) {
1969 return MP_MEM;
1972 /* set the digits to zero */
1973 for (i = 0; i < MP_PREC; i++) {
1974 a->dp[i] = 0;
1977 /* set the used to zero, allocated digits to the default precision
1978 * and sign to positive */
1979 a->used = 0;
1980 a->alloc = MP_PREC;
1981 a->sign = MP_ZPOS;
1983 return MP_OKAY;
1986 /* creates "a" then copies b into it */
1987 int mp_init_copy (mp_int * a, const mp_int * b)
1989 int res;
1991 if ((res = mp_init (a)) != MP_OKAY) {
1992 return res;
1994 return mp_copy (b, a);
1997 int mp_init_multi(mp_int *mp, ...)
1999 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
2000 int n = 0; /* Number of ok inits */
2001 mp_int* cur_arg = mp;
2002 va_list args;
2004 va_start(args, mp); /* init args to next argument from caller */
2005 while (cur_arg != NULL) {
2006 if (mp_init(cur_arg) != MP_OKAY) {
2007 /* Oops - error! Back-track and mp_clear what we already
2008 succeeded in init-ing, then return error.
2010 va_list clean_args;
2012 /* end the current list */
2013 va_end(args);
2015 /* now start cleaning up */
2016 cur_arg = mp;
2017 va_start(clean_args, mp);
2018 while (n--) {
2019 mp_clear(cur_arg);
2020 cur_arg = va_arg(clean_args, mp_int*);
2022 va_end(clean_args);
2023 res = MP_MEM;
2024 break;
2026 n++;
2027 cur_arg = va_arg(args, mp_int*);
2029 va_end(args);
2030 return res; /* Assumed ok, if error flagged above. */
2033 /* init an mp_init for a given size */
2034 int mp_init_size (mp_int * a, int size)
2036 int x;
2038 /* pad size so there are always extra digits */
2039 size += (MP_PREC * 2) - (size % MP_PREC);
2041 /* alloc mem */
2042 a->dp = malloc (sizeof (mp_digit) * size);
2043 if (a->dp == NULL) {
2044 return MP_MEM;
2047 /* set the members */
2048 a->used = 0;
2049 a->alloc = size;
2050 a->sign = MP_ZPOS;
2052 /* zero the digits */
2053 for (x = 0; x < size; x++) {
2054 a->dp[x] = 0;
2057 return MP_OKAY;
2060 /* hac 14.61, pp608 */
2061 int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
2063 /* b cannot be negative */
2064 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2065 return MP_VAL;
2068 /* if the modulus is odd we can use a faster routine instead */
2069 if (mp_isodd (b) == 1) {
2070 return fast_mp_invmod (a, b, c);
2073 return mp_invmod_slow(a, b, c);
2076 /* hac 14.61, pp608 */
2077 int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
2079 mp_int x, y, u, v, A, B, C, D;
2080 int res;
2082 /* b cannot be negative */
2083 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2084 return MP_VAL;
2087 /* init temps */
2088 if ((res = mp_init_multi(&x, &y, &u, &v,
2089 &A, &B, &C, &D, NULL)) != MP_OKAY) {
2090 return res;
2093 /* x = a, y = b */
2094 if ((res = mp_copy (a, &x)) != MP_OKAY) {
2095 goto __ERR;
2097 if ((res = mp_copy (b, &y)) != MP_OKAY) {
2098 goto __ERR;
2101 /* 2. [modified] if x,y are both even then return an error! */
2102 if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
2103 res = MP_VAL;
2104 goto __ERR;
2107 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
2108 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
2109 goto __ERR;
2111 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
2112 goto __ERR;
2114 mp_set (&A, 1);
2115 mp_set (&D, 1);
2117 top:
2118 /* 4. while u is even do */
2119 while (mp_iseven (&u) == 1) {
2120 /* 4.1 u = u/2 */
2121 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
2122 goto __ERR;
2124 /* 4.2 if A or B is odd then */
2125 if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
2126 /* A = (A+y)/2, B = (B-x)/2 */
2127 if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
2128 goto __ERR;
2130 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
2131 goto __ERR;
2134 /* A = A/2, B = B/2 */
2135 if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
2136 goto __ERR;
2138 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
2139 goto __ERR;
2143 /* 5. while v is even do */
2144 while (mp_iseven (&v) == 1) {
2145 /* 5.1 v = v/2 */
2146 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
2147 goto __ERR;
2149 /* 5.2 if C or D is odd then */
2150 if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
2151 /* C = (C+y)/2, D = (D-x)/2 */
2152 if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
2153 goto __ERR;
2155 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
2156 goto __ERR;
2159 /* C = C/2, D = D/2 */
2160 if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
2161 goto __ERR;
2163 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
2164 goto __ERR;
2168 /* 6. if u >= v then */
2169 if (mp_cmp (&u, &v) != MP_LT) {
2170 /* u = u - v, A = A - C, B = B - D */
2171 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
2172 goto __ERR;
2175 if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
2176 goto __ERR;
2179 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
2180 goto __ERR;
2182 } else {
2183 /* v - v - u, C = C - A, D = D - B */
2184 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
2185 goto __ERR;
2188 if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
2189 goto __ERR;
2192 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
2193 goto __ERR;
2197 /* if not zero goto step 4 */
2198 if (mp_iszero (&u) == 0)
2199 goto top;
2201 /* now a = C, b = D, gcd == g*v */
2203 /* if v != 1 then there is no inverse */
2204 if (mp_cmp_d (&v, 1) != MP_EQ) {
2205 res = MP_VAL;
2206 goto __ERR;
2209 /* if its too low */
2210 while (mp_cmp_d(&C, 0) == MP_LT) {
2211 if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
2212 goto __ERR;
2216 /* too big */
2217 while (mp_cmp_mag(&C, b) != MP_LT) {
2218 if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
2219 goto __ERR;
2223 /* C is now the inverse */
2224 mp_exch (&C, c);
2225 res = MP_OKAY;
2226 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
2227 return res;
2230 /* c = |a| * |b| using Karatsuba Multiplication using
2231 * three half size multiplications
2233 * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2234 * let n represent half of the number of digits in
2235 * the min(a,b)
2237 * a = a1 * B**n + a0
2238 * b = b1 * B**n + b0
2240 * Then, a * b =>
2241 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
2243 * Note that a1b1 and a0b0 are used twice and only need to be
2244 * computed once. So in total three half size (half # of
2245 * digit) multiplications are performed, a0b0, a1b1 and
2246 * (a1-b1)(a0-b0)
2248 * Note that a multiplication of half the digits requires
2249 * 1/4th the number of single precision multiplications so in
2250 * total after one call 25% of the single precision multiplications
2251 * are saved. Note also that the call to mp_mul can end up back
2252 * in this function if the a0, a1, b0, or b1 are above the threshold.
2253 * This is known as divide-and-conquer and leads to the famous
2254 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
2255 * the standard O(N**2) that the baseline/comba methods use.
2256 * Generally though the overhead of this method doesn't pay off
2257 * until a certain size (N ~ 80) is reached.
2259 int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
2261 mp_int x0, x1, y0, y1, t1, x0y0, x1y1;
2262 int B, err;
2264 /* default the return code to an error */
2265 err = MP_MEM;
2267 /* min # of digits */
2268 B = MIN (a->used, b->used);
2270 /* now divide in two */
2271 B = B >> 1;
2273 /* init copy all the temps */
2274 if (mp_init_size (&x0, B) != MP_OKAY)
2275 goto ERR;
2276 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2277 goto X0;
2278 if (mp_init_size (&y0, B) != MP_OKAY)
2279 goto X1;
2280 if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2281 goto Y0;
2283 /* init temps */
2284 if (mp_init_size (&t1, B * 2) != MP_OKAY)
2285 goto Y1;
2286 if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2287 goto T1;
2288 if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2289 goto X0Y0;
2291 /* now shift the digits */
2292 x0.used = y0.used = B;
2293 x1.used = a->used - B;
2294 y1.used = b->used - B;
2297 register int x;
2298 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2300 /* we copy the digits directly instead of using higher level functions
2301 * since we also need to shift the digits
2303 tmpa = a->dp;
2304 tmpb = b->dp;
2306 tmpx = x0.dp;
2307 tmpy = y0.dp;
2308 for (x = 0; x < B; x++) {
2309 *tmpx++ = *tmpa++;
2310 *tmpy++ = *tmpb++;
2313 tmpx = x1.dp;
2314 for (x = B; x < a->used; x++) {
2315 *tmpx++ = *tmpa++;
2318 tmpy = y1.dp;
2319 for (x = B; x < b->used; x++) {
2320 *tmpy++ = *tmpb++;
2324 /* only need to clamp the lower words since by definition the
2325 * upper words x1/y1 must have a known number of digits
2327 mp_clamp (&x0);
2328 mp_clamp (&y0);
2330 /* now calc the products x0y0 and x1y1 */
2331 /* after this x0 is no longer required, free temp [x0==t2]! */
2332 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2333 goto X1Y1; /* x0y0 = x0*y0 */
2334 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2335 goto X1Y1; /* x1y1 = x1*y1 */
2337 /* now calc x1-x0 and y1-y0 */
2338 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2339 goto X1Y1; /* t1 = x1 - x0 */
2340 if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
2341 goto X1Y1; /* t2 = y1 - y0 */
2342 if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2343 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
2345 /* add x0y0 */
2346 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2347 goto X1Y1; /* t2 = x0y0 + x1y1 */
2348 if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
2349 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
2351 /* shift by B */
2352 if (mp_lshd (&t1, B) != MP_OKAY)
2353 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2354 if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2355 goto X1Y1; /* x1y1 = x1y1 << 2*B */
2357 if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2358 goto X1Y1; /* t1 = x0y0 + t1 */
2359 if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2360 goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
2362 /* Algorithm succeeded set the return code to MP_OKAY */
2363 err = MP_OKAY;
2365 X1Y1:mp_clear (&x1y1);
2366 X0Y0:mp_clear (&x0y0);
2367 T1:mp_clear (&t1);
2368 Y1:mp_clear (&y1);
2369 Y0:mp_clear (&y0);
2370 X1:mp_clear (&x1);
2371 X0:mp_clear (&x0);
2372 ERR:
2373 return err;
2376 /* Karatsuba squaring, computes b = a*a using three
2377 * half size squarings
2379 * See comments of karatsuba_mul for details. It
2380 * is essentially the same algorithm but merely
2381 * tuned to perform recursive squarings.
2383 int mp_karatsuba_sqr (mp_int * a, mp_int * b)
2385 mp_int x0, x1, t1, t2, x0x0, x1x1;
2386 int B, err;
2388 err = MP_MEM;
2390 /* min # of digits */
2391 B = a->used;
2393 /* now divide in two */
2394 B = B >> 1;
2396 /* init copy all the temps */
2397 if (mp_init_size (&x0, B) != MP_OKAY)
2398 goto ERR;
2399 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2400 goto X0;
2402 /* init temps */
2403 if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2404 goto X1;
2405 if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2406 goto T1;
2407 if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
2408 goto T2;
2409 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
2410 goto X0X0;
2413 register int x;
2414 register mp_digit *dst, *src;
2416 src = a->dp;
2418 /* now shift the digits */
2419 dst = x0.dp;
2420 for (x = 0; x < B; x++) {
2421 *dst++ = *src++;
2424 dst = x1.dp;
2425 for (x = B; x < a->used; x++) {
2426 *dst++ = *src++;
2430 x0.used = B;
2431 x1.used = a->used - B;
2433 mp_clamp (&x0);
2435 /* now calc the products x0*x0 and x1*x1 */
2436 if (mp_sqr (&x0, &x0x0) != MP_OKAY)
2437 goto X1X1; /* x0x0 = x0*x0 */
2438 if (mp_sqr (&x1, &x1x1) != MP_OKAY)
2439 goto X1X1; /* x1x1 = x1*x1 */
2441 /* now calc (x1-x0)**2 */
2442 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2443 goto X1X1; /* t1 = x1 - x0 */
2444 if (mp_sqr (&t1, &t1) != MP_OKAY)
2445 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */
2447 /* add x0y0 */
2448 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
2449 goto X1X1; /* t2 = x0x0 + x1x1 */
2450 if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
2451 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
2453 /* shift by B */
2454 if (mp_lshd (&t1, B) != MP_OKAY)
2455 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
2456 if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
2457 goto X1X1; /* x1x1 = x1x1 << 2*B */
2459 if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
2460 goto X1X1; /* t1 = x0x0 + t1 */
2461 if (mp_add (&t1, &x1x1, b) != MP_OKAY)
2462 goto X1X1; /* t1 = x0x0 + t1 + x1x1 */
2464 err = MP_OKAY;
2466 X1X1:mp_clear (&x1x1);
2467 X0X0:mp_clear (&x0x0);
2468 T2:mp_clear (&t2);
2469 T1:mp_clear (&t1);
2470 X1:mp_clear (&x1);
2471 X0:mp_clear (&x0);
2472 ERR:
2473 return err;
2476 /* computes least common multiple as |a*b|/(a, b) */
2477 int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
2479 int res;
2480 mp_int t1, t2;
2483 if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
2484 return res;
2487 /* t1 = get the GCD of the two inputs */
2488 if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
2489 goto __T;
2492 /* divide the smallest by the GCD */
2493 if (mp_cmp_mag(a, b) == MP_LT) {
2494 /* store quotient in t2 such that t2 * b is the LCM */
2495 if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
2496 goto __T;
2498 res = mp_mul(b, &t2, c);
2499 } else {
2500 /* store quotient in t2 such that t2 * a is the LCM */
2501 if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
2502 goto __T;
2504 res = mp_mul(a, &t2, c);
2507 /* fix the sign to positive */
2508 c->sign = MP_ZPOS;
2510 __T:
2511 mp_clear_multi (&t1, &t2, NULL);
2512 return res;
2515 /* shift left a certain amount of digits */
2516 int mp_lshd (mp_int * a, int b)
2518 int x, res;
2520 /* if its less than zero return */
2521 if (b <= 0) {
2522 return MP_OKAY;
2525 /* grow to fit the new digits */
2526 if (a->alloc < a->used + b) {
2527 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
2528 return res;
2533 register mp_digit *top, *bottom;
2535 /* increment the used by the shift amount then copy upwards */
2536 a->used += b;
2538 /* top */
2539 top = a->dp + a->used - 1;
2541 /* base */
2542 bottom = a->dp + a->used - 1 - b;
2544 /* much like mp_rshd this is implemented using a sliding window
2545 * except the window goes the otherway around. Copying from
2546 * the bottom to the top. see bn_mp_rshd.c for more info.
2548 for (x = a->used - 1; x >= b; x--) {
2549 *top-- = *bottom--;
2552 /* zero the lower digits */
2553 top = a->dp;
2554 for (x = 0; x < b; x++) {
2555 *top++ = 0;
2558 return MP_OKAY;
2561 /* c = a mod b, 0 <= c < b */
2563 mp_mod (mp_int * a, mp_int * b, mp_int * c)
2565 mp_int t;
2566 int res;
2568 if ((res = mp_init (&t)) != MP_OKAY) {
2569 return res;
2572 if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
2573 mp_clear (&t);
2574 return res;
2577 if (t.sign != b->sign) {
2578 res = mp_add (b, &t, c);
2579 } else {
2580 res = MP_OKAY;
2581 mp_exch (&t, c);
2584 mp_clear (&t);
2585 return res;
2588 /* calc a value mod 2**b */
2590 mp_mod_2d (mp_int * a, int b, mp_int * c)
2592 int x, res;
2594 /* if b is <= 0 then zero the int */
2595 if (b <= 0) {
2596 mp_zero (c);
2597 return MP_OKAY;
2600 /* if the modulus is larger than the value than return */
2601 if (b > (int) (a->used * DIGIT_BIT)) {
2602 res = mp_copy (a, c);
2603 return res;
2606 /* copy */
2607 if ((res = mp_copy (a, c)) != MP_OKAY) {
2608 return res;
2611 /* zero digits above the last digit of the modulus */
2612 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
2613 c->dp[x] = 0;
2615 /* clear the digit that is not completely outside/inside the modulus */
2616 c->dp[b / DIGIT_BIT] &=
2617 (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
2618 mp_clamp (c);
2619 return MP_OKAY;
2623 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
2625 return mp_div_d(a, b, NULL, c);
2629 * shifts with subtractions when the result is greater than b.
2631 * The method is slightly modified to shift B unconditionally upto just under
2632 * the leading bit of b. This saves a lot of multiple precision shifting.
2634 int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
2636 int x, bits, res;
2638 /* how many bits of last digit does b use */
2639 bits = mp_count_bits (b) % DIGIT_BIT;
2642 if (b->used > 1) {
2643 if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2644 return res;
2646 } else {
2647 mp_set(a, 1);
2648 bits = 1;
2652 /* now compute C = A * B mod b */
2653 for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
2654 if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2655 return res;
2657 if (mp_cmp_mag (a, b) != MP_LT) {
2658 if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2659 return res;
2664 return MP_OKAY;
2667 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
2669 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
2671 int ix, res, digs;
2672 mp_digit mu;
2674 /* can the fast reduction [comba] method be used?
2676 * Note that unlike in mul you're safely allowed *less*
2677 * than the available columns [255 per default] since carries
2678 * are fixed up in the inner loop.
2680 digs = n->used * 2 + 1;
2681 if ((digs < MP_WARRAY) &&
2682 n->used <
2683 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2684 return fast_mp_montgomery_reduce (x, n, rho);
2687 /* grow the input as required */
2688 if (x->alloc < digs) {
2689 if ((res = mp_grow (x, digs)) != MP_OKAY) {
2690 return res;
2693 x->used = digs;
2695 for (ix = 0; ix < n->used; ix++) {
2696 /* mu = ai * rho mod b
2698 * The value of rho must be precalculated via
2699 * montgomery_setup() such that
2700 * it equals -1/n0 mod b this allows the
2701 * following inner loop to reduce the
2702 * input one digit at a time
2704 mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
2706 /* a = a + mu * m * b**i */
2708 register int iy;
2709 register mp_digit *tmpn, *tmpx, u;
2710 register mp_word r;
2712 /* alias for digits of the modulus */
2713 tmpn = n->dp;
2715 /* alias for the digits of x [the input] */
2716 tmpx = x->dp + ix;
2718 /* set the carry to zero */
2719 u = 0;
2721 /* Multiply and add in place */
2722 for (iy = 0; iy < n->used; iy++) {
2723 /* compute product and sum */
2724 r = ((mp_word)mu) * ((mp_word)*tmpn++) +
2725 ((mp_word) u) + ((mp_word) * tmpx);
2727 /* get carry */
2728 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2730 /* fix digit */
2731 *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
2733 /* At this point the ix'th digit of x should be zero */
2736 /* propagate carries upwards as required*/
2737 while (u) {
2738 *tmpx += u;
2739 u = *tmpx >> DIGIT_BIT;
2740 *tmpx++ &= MP_MASK;
2745 /* at this point the n.used'th least
2746 * significant digits of x are all zero
2747 * which means we can shift x to the
2748 * right by n.used digits and the
2749 * residue is unchanged.
2752 /* x = x/b**n.used */
2753 mp_clamp(x);
2754 mp_rshd (x, n->used);
2756 /* if x >= n then x = x - n */
2757 if (mp_cmp_mag (x, n) != MP_LT) {
2758 return s_mp_sub (x, n, x);
2761 return MP_OKAY;
2764 /* setups the montgomery reduction stuff */
2766 mp_montgomery_setup (mp_int * n, mp_digit * rho)
2768 mp_digit x, b;
2770 /* fast inversion mod 2**k
2772 * Based on the fact that
2774 * XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
2775 * => 2*X*A - X*X*A*A = 1
2776 * => 2*(1) - (1) = 1
2778 b = n->dp[0];
2780 if ((b & 1) == 0) {
2781 return MP_VAL;
2784 x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2785 x *= 2 - b * x; /* here x*a==1 mod 2**8 */
2786 x *= 2 - b * x; /* here x*a==1 mod 2**16 */
2787 x *= 2 - b * x; /* here x*a==1 mod 2**32 */
2789 /* rho = -1/m mod b */
2790 *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2792 return MP_OKAY;
2795 /* high level multiplication (handles sign) */
2796 int mp_mul (mp_int * a, mp_int * b, mp_int * c)
2798 int res, neg;
2799 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2801 /* use Karatsuba? */
2802 if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2803 res = mp_karatsuba_mul (a, b, c);
2804 } else
2806 /* can we use the fast multiplier?
2808 * The fast multiplier can be used if the output will
2809 * have less than MP_WARRAY digits and the number of
2810 * digits won't affect carry propagation
2812 int digs = a->used + b->used + 1;
2814 if ((digs < MP_WARRAY) &&
2815 MIN(a->used, b->used) <=
2816 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2817 res = fast_s_mp_mul_digs (a, b, c, digs);
2818 } else
2819 res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2821 c->sign = (c->used > 0) ? neg : MP_ZPOS;
2822 return res;
2825 /* b = a*2 */
2826 int mp_mul_2(mp_int * a, mp_int * b)
2828 int x, res, oldused;
2830 /* grow to accommodate result */
2831 if (b->alloc < a->used + 1) {
2832 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2833 return res;
2837 oldused = b->used;
2838 b->used = a->used;
2841 register mp_digit r, rr, *tmpa, *tmpb;
2843 /* alias for source */
2844 tmpa = a->dp;
2846 /* alias for dest */
2847 tmpb = b->dp;
2849 /* carry */
2850 r = 0;
2851 for (x = 0; x < a->used; x++) {
2853 /* get what will be the *next* carry bit from the
2854 * MSB of the current digit
2856 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2858 /* now shift up this digit, add in the carry [from the previous] */
2859 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2861 /* copy the carry that would be from the source
2862 * digit into the next iteration
2864 r = rr;
2867 /* new leading digit? */
2868 if (r != 0) {
2869 /* add a MSB which is always 1 at this point */
2870 *tmpb = 1;
2871 ++(b->used);
2874 /* now zero any excess digits on the destination
2875 * that we didn't write to
2877 tmpb = b->dp + b->used;
2878 for (x = b->used; x < oldused; x++) {
2879 *tmpb++ = 0;
2882 b->sign = a->sign;
2883 return MP_OKAY;
2886 /* shift left by a certain bit count */
2887 int mp_mul_2d (mp_int * a, int b, mp_int * c)
2889 mp_digit d;
2890 int res;
2892 /* copy */
2893 if (a != c) {
2894 if ((res = mp_copy (a, c)) != MP_OKAY) {
2895 return res;
2899 if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
2900 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
2901 return res;
2905 /* shift by as many digits in the bit count */
2906 if (b >= (int)DIGIT_BIT) {
2907 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
2908 return res;
2912 /* shift any bit count < DIGIT_BIT */
2913 d = (mp_digit) (b % DIGIT_BIT);
2914 if (d != 0) {
2915 register mp_digit *tmpc, shift, mask, r, rr;
2916 register int x;
2918 /* bitmask for carries */
2919 mask = (((mp_digit)1) << d) - 1;
2921 /* shift for msbs */
2922 shift = DIGIT_BIT - d;
2924 /* alias */
2925 tmpc = c->dp;
2927 /* carry */
2928 r = 0;
2929 for (x = 0; x < c->used; x++) {
2930 /* get the higher bits of the current word */
2931 rr = (*tmpc >> shift) & mask;
2933 /* shift the current word and OR in the carry */
2934 *tmpc = ((*tmpc << d) | r) & MP_MASK;
2935 ++tmpc;
2937 /* set the carry to the carry bits of the current word */
2938 r = rr;
2941 /* set final carry */
2942 if (r != 0) {
2943 c->dp[(c->used)++] = r;
2946 mp_clamp (c);
2947 return MP_OKAY;
2950 /* multiply by a digit */
2952 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
2954 mp_digit u, *tmpa, *tmpc;
2955 mp_word r;
2956 int ix, res, olduse;
2958 /* make sure c is big enough to hold a*b */
2959 if (c->alloc < a->used + 1) {
2960 if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
2961 return res;
2965 /* get the original destinations used count */
2966 olduse = c->used;
2968 /* set the sign */
2969 c->sign = a->sign;
2971 /* alias for a->dp [source] */
2972 tmpa = a->dp;
2974 /* alias for c->dp [dest] */
2975 tmpc = c->dp;
2977 /* zero carry */
2978 u = 0;
2980 /* compute columns */
2981 for (ix = 0; ix < a->used; ix++) {
2982 /* compute product and carry sum for this term */
2983 r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
2985 /* mask off higher bits to get a single digit */
2986 *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
2988 /* send carry into next iteration */
2989 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2992 /* store final carry [if any] */
2993 *tmpc++ = u;
2995 /* now zero digits above the top */
2996 while (ix++ < olduse) {
2997 *tmpc++ = 0;
3000 /* set used count */
3001 c->used = a->used + 1;
3002 mp_clamp(c);
3004 return MP_OKAY;
3007 /* d = a * b (mod c) */
3009 mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
3011 int res;
3012 mp_int t;
3014 if ((res = mp_init (&t)) != MP_OKAY) {
3015 return res;
3018 if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3019 mp_clear (&t);
3020 return res;
3022 res = mp_mod (&t, c, d);
3023 mp_clear (&t);
3024 return res;
3027 /* determines if an integers is divisible by one
3028 * of the first PRIME_SIZE primes or not
3030 * sets result to 0 if not, 1 if yes
3032 int mp_prime_is_divisible (mp_int * a, int *result)
3034 int err, ix;
3035 mp_digit res;
3037 /* default to not */
3038 *result = MP_NO;
3040 for (ix = 0; ix < PRIME_SIZE; ix++) {
3041 /* what is a mod __prime_tab[ix] */
3042 if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
3043 return err;
3046 /* is the residue zero? */
3047 if (res == 0) {
3048 *result = MP_YES;
3049 return MP_OKAY;
3053 return MP_OKAY;
3056 /* performs a variable number of rounds of Miller-Rabin
3058 * Probability of error after t rounds is no more than
3061 * Sets result to 1 if probably prime, 0 otherwise
3063 int mp_prime_is_prime (mp_int * a, int t, int *result)
3065 mp_int b;
3066 int ix, err, res;
3068 /* default to no */
3069 *result = MP_NO;
3071 /* valid value of t? */
3072 if (t <= 0 || t > PRIME_SIZE) {
3073 return MP_VAL;
3076 /* is the input equal to one of the primes in the table? */
3077 for (ix = 0; ix < PRIME_SIZE; ix++) {
3078 if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
3079 *result = 1;
3080 return MP_OKAY;
3084 /* first perform trial division */
3085 if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
3086 return err;
3089 /* return if it was trivially divisible */
3090 if (res == MP_YES) {
3091 return MP_OKAY;
3094 /* now perform the miller-rabin rounds */
3095 if ((err = mp_init (&b)) != MP_OKAY) {
3096 return err;
3099 for (ix = 0; ix < t; ix++) {
3100 /* set the prime */
3101 mp_set (&b, __prime_tab[ix]);
3103 if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
3104 goto __B;
3107 if (res == MP_NO) {
3108 goto __B;
3112 /* passed the test */
3113 *result = MP_YES;
3114 __B:mp_clear (&b);
3115 return err;
3118 /* Miller-Rabin test of "a" to the base of "b" as described in
3119 * HAC pp. 139 Algorithm 4.24
3121 * Sets result to 0 if definitely composite or 1 if probably prime.
3122 * Randomly the chance of error is no more than 1/4 and often
3123 * very much lower.
3125 int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
3127 mp_int n1, y, r;
3128 int s, j, err;
3130 /* default */
3131 *result = MP_NO;
3133 /* ensure b > 1 */
3134 if (mp_cmp_d(b, 1) != MP_GT) {
3135 return MP_VAL;
3138 /* get n1 = a - 1 */
3139 if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
3140 return err;
3142 if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
3143 goto __N1;
3146 /* set 2**s * r = n1 */
3147 if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
3148 goto __N1;
3151 /* count the number of least significant bits
3152 * which are zero
3154 s = mp_cnt_lsb(&r);
3156 /* now divide n - 1 by 2**s */
3157 if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
3158 goto __R;
3161 /* compute y = b**r mod a */
3162 if ((err = mp_init (&y)) != MP_OKAY) {
3163 goto __R;
3165 if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
3166 goto __Y;
3169 /* if y != 1 and y != n1 do */
3170 if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
3171 j = 1;
3172 /* while j <= s-1 and y != n1 */
3173 while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
3174 if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
3175 goto __Y;
3178 /* if y == 1 then composite */
3179 if (mp_cmp_d (&y, 1) == MP_EQ) {
3180 goto __Y;
3183 ++j;
3186 /* if y != n1 then composite */
3187 if (mp_cmp (&y, &n1) != MP_EQ) {
3188 goto __Y;
3192 /* probably prime now */
3193 *result = MP_YES;
3194 __Y:mp_clear (&y);
3195 __R:mp_clear (&r);
3196 __N1:mp_clear (&n1);
3197 return err;
3200 static const struct {
3201 int k, t;
3202 } sizes[] = {
3203 { 128, 28 },
3204 { 256, 16 },
3205 { 384, 10 },
3206 { 512, 7 },
3207 { 640, 6 },
3208 { 768, 5 },
3209 { 896, 4 },
3210 { 1024, 4 }
3213 /* returns # of RM trials required for a given bit size */
3214 int mp_prime_rabin_miller_trials(int size)
3216 int x;
3218 for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
3219 if (sizes[x].k == size) {
3220 return sizes[x].t;
3221 } else if (sizes[x].k > size) {
3222 return (x == 0) ? sizes[0].t : sizes[x - 1].t;
3225 return sizes[x-1].t + 1;
3228 /* makes a truly random prime of a given size (bits),
3230 * Flags are as follows:
3232 * LTM_PRIME_BBS - make prime congruent to 3 mod 4
3233 * LTM_PRIME_SAFE - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
3234 * LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
3235 * LTM_PRIME_2MSB_ON - make the 2nd highest bit one
3237 * You have to supply a callback which fills in a buffer with random bytes. "dat" is a parameter you can
3238 * have passed to the callback (e.g. a state or something). This function doesn't use "dat" itself
3239 * so it can be NULL
3243 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
3244 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
3246 unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
3247 int res, err, bsize, maskOR_msb_offset;
3249 /* sanity check the input */
3250 if (size <= 1 || t <= 0) {
3251 return MP_VAL;
3254 /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
3255 if (flags & LTM_PRIME_SAFE) {
3256 flags |= LTM_PRIME_BBS;
3259 /* calc the byte size */
3260 bsize = (size>>3)+((size&7)?1:0);
3262 /* we need a buffer of bsize bytes */
3263 tmp = malloc(bsize);
3264 if (tmp == NULL) {
3265 return MP_MEM;
3268 /* calc the maskAND value for the MSbyte*/
3269 maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
3271 /* calc the maskOR_msb */
3272 maskOR_msb = 0;
3273 maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
3274 if (flags & LTM_PRIME_2MSB_ON) {
3275 maskOR_msb |= 1 << ((size - 2) & 7);
3276 } else if (flags & LTM_PRIME_2MSB_OFF) {
3277 maskAND &= ~(1 << ((size - 2) & 7));
3280 /* get the maskOR_lsb */
3281 maskOR_lsb = 0;
3282 if (flags & LTM_PRIME_BBS) {
3283 maskOR_lsb |= 3;
3286 do {
3287 /* read the bytes */
3288 if (cb(tmp, bsize, dat) != bsize) {
3289 err = MP_VAL;
3290 goto error;
3293 /* work over the MSbyte */
3294 tmp[0] &= maskAND;
3295 tmp[0] |= 1 << ((size - 1) & 7);
3297 /* mix in the maskORs */
3298 tmp[maskOR_msb_offset] |= maskOR_msb;
3299 tmp[bsize-1] |= maskOR_lsb;
3301 /* read it in */
3302 if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY) { goto error; }
3304 /* is it prime? */
3305 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3306 if (res == MP_NO) {
3307 continue;
3310 if (flags & LTM_PRIME_SAFE) {
3311 /* see if (a-1)/2 is prime */
3312 if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) { goto error; }
3313 if ((err = mp_div_2(a, a)) != MP_OKAY) { goto error; }
3315 /* is it prime? */
3316 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3318 } while (res == MP_NO);
3320 if (flags & LTM_PRIME_SAFE) {
3321 /* restore a to the original value */
3322 if ((err = mp_mul_2(a, a)) != MP_OKAY) { goto error; }
3323 if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { goto error; }
3326 err = MP_OKAY;
3327 error:
3328 free(tmp);
3329 return err;
3332 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
3334 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
3336 int res;
3338 /* make sure there are at least two digits */
3339 if (a->alloc < 2) {
3340 if ((res = mp_grow(a, 2)) != MP_OKAY) {
3341 return res;
3345 /* zero the int */
3346 mp_zero (a);
3348 /* read the bytes in */
3349 while (c-- > 0) {
3350 if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
3351 return res;
3354 a->dp[0] |= *b++;
3355 a->used += 1;
3357 mp_clamp (a);
3358 return MP_OKAY;
3361 /* reduces x mod m, assumes 0 < x < m**2, mu is
3362 * precomputed via mp_reduce_setup.
3363 * From HAC pp.604 Algorithm 14.42
3366 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
3368 mp_int q;
3369 int res, um = m->used;
3371 /* q = x */
3372 if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
3373 return res;
3376 /* q1 = x / b**(k-1) */
3377 mp_rshd (&q, um - 1);
3379 /* according to HAC this optimization is ok */
3380 if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
3381 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
3382 goto CLEANUP;
3384 } else {
3385 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
3386 goto CLEANUP;
3390 /* q3 = q2 / b**(k+1) */
3391 mp_rshd (&q, um + 1);
3393 /* x = x mod b**(k+1), quick (no division) */
3394 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
3395 goto CLEANUP;
3398 /* q = q * m mod b**(k+1), quick (no division) */
3399 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
3400 goto CLEANUP;
3403 /* x = x - q */
3404 if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
3405 goto CLEANUP;
3408 /* If x < 0, add b**(k+1) to it */
3409 if (mp_cmp_d (x, 0) == MP_LT) {
3410 mp_set (&q, 1);
3411 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
3412 goto CLEANUP;
3413 if ((res = mp_add (x, &q, x)) != MP_OKAY)
3414 goto CLEANUP;
3417 /* Back off if it's too big */
3418 while (mp_cmp (x, m) != MP_LT) {
3419 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
3420 goto CLEANUP;
3424 CLEANUP:
3425 mp_clear (&q);
3427 return res;
3430 /* reduces a modulo n where n is of the form 2**p - d */
3432 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
3434 mp_int q;
3435 int p, res;
3437 if ((res = mp_init(&q)) != MP_OKAY) {
3438 return res;
3441 p = mp_count_bits(n);
3442 top:
3443 /* q = a/2**p, a = a mod 2**p */
3444 if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
3445 goto ERR;
3448 if (d != 1) {
3449 /* q = q * d */
3450 if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
3451 goto ERR;
3455 /* a = a + q */
3456 if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
3457 goto ERR;
3460 if (mp_cmp_mag(a, n) != MP_LT) {
3461 s_mp_sub(a, n, a);
3462 goto top;
3465 ERR:
3466 mp_clear(&q);
3467 return res;
3470 /* determines the setup value */
3471 int
3472 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
3474 int res, p;
3475 mp_int tmp;
3477 if ((res = mp_init(&tmp)) != MP_OKAY) {
3478 return res;
3481 p = mp_count_bits(a);
3482 if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3483 mp_clear(&tmp);
3484 return res;
3487 if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3488 mp_clear(&tmp);
3489 return res;
3492 *d = tmp.dp[0];
3493 mp_clear(&tmp);
3494 return MP_OKAY;
3497 /* pre-calculate the value required for Barrett reduction
3498 * For a given modulus "b" it calulates the value required in "a"
3500 int mp_reduce_setup (mp_int * a, mp_int * b)
3502 int res;
3504 if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
3505 return res;
3507 return mp_div (a, b, a, NULL);
3510 /* shift right a certain amount of digits */
3511 void mp_rshd (mp_int * a, int b)
3513 int x;
3515 /* if b <= 0 then ignore it */
3516 if (b <= 0) {
3517 return;
3520 /* if b > used then simply zero it and return */
3521 if (a->used <= b) {
3522 mp_zero (a);
3523 return;
3527 register mp_digit *bottom, *top;
3529 /* shift the digits down */
3531 /* bottom */
3532 bottom = a->dp;
3534 /* top [offset into digits] */
3535 top = a->dp + b;
3537 /* this is implemented as a sliding window where
3538 * the window is b-digits long and digits from
3539 * the top of the window are copied to the bottom
3541 * e.g.
3543 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ---->
3544 /\ | ---->
3545 \-------------------/ ---->
3547 for (x = 0; x < (a->used - b); x++) {
3548 *bottom++ = *top++;
3551 /* zero the top digits */
3552 for (; x < a->used; x++) {
3553 *bottom++ = 0;
3557 /* remove excess digits */
3558 a->used -= b;
3561 /* set to a digit */
3562 void mp_set (mp_int * a, mp_digit b)
3564 mp_zero (a);
3565 a->dp[0] = b & MP_MASK;
3566 a->used = (a->dp[0] != 0) ? 1 : 0;
3569 /* set a 32-bit const */
3570 int mp_set_int (mp_int * a, unsigned long b)
3572 int x, res;
3574 mp_zero (a);
3576 /* set four bits at a time */
3577 for (x = 0; x < 8; x++) {
3578 /* shift the number up four bits */
3579 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
3580 return res;
3583 /* OR in the top four bits of the source */
3584 a->dp[0] |= (b >> 28) & 15;
3586 /* shift the source up to the next four bits */
3587 b <<= 4;
3589 /* ensure that digits are not clamped off */
3590 a->used += 1;
3592 mp_clamp (a);
3593 return MP_OKAY;
3596 /* shrink a bignum */
3597 int mp_shrink (mp_int * a)
3599 mp_digit *tmp;
3600 if (a->alloc != a->used && a->used > 0) {
3601 if ((tmp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3602 return MP_MEM;
3604 a->dp = tmp;
3605 a->alloc = a->used;
3607 return MP_OKAY;
3610 /* get the size for an signed equivalent */
3611 int mp_signed_bin_size (mp_int * a)
3613 return 1 + mp_unsigned_bin_size (a);
3616 /* computes b = a*a */
3618 mp_sqr (mp_int * a, mp_int * b)
3620 int res;
3622 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3623 res = mp_karatsuba_sqr (a, b);
3624 } else
3626 /* can we use the fast comba multiplier? */
3627 if ((a->used * 2 + 1) < MP_WARRAY &&
3628 a->used <
3629 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3630 res = fast_s_mp_sqr (a, b);
3631 } else
3632 res = s_mp_sqr (a, b);
3634 b->sign = MP_ZPOS;
3635 return res;
3638 /* c = a * a (mod b) */
3640 mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
3642 int res;
3643 mp_int t;
3645 if ((res = mp_init (&t)) != MP_OKAY) {
3646 return res;
3649 if ((res = mp_sqr (a, &t)) != MP_OKAY) {
3650 mp_clear (&t);
3651 return res;
3653 res = mp_mod (&t, b, c);
3654 mp_clear (&t);
3655 return res;
3658 /* high level subtraction (handles signs) */
3660 mp_sub (mp_int * a, mp_int * b, mp_int * c)
3662 int sa, sb, res;
3664 sa = a->sign;
3665 sb = b->sign;
3667 if (sa != sb) {
3668 /* subtract a negative from a positive, OR */
3669 /* subtract a positive from a negative. */
3670 /* In either case, ADD their magnitudes, */
3671 /* and use the sign of the first number. */
3672 c->sign = sa;
3673 res = s_mp_add (a, b, c);
3674 } else {
3675 /* subtract a positive from a positive, OR */
3676 /* subtract a negative from a negative. */
3677 /* First, take the difference between their */
3678 /* magnitudes, then... */
3679 if (mp_cmp_mag (a, b) != MP_LT) {
3680 /* Copy the sign from the first */
3681 c->sign = sa;
3682 /* The first has a larger or equal magnitude */
3683 res = s_mp_sub (a, b, c);
3684 } else {
3685 /* The result has the *opposite* sign from */
3686 /* the first number. */
3687 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
3688 /* The second has a larger magnitude */
3689 res = s_mp_sub (b, a, c);
3692 return res;
3695 /* single digit subtraction */
3697 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3699 mp_digit *tmpa, *tmpc, mu;
3700 int res, ix, oldused;
3702 /* grow c as required */
3703 if (c->alloc < a->used + 1) {
3704 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3705 return res;
3709 /* if a is negative just do an unsigned
3710 * addition [with fudged signs]
3712 if (a->sign == MP_NEG) {
3713 a->sign = MP_ZPOS;
3714 res = mp_add_d(a, b, c);
3715 a->sign = c->sign = MP_NEG;
3716 return res;
3719 /* setup regs */
3720 oldused = c->used;
3721 tmpa = a->dp;
3722 tmpc = c->dp;
3724 /* if a <= b simply fix the single digit */
3725 if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3726 if (a->used == 1) {
3727 *tmpc++ = b - *tmpa;
3728 } else {
3729 *tmpc++ = b;
3731 ix = 1;
3733 /* negative/1digit */
3734 c->sign = MP_NEG;
3735 c->used = 1;
3736 } else {
3737 /* positive/size */
3738 c->sign = MP_ZPOS;
3739 c->used = a->used;
3741 /* subtract first digit */
3742 *tmpc = *tmpa++ - b;
3743 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3744 *tmpc++ &= MP_MASK;
3746 /* handle rest of the digits */
3747 for (ix = 1; ix < a->used; ix++) {
3748 *tmpc = *tmpa++ - mu;
3749 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3750 *tmpc++ &= MP_MASK;
3754 /* zero excess digits */
3755 while (ix++ < oldused) {
3756 *tmpc++ = 0;
3758 mp_clamp(c);
3759 return MP_OKAY;
3762 /* store in unsigned [big endian] format */
3764 mp_to_unsigned_bin (mp_int * a, unsigned char *b)
3766 int x, res;
3767 mp_int t;
3769 if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
3770 return res;
3773 x = 0;
3774 while (mp_iszero (&t) == 0) {
3775 b[x++] = (unsigned char) (t.dp[0] & 255);
3776 if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
3777 mp_clear (&t);
3778 return res;
3781 bn_reverse (b, x);
3782 mp_clear (&t);
3783 return MP_OKAY;
3786 /* get the size for an unsigned equivalent */
3788 mp_unsigned_bin_size (mp_int * a)
3790 int size = mp_count_bits (a);
3791 return (size / 8 + ((size & 7) != 0 ? 1 : 0));
3794 /* set to zero */
3795 void
3796 mp_zero (mp_int * a)
3798 a->sign = MP_ZPOS;
3799 a->used = 0;
3800 memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
3803 const mp_digit __prime_tab[] = {
3804 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
3805 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
3806 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
3807 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
3808 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
3809 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
3810 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
3811 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
3813 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
3814 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
3815 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
3816 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
3817 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
3818 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
3819 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
3820 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
3822 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
3823 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
3824 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
3825 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
3826 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
3827 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
3828 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
3829 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
3831 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
3832 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
3833 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
3834 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
3835 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
3836 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
3837 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
3838 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
3841 /* reverse an array, used for radix code */
3842 void
3843 bn_reverse (unsigned char *s, int len)
3845 int ix, iy;
3846 unsigned char t;
3848 ix = 0;
3849 iy = len - 1;
3850 while (ix < iy) {
3851 t = s[ix];
3852 s[ix] = s[iy];
3853 s[iy] = t;
3854 ++ix;
3855 --iy;
3859 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
3861 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3863 mp_int *x;
3864 int olduse, res, min, max;
3866 /* find sizes, we let |a| <= |b| which means we have to sort
3867 * them. "x" will point to the input with the most digits
3869 if (a->used > b->used) {
3870 min = b->used;
3871 max = a->used;
3872 x = a;
3873 } else {
3874 min = a->used;
3875 max = b->used;
3876 x = b;
3879 /* init result */
3880 if (c->alloc < max + 1) {
3881 if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3882 return res;
3886 /* get old used digit count and set new one */
3887 olduse = c->used;
3888 c->used = max + 1;
3891 register mp_digit u, *tmpa, *tmpb, *tmpc;
3892 register int i;
3894 /* alias for digit pointers */
3896 /* first input */
3897 tmpa = a->dp;
3899 /* second input */
3900 tmpb = b->dp;
3902 /* destination */
3903 tmpc = c->dp;
3905 /* zero the carry */
3906 u = 0;
3907 for (i = 0; i < min; i++) {
3908 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
3909 *tmpc = *tmpa++ + *tmpb++ + u;
3911 /* U = carry bit of T[i] */
3912 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3914 /* take away carry bit from T[i] */
3915 *tmpc++ &= MP_MASK;
3918 /* now copy higher words if any, that is in A+B
3919 * if A or B has more digits add those in
3921 if (min != max) {
3922 for (; i < max; i++) {
3923 /* T[i] = X[i] + U */
3924 *tmpc = x->dp[i] + u;
3926 /* U = carry bit of T[i] */
3927 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3929 /* take away carry bit from T[i] */
3930 *tmpc++ &= MP_MASK;
3934 /* add carry */
3935 *tmpc++ = u;
3937 /* clear digits above oldused */
3938 for (i = c->used; i < olduse; i++) {
3939 *tmpc++ = 0;
3943 mp_clamp (c);
3944 return MP_OKAY;
3947 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
3949 mp_int M[256], res, mu;
3950 mp_digit buf;
3951 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3953 /* find window size */
3954 x = mp_count_bits (X);
3955 if (x <= 7) {
3956 winsize = 2;
3957 } else if (x <= 36) {
3958 winsize = 3;
3959 } else if (x <= 140) {
3960 winsize = 4;
3961 } else if (x <= 450) {
3962 winsize = 5;
3963 } else if (x <= 1303) {
3964 winsize = 6;
3965 } else if (x <= 3529) {
3966 winsize = 7;
3967 } else {
3968 winsize = 8;
3971 /* init M array */
3972 /* init first cell */
3973 if ((err = mp_init(&M[1])) != MP_OKAY) {
3974 return err;
3977 /* now init the second half of the array */
3978 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3979 if ((err = mp_init(&M[x])) != MP_OKAY) {
3980 for (y = 1<<(winsize-1); y < x; y++) {
3981 mp_clear (&M[y]);
3983 mp_clear(&M[1]);
3984 return err;
3988 /* create mu, used for Barrett reduction */
3989 if ((err = mp_init (&mu)) != MP_OKAY) {
3990 goto __M;
3992 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
3993 goto __MU;
3996 /* create M table
3998 * The M table contains powers of the base,
3999 * e.g. M[x] = G**x mod P
4001 * The first half of the table is not
4002 * computed though accept for M[0] and M[1]
4004 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
4005 goto __MU;
4008 /* compute the value at M[1<<(winsize-1)] by squaring
4009 * M[1] (winsize-1) times
4011 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4012 goto __MU;
4015 for (x = 0; x < (winsize - 1); x++) {
4016 if ((err = mp_sqr (&M[1 << (winsize - 1)],
4017 &M[1 << (winsize - 1)])) != MP_OKAY) {
4018 goto __MU;
4020 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
4021 goto __MU;
4025 /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
4026 * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
4028 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4029 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4030 goto __MU;
4032 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
4033 goto __MU;
4037 /* setup result */
4038 if ((err = mp_init (&res)) != MP_OKAY) {
4039 goto __MU;
4041 mp_set (&res, 1);
4043 /* set initial mode and bit cnt */
4044 mode = 0;
4045 bitcnt = 1;
4046 buf = 0;
4047 digidx = X->used - 1;
4048 bitcpy = 0;
4049 bitbuf = 0;
4051 for (;;) {
4052 /* grab next digit as required */
4053 if (--bitcnt == 0) {
4054 /* if digidx == -1 we are out of digits */
4055 if (digidx == -1) {
4056 break;
4058 /* read next digit and reset the bitcnt */
4059 buf = X->dp[digidx--];
4060 bitcnt = (int) DIGIT_BIT;
4063 /* grab the next msb from the exponent */
4064 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
4065 buf <<= (mp_digit)1;
4067 /* if the bit is zero and mode == 0 then we ignore it
4068 * These represent the leading zero bits before the first 1 bit
4069 * in the exponent. Technically this opt is not required but it
4070 * does lower the # of trivial squaring/reductions used
4072 if (mode == 0 && y == 0) {
4073 continue;
4076 /* if the bit is zero and mode == 1 then we square */
4077 if (mode == 1 && y == 0) {
4078 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4079 goto __RES;
4081 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4082 goto __RES;
4084 continue;
4087 /* else we add it to the window */
4088 bitbuf |= (y << (winsize - ++bitcpy));
4089 mode = 2;
4091 if (bitcpy == winsize) {
4092 /* ok window is filled so square as required and multiply */
4093 /* square first */
4094 for (x = 0; x < winsize; x++) {
4095 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4096 goto __RES;
4098 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4099 goto __RES;
4103 /* then multiply */
4104 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4105 goto __RES;
4107 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4108 goto __RES;
4111 /* empty window and reset */
4112 bitcpy = 0;
4113 bitbuf = 0;
4114 mode = 1;
4118 /* if bits remain then square/multiply */
4119 if (mode == 2 && bitcpy > 0) {
4120 /* square then multiply if the bit is set */
4121 for (x = 0; x < bitcpy; x++) {
4122 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4123 goto __RES;
4125 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4126 goto __RES;
4129 bitbuf <<= 1;
4130 if ((bitbuf & (1 << winsize)) != 0) {
4131 /* then multiply */
4132 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4133 goto __RES;
4135 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4136 goto __RES;
4142 mp_exch (&res, Y);
4143 err = MP_OKAY;
4144 __RES:mp_clear (&res);
4145 __MU:mp_clear (&mu);
4146 __M:
4147 mp_clear(&M[1]);
4148 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4149 mp_clear (&M[x]);
4151 return err;
4154 /* multiplies |a| * |b| and only computes upto digs digits of result
4155 * HAC pp. 595, Algorithm 14.12 Modified so you can control how
4156 * many digits of output are created.
4159 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
4161 mp_int t;
4162 int res, pa, pb, ix, iy;
4163 mp_digit u;
4164 mp_word r;
4165 mp_digit tmpx, *tmpt, *tmpy;
4167 /* can we use the fast multiplier? */
4168 if (((digs) < MP_WARRAY) &&
4169 MIN (a->used, b->used) <
4170 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4171 return fast_s_mp_mul_digs (a, b, c, digs);
4174 if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
4175 return res;
4177 t.used = digs;
4179 /* compute the digits of the product directly */
4180 pa = a->used;
4181 for (ix = 0; ix < pa; ix++) {
4182 /* set the carry to zero */
4183 u = 0;
4185 /* limit ourselves to making digs digits of output */
4186 pb = MIN (b->used, digs - ix);
4188 /* setup some aliases */
4189 /* copy of the digit from a used within the nested loop */
4190 tmpx = a->dp[ix];
4192 /* an alias for the destination shifted ix places */
4193 tmpt = t.dp + ix;
4195 /* an alias for the digits of b */
4196 tmpy = b->dp;
4198 /* compute the columns of the output and propagate the carry */
4199 for (iy = 0; iy < pb; iy++) {
4200 /* compute the column as a mp_word */
4201 r = ((mp_word)*tmpt) +
4202 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4203 ((mp_word) u);
4205 /* the new column is the lower part of the result */
4206 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4208 /* get the carry word from the result */
4209 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4211 /* set carry if it is placed below digs */
4212 if (ix + iy < digs) {
4213 *tmpt = u;
4217 mp_clamp (&t);
4218 mp_exch (&t, c);
4220 mp_clear (&t);
4221 return MP_OKAY;
4224 /* multiplies |a| * |b| and does not compute the lower digs digits
4225 * [meant to get the higher part of the product]
4228 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
4230 mp_int t;
4231 int res, pa, pb, ix, iy;
4232 mp_digit u;
4233 mp_word r;
4234 mp_digit tmpx, *tmpt, *tmpy;
4236 /* can we use the fast multiplier? */
4237 if (((a->used + b->used + 1) < MP_WARRAY)
4238 && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4239 return fast_s_mp_mul_high_digs (a, b, c, digs);
4242 if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
4243 return res;
4245 t.used = a->used + b->used + 1;
4247 pa = a->used;
4248 pb = b->used;
4249 for (ix = 0; ix < pa; ix++) {
4250 /* clear the carry */
4251 u = 0;
4253 /* left hand side of A[ix] * B[iy] */
4254 tmpx = a->dp[ix];
4256 /* alias to the address of where the digits will be stored */
4257 tmpt = &(t.dp[digs]);
4259 /* alias for where to read the right hand side from */
4260 tmpy = b->dp + (digs - ix);
4262 for (iy = digs - ix; iy < pb; iy++) {
4263 /* calculate the double precision result */
4264 r = ((mp_word)*tmpt) +
4265 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4266 ((mp_word) u);
4268 /* get the lower part */
4269 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4271 /* carry the carry */
4272 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4274 *tmpt = u;
4276 mp_clamp (&t);
4277 mp_exch (&t, c);
4278 mp_clear (&t);
4279 return MP_OKAY;
4282 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
4284 s_mp_sqr (mp_int * a, mp_int * b)
4286 mp_int t;
4287 int res, ix, iy, pa;
4288 mp_word r;
4289 mp_digit u, tmpx, *tmpt;
4291 pa = a->used;
4292 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
4293 return res;
4296 /* default used is maximum possible size */
4297 t.used = 2*pa + 1;
4299 for (ix = 0; ix < pa; ix++) {
4300 /* first calculate the digit at 2*ix */
4301 /* calculate double precision result */
4302 r = ((mp_word) t.dp[2*ix]) +
4303 ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
4305 /* store lower part in result */
4306 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
4308 /* get the carry */
4309 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4311 /* left hand side of A[ix] * A[iy] */
4312 tmpx = a->dp[ix];
4314 /* alias for where to store the results */
4315 tmpt = t.dp + (2*ix + 1);
4317 for (iy = ix + 1; iy < pa; iy++) {
4318 /* first calculate the product */
4319 r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
4321 /* now calculate the double precision result, note we use
4322 * addition instead of *2 since it's easier to optimize
4324 r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
4326 /* store lower part */
4327 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4329 /* get carry */
4330 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4332 /* propagate upwards */
4333 while (u != ((mp_digit) 0)) {
4334 r = ((mp_word) *tmpt) + ((mp_word) u);
4335 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4336 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4340 mp_clamp (&t);
4341 mp_exch (&t, b);
4342 mp_clear (&t);
4343 return MP_OKAY;
4346 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
4348 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
4350 int olduse, res, min, max;
4352 /* find sizes */
4353 min = b->used;
4354 max = a->used;
4356 /* init result */
4357 if (c->alloc < max) {
4358 if ((res = mp_grow (c, max)) != MP_OKAY) {
4359 return res;
4362 olduse = c->used;
4363 c->used = max;
4366 register mp_digit u, *tmpa, *tmpb, *tmpc;
4367 register int i;
4369 /* alias for digit pointers */
4370 tmpa = a->dp;
4371 tmpb = b->dp;
4372 tmpc = c->dp;
4374 /* set carry to zero */
4375 u = 0;
4376 for (i = 0; i < min; i++) {
4377 /* T[i] = A[i] - B[i] - U */
4378 *tmpc = *tmpa++ - *tmpb++ - u;
4380 /* U = carry bit of T[i]
4381 * Note this saves performing an AND operation since
4382 * if a carry does occur it will propagate all the way to the
4383 * MSB. As a result a single shift is enough to get the carry
4385 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4387 /* Clear carry from T[i] */
4388 *tmpc++ &= MP_MASK;
4391 /* now copy higher words if any, e.g. if A has more digits than B */
4392 for (; i < max; i++) {
4393 /* T[i] = A[i] - U */
4394 *tmpc = *tmpa++ - u;
4396 /* U = carry bit of T[i] */
4397 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4399 /* Clear carry from T[i] */
4400 *tmpc++ &= MP_MASK;
4403 /* clear digits above used (since we may not have grown result above) */
4404 for (i = c->used; i < olduse; i++) {
4405 *tmpc++ = 0;
4409 mp_clamp (c);
4410 return MP_OKAY;
4413 /* Known optimal configurations
4415 CPU /Compiler /MUL CUTOFF/SQR CUTOFF
4416 -------------------------------------------------------------
4417 Intel P4 Northwood /GCC v3.4.1 / 88/ 128/LTM 0.32 ;-)
4421 int KARATSUBA_MUL_CUTOFF = 88, /* Min. number of digits before Karatsuba multiplication is used. */
4422 KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */