Merge commit 'dfc115332c94a2f62058ac7f2bce7631fbd20b3d'
[unleashed/tickless.git] / lib / libcrypto / bn / bn_asm.c
blob993fbb3dc57c4513da0bdd4c0ca82ba802181a1d
1 /* $OpenBSD: bn_asm.c,v 1.15 2017/05/02 03:59:44 deraadt Exp $ */
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
#ifndef BN_DEBUG
# undef NDEBUG /* avoid conflicting definitions */
# define NDEBUG
#endif

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <openssl/opensslconf.h>

#include "bn_lcl.h"
71 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
73 BN_ULONG
74 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
76 BN_ULONG c1 = 0;
78 assert(num >= 0);
79 if (num <= 0)
80 return (c1);
82 #ifndef OPENSSL_SMALL_FOOTPRINT
83 while (num & ~3) {
84 mul_add(rp[0], ap[0], w, c1);
85 mul_add(rp[1], ap[1], w, c1);
86 mul_add(rp[2], ap[2], w, c1);
87 mul_add(rp[3], ap[3], w, c1);
88 ap += 4;
89 rp += 4;
90 num -= 4;
92 #endif
93 while (num) {
94 mul_add(rp[0], ap[0], w, c1);
95 ap++;
96 rp++;
97 num--;
100 return (c1);
103 BN_ULONG
104 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
106 BN_ULONG c1 = 0;
108 assert(num >= 0);
109 if (num <= 0)
110 return (c1);
112 #ifndef OPENSSL_SMALL_FOOTPRINT
113 while (num & ~3) {
114 mul(rp[0], ap[0], w, c1);
115 mul(rp[1], ap[1], w, c1);
116 mul(rp[2], ap[2], w, c1);
117 mul(rp[3], ap[3], w, c1);
118 ap += 4;
119 rp += 4;
120 num -= 4;
122 #endif
123 while (num) {
124 mul(rp[0], ap[0], w, c1);
125 ap++;
126 rp++;
127 num--;
129 return (c1);
132 void
133 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
135 assert(n >= 0);
136 if (n <= 0)
137 return;
139 #ifndef OPENSSL_SMALL_FOOTPRINT
140 while (n & ~3) {
141 sqr(r[0], r[1], a[0]);
142 sqr(r[2], r[3], a[1]);
143 sqr(r[4], r[5], a[2]);
144 sqr(r[6], r[7], a[3]);
145 a += 4;
146 r += 8;
147 n -= 4;
149 #endif
150 while (n) {
151 sqr(r[0], r[1], a[0]);
152 a++;
153 r += 2;
154 n--;
158 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
160 BN_ULONG
161 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
163 BN_ULONG c = 0;
164 BN_ULONG bl, bh;
166 assert(num >= 0);
167 if (num <= 0)
168 return ((BN_ULONG)0);
170 bl = LBITS(w);
171 bh = HBITS(w);
173 #ifndef OPENSSL_SMALL_FOOTPRINT
174 while (num & ~3) {
175 mul_add(rp[0], ap[0], bl, bh, c);
176 mul_add(rp[1], ap[1], bl, bh, c);
177 mul_add(rp[2], ap[2], bl, bh, c);
178 mul_add(rp[3], ap[3], bl, bh, c);
179 ap += 4;
180 rp += 4;
181 num -= 4;
183 #endif
184 while (num) {
185 mul_add(rp[0], ap[0], bl, bh, c);
186 ap++;
187 rp++;
188 num--;
190 return (c);
193 BN_ULONG
194 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
196 BN_ULONG carry = 0;
197 BN_ULONG bl, bh;
199 assert(num >= 0);
200 if (num <= 0)
201 return ((BN_ULONG)0);
203 bl = LBITS(w);
204 bh = HBITS(w);
206 #ifndef OPENSSL_SMALL_FOOTPRINT
207 while (num & ~3) {
208 mul(rp[0], ap[0], bl, bh, carry);
209 mul(rp[1], ap[1], bl, bh, carry);
210 mul(rp[2], ap[2], bl, bh, carry);
211 mul(rp[3], ap[3], bl, bh, carry);
212 ap += 4;
213 rp += 4;
214 num -= 4;
216 #endif
217 while (num) {
218 mul(rp[0], ap[0], bl, bh, carry);
219 ap++;
220 rp++;
221 num--;
223 return (carry);
226 void
227 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
229 assert(n >= 0);
230 if (n <= 0)
231 return;
233 #ifndef OPENSSL_SMALL_FOOTPRINT
234 while (n & ~3) {
235 sqr64(r[0], r[1], a[0]);
236 sqr64(r[2], r[3], a[1]);
237 sqr64(r[4], r[5], a[2]);
238 sqr64(r[6], r[7], a[3]);
239 a += 4;
240 r += 8;
241 n -= 4;
243 #endif
244 while (n) {
245 sqr64(r[0], r[1], a[0]);
246 a++;
247 r += 2;
248 n--;
252 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
254 #if defined(BN_LLONG) && defined(BN_DIV2W)
256 BN_ULONG
257 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
259 return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d));
262 #else
264 /* Divide h,l by d and return the result. */
265 /* I need to test this some more :-( */
266 BN_ULONG
267 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
269 BN_ULONG dh, dl, q,ret = 0, th, tl, t;
270 int i, count = 2;
272 if (d == 0)
273 return (BN_MASK2);
275 i = BN_num_bits_word(d);
276 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
278 i = BN_BITS2 - i;
279 if (h >= d)
280 h -= d;
282 if (i) {
283 d <<= i;
284 h = (h << i) | (l >> (BN_BITS2 - i));
285 l <<= i;
287 dh = (d & BN_MASK2h) >> BN_BITS4;
288 dl = (d & BN_MASK2l);
289 for (;;) {
290 if ((h >> BN_BITS4) == dh)
291 q = BN_MASK2l;
292 else
293 q = h / dh;
295 th = q * dh;
296 tl = dl * q;
297 for (;;) {
298 t = h - th;
299 if ((t & BN_MASK2h) ||
300 ((tl) <= (
301 (t << BN_BITS4) |
302 ((l & BN_MASK2h) >> BN_BITS4))))
303 break;
304 q--;
305 th -= dh;
306 tl -= dl;
308 t = (tl >> BN_BITS4);
309 tl = (tl << BN_BITS4) & BN_MASK2h;
310 th += t;
312 if (l < tl)
313 th++;
314 l -= tl;
315 if (h < th) {
316 h += d;
317 q--;
319 h -= th;
321 if (--count == 0)
322 break;
324 ret = q << BN_BITS4;
325 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
326 l = (l & BN_MASK2l) << BN_BITS4;
328 ret |= q;
329 return (ret);
331 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
333 #ifdef BN_LLONG
334 BN_ULONG
335 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
337 BN_ULLONG ll = 0;
339 assert(n >= 0);
340 if (n <= 0)
341 return ((BN_ULONG)0);
343 #ifndef OPENSSL_SMALL_FOOTPRINT
344 while (n & ~3) {
345 ll += (BN_ULLONG)a[0] + b[0];
346 r[0] = (BN_ULONG)ll & BN_MASK2;
347 ll >>= BN_BITS2;
348 ll += (BN_ULLONG)a[1] + b[1];
349 r[1] = (BN_ULONG)ll & BN_MASK2;
350 ll >>= BN_BITS2;
351 ll += (BN_ULLONG)a[2] + b[2];
352 r[2] = (BN_ULONG)ll & BN_MASK2;
353 ll >>= BN_BITS2;
354 ll += (BN_ULLONG)a[3] + b[3];
355 r[3] = (BN_ULONG)ll & BN_MASK2;
356 ll >>= BN_BITS2;
357 a += 4;
358 b += 4;
359 r += 4;
360 n -= 4;
362 #endif
363 while (n) {
364 ll += (BN_ULLONG)a[0] + b[0];
365 r[0] = (BN_ULONG)ll & BN_MASK2;
366 ll >>= BN_BITS2;
367 a++;
368 b++;
369 r++;
370 n--;
372 return ((BN_ULONG)ll);
374 #else /* !BN_LLONG */
375 BN_ULONG
376 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
378 BN_ULONG c, l, t;
380 assert(n >= 0);
381 if (n <= 0)
382 return ((BN_ULONG)0);
384 c = 0;
385 #ifndef OPENSSL_SMALL_FOOTPRINT
386 while (n & ~3) {
387 t = a[0];
388 t = (t + c) & BN_MASK2;
389 c = (t < c);
390 l = (t + b[0]) & BN_MASK2;
391 c += (l < t);
392 r[0] = l;
393 t = a[1];
394 t = (t + c) & BN_MASK2;
395 c = (t < c);
396 l = (t + b[1]) & BN_MASK2;
397 c += (l < t);
398 r[1] = l;
399 t = a[2];
400 t = (t + c) & BN_MASK2;
401 c = (t < c);
402 l = (t + b[2]) & BN_MASK2;
403 c += (l < t);
404 r[2] = l;
405 t = a[3];
406 t = (t + c) & BN_MASK2;
407 c = (t < c);
408 l = (t + b[3]) & BN_MASK2;
409 c += (l < t);
410 r[3] = l;
411 a += 4;
412 b += 4;
413 r += 4;
414 n -= 4;
416 #endif
417 while (n) {
418 t = a[0];
419 t = (t + c) & BN_MASK2;
420 c = (t < c);
421 l = (t + b[0]) & BN_MASK2;
422 c += (l < t);
423 r[0] = l;
424 a++;
425 b++;
426 r++;
427 n--;
429 return ((BN_ULONG)c);
431 #endif /* !BN_LLONG */
433 BN_ULONG
434 bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
436 BN_ULONG t1, t2;
437 int c = 0;
439 assert(n >= 0);
440 if (n <= 0)
441 return ((BN_ULONG)0);
443 #ifndef OPENSSL_SMALL_FOOTPRINT
444 while (n&~3) {
445 t1 = a[0];
446 t2 = b[0];
447 r[0] = (t1 - t2 - c) & BN_MASK2;
448 if (t1 != t2)
449 c = (t1 < t2);
450 t1 = a[1];
451 t2 = b[1];
452 r[1] = (t1 - t2 - c) & BN_MASK2;
453 if (t1 != t2)
454 c = (t1 < t2);
455 t1 = a[2];
456 t2 = b[2];
457 r[2] = (t1 - t2 - c) & BN_MASK2;
458 if (t1 != t2)
459 c = (t1 < t2);
460 t1 = a[3];
461 t2 = b[3];
462 r[3] = (t1 - t2 - c) & BN_MASK2;
463 if (t1 != t2)
464 c = (t1 < t2);
465 a += 4;
466 b += 4;
467 r += 4;
468 n -= 4;
470 #endif
471 while (n) {
472 t1 = a[0];
473 t2 = b[0];
474 r[0] = (t1 - t2 - c) & BN_MASK2;
475 if (t1 != t2)
476 c = (t1 < t2);
477 a++;
478 b++;
479 r++;
480 n--;
482 return (c);
485 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
#undef bn_mul_comba8
#undef bn_mul_comba4
#undef bn_sqr_comba8
#undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

#ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG hi;			\
	BN_ULLONG t = (BN_ULLONG)(a)*(b); \
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);		\
	hi = (BN_ULONG)Hw(t);		\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG hi;			\
	BN_ULLONG t = (BN_ULLONG)(a)*(b); \
	BN_ULLONG tt = t+c0;	/* no carry */	\
	c0 = (BN_ULONG)Lw(tt);		\
	hi = (BN_ULONG)Hw(tt);		\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);		\
	hi = (BN_ULONG)Hw(t);		\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG hi;			\
	BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);		\
	hi = (BN_ULONG)Hw(t);		\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b);	\
	BN_ULONG lo, hi;		\
	BN_UMULT_LOHI(lo,hi,ta,tb);	\
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b);	\
	BN_ULONG lo, hi, tt;		\
	BN_UMULT_LOHI(lo,hi,ta,tb);	\
	c0 += lo; tt = hi+((c0<lo)?1:0); \
	c1 += tt; c2 += (c1<tt)?1:0;	\
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG ta = (a)[i];		\
	BN_ULONG lo, hi;		\
	BN_UMULT_LOHI(lo,hi,ta,ta);	\
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b);	\
	BN_ULONG lo = ta * tb;		\
	BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b), tt; \
	BN_ULONG lo = ta * tb;		\
	BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \
	c0 += lo; tt = hi + ((c0<lo)?1:0); \
	c1 += tt; c2 += (c1<tt)?1:0;	\
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG ta = (a)[i];		\
	BN_ULONG lo = ta * ta;		\
	BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \
	c0 += lo; hi += (c0<lo)?1:0;	\
	c1 += hi; c2 += (c1<hi)?1:0;	\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
	mul64(lo,hi,bl,bh);		\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG tt;			\
	BN_ULONG lo = LBITS(a), hi = HBITS(a); \
	BN_ULONG bl = LBITS(b), bh = HBITS(b); \
	mul64(lo,hi,bl,bh);		\
	tt = hi;			\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
	c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG lo, hi;		\
	sqr64(lo,hi,(a)[i]);		\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* !BN_LLONG */
640 void
641 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
643 BN_ULONG c1, c2, c3;
645 c1 = 0;
646 c2 = 0;
647 c3 = 0;
648 mul_add_c(a[0], b[0], c1, c2, c3);
649 r[0] = c1;
650 c1 = 0;
651 mul_add_c(a[0], b[1], c2, c3, c1);
652 mul_add_c(a[1], b[0], c2, c3, c1);
653 r[1] = c2;
654 c2 = 0;
655 mul_add_c(a[2], b[0], c3, c1, c2);
656 mul_add_c(a[1], b[1], c3, c1, c2);
657 mul_add_c(a[0], b[2], c3, c1, c2);
658 r[2] = c3;
659 c3 = 0;
660 mul_add_c(a[0], b[3], c1, c2, c3);
661 mul_add_c(a[1], b[2], c1, c2, c3);
662 mul_add_c(a[2], b[1], c1, c2, c3);
663 mul_add_c(a[3], b[0], c1, c2, c3);
664 r[3] = c1;
665 c1 = 0;
666 mul_add_c(a[4], b[0], c2, c3, c1);
667 mul_add_c(a[3], b[1], c2, c3, c1);
668 mul_add_c(a[2], b[2], c2, c3, c1);
669 mul_add_c(a[1], b[3], c2, c3, c1);
670 mul_add_c(a[0], b[4], c2, c3, c1);
671 r[4] = c2;
672 c2 = 0;
673 mul_add_c(a[0], b[5], c3, c1, c2);
674 mul_add_c(a[1], b[4], c3, c1, c2);
675 mul_add_c(a[2], b[3], c3, c1, c2);
676 mul_add_c(a[3], b[2], c3, c1, c2);
677 mul_add_c(a[4], b[1], c3, c1, c2);
678 mul_add_c(a[5], b[0], c3, c1, c2);
679 r[5] = c3;
680 c3 = 0;
681 mul_add_c(a[6], b[0], c1, c2, c3);
682 mul_add_c(a[5], b[1], c1, c2, c3);
683 mul_add_c(a[4], b[2], c1, c2, c3);
684 mul_add_c(a[3], b[3], c1, c2, c3);
685 mul_add_c(a[2], b[4], c1, c2, c3);
686 mul_add_c(a[1], b[5], c1, c2, c3);
687 mul_add_c(a[0], b[6], c1, c2, c3);
688 r[6] = c1;
689 c1 = 0;
690 mul_add_c(a[0], b[7], c2, c3, c1);
691 mul_add_c(a[1], b[6], c2, c3, c1);
692 mul_add_c(a[2], b[5], c2, c3, c1);
693 mul_add_c(a[3], b[4], c2, c3, c1);
694 mul_add_c(a[4], b[3], c2, c3, c1);
695 mul_add_c(a[5], b[2], c2, c3, c1);
696 mul_add_c(a[6], b[1], c2, c3, c1);
697 mul_add_c(a[7], b[0], c2, c3, c1);
698 r[7] = c2;
699 c2 = 0;
700 mul_add_c(a[7], b[1], c3, c1, c2);
701 mul_add_c(a[6], b[2], c3, c1, c2);
702 mul_add_c(a[5], b[3], c3, c1, c2);
703 mul_add_c(a[4], b[4], c3, c1, c2);
704 mul_add_c(a[3], b[5], c3, c1, c2);
705 mul_add_c(a[2], b[6], c3, c1, c2);
706 mul_add_c(a[1], b[7], c3, c1, c2);
707 r[8] = c3;
708 c3 = 0;
709 mul_add_c(a[2], b[7], c1, c2, c3);
710 mul_add_c(a[3], b[6], c1, c2, c3);
711 mul_add_c(a[4], b[5], c1, c2, c3);
712 mul_add_c(a[5], b[4], c1, c2, c3);
713 mul_add_c(a[6], b[3], c1, c2, c3);
714 mul_add_c(a[7], b[2], c1, c2, c3);
715 r[9] = c1;
716 c1 = 0;
717 mul_add_c(a[7], b[3], c2, c3, c1);
718 mul_add_c(a[6], b[4], c2, c3, c1);
719 mul_add_c(a[5], b[5], c2, c3, c1);
720 mul_add_c(a[4], b[6], c2, c3, c1);
721 mul_add_c(a[3], b[7], c2, c3, c1);
722 r[10] = c2;
723 c2 = 0;
724 mul_add_c(a[4], b[7], c3, c1, c2);
725 mul_add_c(a[5], b[6], c3, c1, c2);
726 mul_add_c(a[6], b[5], c3, c1, c2);
727 mul_add_c(a[7], b[4], c3, c1, c2);
728 r[11] = c3;
729 c3 = 0;
730 mul_add_c(a[7], b[5], c1, c2, c3);
731 mul_add_c(a[6], b[6], c1, c2, c3);
732 mul_add_c(a[5], b[7], c1, c2, c3);
733 r[12] = c1;
734 c1 = 0;
735 mul_add_c(a[6], b[7], c2, c3, c1);
736 mul_add_c(a[7], b[6], c2, c3, c1);
737 r[13] = c2;
738 c2 = 0;
739 mul_add_c(a[7], b[7], c3, c1, c2);
740 r[14] = c3;
741 r[15] = c1;
744 void
745 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
747 BN_ULONG c1, c2, c3;
749 c1 = 0;
750 c2 = 0;
751 c3 = 0;
752 mul_add_c(a[0], b[0], c1, c2, c3);
753 r[0] = c1;
754 c1 = 0;
755 mul_add_c(a[0], b[1], c2, c3, c1);
756 mul_add_c(a[1], b[0], c2, c3, c1);
757 r[1] = c2;
758 c2 = 0;
759 mul_add_c(a[2], b[0], c3, c1, c2);
760 mul_add_c(a[1], b[1], c3, c1, c2);
761 mul_add_c(a[0], b[2], c3, c1, c2);
762 r[2] = c3;
763 c3 = 0;
764 mul_add_c(a[0], b[3], c1, c2, c3);
765 mul_add_c(a[1], b[2], c1, c2, c3);
766 mul_add_c(a[2], b[1], c1, c2, c3);
767 mul_add_c(a[3], b[0], c1, c2, c3);
768 r[3] = c1;
769 c1 = 0;
770 mul_add_c(a[3], b[1], c2, c3, c1);
771 mul_add_c(a[2], b[2], c2, c3, c1);
772 mul_add_c(a[1], b[3], c2, c3, c1);
773 r[4] = c2;
774 c2 = 0;
775 mul_add_c(a[2], b[3], c3, c1, c2);
776 mul_add_c(a[3], b[2], c3, c1, c2);
777 r[5] = c3;
778 c3 = 0;
779 mul_add_c(a[3], b[3], c1, c2, c3);
780 r[6] = c1;
781 r[7] = c2;
784 void
785 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
787 BN_ULONG c1, c2, c3;
789 c1 = 0;
790 c2 = 0;
791 c3 = 0;
792 sqr_add_c(a, 0, c1, c2, c3);
793 r[0] = c1;
794 c1 = 0;
795 sqr_add_c2(a, 1, 0, c2, c3, c1);
796 r[1] = c2;
797 c2 = 0;
798 sqr_add_c(a, 1, c3, c1, c2);
799 sqr_add_c2(a, 2, 0, c3, c1, c2);
800 r[2] = c3;
801 c3 = 0;
802 sqr_add_c2(a, 3, 0, c1, c2, c3);
803 sqr_add_c2(a, 2, 1, c1, c2, c3);
804 r[3] = c1;
805 c1 = 0;
806 sqr_add_c(a, 2, c2, c3, c1);
807 sqr_add_c2(a, 3, 1, c2, c3, c1);
808 sqr_add_c2(a, 4, 0, c2, c3, c1);
809 r[4] = c2;
810 c2 = 0;
811 sqr_add_c2(a, 5, 0, c3, c1, c2);
812 sqr_add_c2(a, 4, 1, c3, c1, c2);
813 sqr_add_c2(a, 3, 2, c3, c1, c2);
814 r[5] = c3;
815 c3 = 0;
816 sqr_add_c(a, 3, c1, c2, c3);
817 sqr_add_c2(a, 4, 2, c1, c2, c3);
818 sqr_add_c2(a, 5, 1, c1, c2, c3);
819 sqr_add_c2(a, 6, 0, c1, c2, c3);
820 r[6] = c1;
821 c1 = 0;
822 sqr_add_c2(a, 7, 0, c2, c3, c1);
823 sqr_add_c2(a, 6, 1, c2, c3, c1);
824 sqr_add_c2(a, 5, 2, c2, c3, c1);
825 sqr_add_c2(a, 4, 3, c2, c3, c1);
826 r[7] = c2;
827 c2 = 0;
828 sqr_add_c(a, 4, c3, c1, c2);
829 sqr_add_c2(a, 5, 3, c3, c1, c2);
830 sqr_add_c2(a, 6, 2, c3, c1, c2);
831 sqr_add_c2(a, 7, 1, c3, c1, c2);
832 r[8] = c3;
833 c3 = 0;
834 sqr_add_c2(a, 7, 2, c1, c2, c3);
835 sqr_add_c2(a, 6, 3, c1, c2, c3);
836 sqr_add_c2(a, 5, 4, c1, c2, c3);
837 r[9] = c1;
838 c1 = 0;
839 sqr_add_c(a, 5, c2, c3, c1);
840 sqr_add_c2(a, 6, 4, c2, c3, c1);
841 sqr_add_c2(a, 7, 3, c2, c3, c1);
842 r[10] = c2;
843 c2 = 0;
844 sqr_add_c2(a, 7, 4, c3, c1, c2);
845 sqr_add_c2(a, 6, 5, c3, c1, c2);
846 r[11] = c3;
847 c3 = 0;
848 sqr_add_c(a, 6, c1, c2, c3);
849 sqr_add_c2(a, 7, 5, c1, c2, c3);
850 r[12] = c1;
851 c1 = 0;
852 sqr_add_c2(a, 7, 6, c2, c3, c1);
853 r[13] = c2;
854 c2 = 0;
855 sqr_add_c(a, 7, c3, c1, c2);
856 r[14] = c3;
857 r[15] = c1;
860 void
861 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
863 BN_ULONG c1, c2, c3;
865 c1 = 0;
866 c2 = 0;
867 c3 = 0;
868 sqr_add_c(a, 0, c1, c2, c3);
869 r[0] = c1;
870 c1 = 0;
871 sqr_add_c2(a, 1, 0, c2, c3, c1);
872 r[1] = c2;
873 c2 = 0;
874 sqr_add_c(a, 1, c3, c1, c2);
875 sqr_add_c2(a, 2, 0, c3, c1, c2);
876 r[2] = c3;
877 c3 = 0;
878 sqr_add_c2(a, 3, 0, c1, c2, c3);
879 sqr_add_c2(a, 2, 1, c1, c2, c3);
880 r[3] = c1;
881 c1 = 0;
882 sqr_add_c(a, 2, c2, c3, c1);
883 sqr_add_c2(a, 3, 1, c2, c3, c1);
884 r[4] = c2;
885 c2 = 0;
886 sqr_add_c2(a, 3, 2, c3, c1, c2);
887 r[5] = c3;
888 c3 = 0;
889 sqr_add_c(a, 3, c1, c2, c3);
890 r[6] = c1;
891 r[7] = c2;
894 #ifdef OPENSSL_NO_ASM
895 #ifdef OPENSSL_BN_ASM_MONT
897 * This is essentially reference implementation, which may or may not
898 * result in performance improvement. E.g. on IA-32 this routine was
899 * observed to give 40% faster rsa1024 private key operations and 10%
900 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
901 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
902 * reference implementation, one to be used as starting point for
903 * platform-specific assembler. Mentioned numbers apply to compiler
904 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
905 * can vary not only from platform to platform, but even for compiler
906 * versions. Assembler vs. assembler improvement coefficients can
907 * [and are known to] differ and are to be documented elsewhere.
910 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num)
912 BN_ULONG c0, c1, ml, *tp, n0;
913 #ifdef mul64
914 BN_ULONG mh;
915 #endif
916 int i = 0, j;
918 #if 0 /* template for platform-specific implementation */
919 if (ap == bp)
920 return bn_sqr_mont(rp, ap, np, n0p, num);
921 #endif
922 tp = reallocarray(NULL, num + 2, sizeof(BN_ULONG));
923 if (tp == NULL)
924 return 0;
926 n0 = *n0p;
928 c0 = 0;
929 ml = bp[0];
930 #ifdef mul64
931 mh = HBITS(ml);
932 ml = LBITS(ml);
933 for (j = 0; j < num; ++j)
934 mul(tp[j], ap[j], ml, mh, c0);
935 #else
936 for (j = 0; j < num; ++j)
937 mul(tp[j], ap[j], ml, c0);
938 #endif
940 tp[num] = c0;
941 tp[num + 1] = 0;
942 goto enter;
944 for (i = 0; i < num; i++) {
945 c0 = 0;
946 ml = bp[i];
947 #ifdef mul64
948 mh = HBITS(ml);
949 ml = LBITS(ml);
950 for (j = 0; j < num; ++j)
951 mul_add(tp[j], ap[j], ml, mh, c0);
952 #else
953 for (j = 0; j < num; ++j)
954 mul_add(tp[j], ap[j], ml, c0);
955 #endif
956 c1 = (tp[num] + c0) & BN_MASK2;
957 tp[num] = c1;
958 tp[num + 1] = (c1 < c0 ? 1 : 0);
959 enter:
960 c1 = tp[0];
961 ml = (c1 * n0) & BN_MASK2;
962 c0 = 0;
963 #ifdef mul64
964 mh = HBITS(ml);
965 ml = LBITS(ml);
966 mul_add(c1, np[0], ml, mh, c0);
967 #else
968 mul_add(c1, ml, np[0], c0);
969 #endif
970 for (j = 1; j < num; j++) {
971 c1 = tp[j];
972 #ifdef mul64
973 mul_add(c1, np[j], ml, mh, c0);
974 #else
975 mul_add(c1, ml, np[j], c0);
976 #endif
977 tp[j - 1] = c1 & BN_MASK2;
979 c1 = (tp[num] + c0) & BN_MASK2;
980 tp[num - 1] = c1;
981 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
984 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
985 c0 = bn_sub_words(rp, tp, np, num);
986 if (tp[num] != 0 || c0 == 0) {
987 goto out;
990 memcpy(rp, tp, num * sizeof(BN_ULONG));
991 out:
992 freezero(tp, (num + 2) * sizeof(BN_ULONG));
993 return 1;
995 #else
997 * Return value of 0 indicates that multiplication/convolution was not
998 * performed to signal the caller to fall down to alternative/original
999 * code-path.
1001 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
1002 { return 0;
1004 #endif /* OPENSSL_BN_ASM_MONT */
1005 #endif
1007 #else /* !BN_MUL_COMBA */
1009 /* hmm... is it faster just to do a multiply? */
1010 #undef bn_sqr_comba4
1011 void
1012 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1014 BN_ULONG t[8];
1015 bn_sqr_normal(r, a, 4, t);
1018 #undef bn_sqr_comba8
1019 void
1020 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1022 BN_ULONG t[16];
1023 bn_sqr_normal(r, a, 8, t);
1026 void
1027 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1029 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1030 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1031 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1032 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1035 void
1036 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1038 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1039 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1040 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1041 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1042 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1043 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1044 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1045 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1048 #ifdef OPENSSL_NO_ASM
1049 #ifdef OPENSSL_BN_ASM_MONT
1051 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1052 const BN_ULONG *np, const BN_ULONG *n0p, int num)
1054 BN_ULONG c0, c1, *tp, n0 = *n0p;
1055 int i = 0, j;
1057 tp = calloc(NULL, num + 2, sizeof(BN_ULONG));
1058 if (tp == NULL)
1059 return 0;
1061 for (i = 0; i < num; i++) {
1062 c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1063 c1 = (tp[num] + c0) & BN_MASK2;
1064 tp[num] = c1;
1065 tp[num + 1] = (c1 < c0 ? 1 : 0);
1067 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1068 c1 = (tp[num] + c0) & BN_MASK2;
1069 tp[num] = c1;
1070 tp[num + 1] += (c1 < c0 ? 1 : 0);
1071 for (j = 0; j <= num; j++)
1072 tp[j] = tp[j + 1];
1075 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1076 c0 = bn_sub_words(rp, tp, np, num);
1077 if (tp[num] != 0 || c0 == 0) {
1078 goto out;
1081 memcpy(rp, tp, num * sizeof(BN_ULONG));
1082 out:
1083 freezero(tp, (num + 2) * sizeof(BN_ULONG));
1084 return 1;
1086 #else
1088 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1089 const BN_ULONG *np, const BN_ULONG *n0, int num)
1091 return 0;
1093 #endif /* OPENSSL_BN_ASM_MONT */
1094 #endif
1096 #endif /* !BN_MUL_COMBA */