/* $OpenBSD: bn_asm.c,v 1.15 2017/05/02 03:59:44 deraadt Exp $ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */

#ifndef BN_DEBUG
# undef NDEBUG /* avoid conflicting definitions */
# define NDEBUG
#endif

#include <assert.h>
#include <stdio.h>

#include <openssl/opensslconf.h>

#include "bn_lcl.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

BN_ULONG
bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
	BN_ULONG c1 = 0;

	assert(num >= 0);
	if (num <= 0)
		return (c1);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (num & ~3) {
		mul_add(rp[0], ap[0], w, c1);
		mul_add(rp[1], ap[1], w, c1);
		mul_add(rp[2], ap[2], w, c1);
		mul_add(rp[3], ap[3], w, c1);
		ap += 4;
		rp += 4;
		num -= 4;
	}
#endif
	while (num) {
		mul_add(rp[0], ap[0], w, c1);
		ap++;
		rp++;
		num--;
	}

	return (c1);
}

BN_ULONG
bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
	BN_ULONG c1 = 0;

	assert(num >= 0);
	if (num <= 0)
		return (c1);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (num & ~3) {
		mul(rp[0], ap[0], w, c1);
		mul(rp[1], ap[1], w, c1);
		mul(rp[2], ap[2], w, c1);
		mul(rp[3], ap[3], w, c1);
		ap += 4;
		rp += 4;
		num -= 4;
	}
#endif
	while (num) {
		mul(rp[0], ap[0], w, c1);
		ap++;
		rp++;
		num--;
	}
	return (c1);
}

void
bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
	assert(n >= 0);
	if (n <= 0)
		return;

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (n & ~3) {
		sqr(r[0], r[1], a[0]);
		sqr(r[2], r[3], a[1]);
		sqr(r[4], r[5], a[2]);
		sqr(r[6], r[7], a[3]);
		a += 4;
		r += 8;
		n -= 4;
	}
#endif
	while (n) {
		sqr(r[0], r[1], a[0]);
		a++;
		r += 2;
		n--;
	}
}

#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

BN_ULONG
bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
	BN_ULONG c = 0;
	BN_ULONG bl, bh;

	assert(num >= 0);
	if (num <= 0)
		return ((BN_ULONG)0);

	bl = LBITS(w);
	bh = HBITS(w);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (num & ~3) {
		mul_add(rp[0], ap[0], bl, bh, c);
		mul_add(rp[1], ap[1], bl, bh, c);
		mul_add(rp[2], ap[2], bl, bh, c);
		mul_add(rp[3], ap[3], bl, bh, c);
		ap += 4;
		rp += 4;
		num -= 4;
	}
#endif
	while (num) {
		mul_add(rp[0], ap[0], bl, bh, c);
		ap++;
		rp++;
		num--;
	}
	return (c);
}

BN_ULONG
bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
	BN_ULONG carry = 0;
	BN_ULONG bl, bh;

	assert(num >= 0);
	if (num <= 0)
		return ((BN_ULONG)0);

	bl = LBITS(w);
	bh = HBITS(w);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (num & ~3) {
		mul(rp[0], ap[0], bl, bh, carry);
		mul(rp[1], ap[1], bl, bh, carry);
		mul(rp[2], ap[2], bl, bh, carry);
		mul(rp[3], ap[3], bl, bh, carry);
		ap += 4;
		rp += 4;
		num -= 4;
	}
#endif
	while (num) {
		mul(rp[0], ap[0], bl, bh, carry);
		ap++;
		rp++;
		num--;
	}
	return (carry);
}

void
bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
	assert(n >= 0);
	if (n <= 0)
		return;

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (n & ~3) {
		sqr64(r[0], r[1], a[0]);
		sqr64(r[2], r[3], a[1]);
		sqr64(r[4], r[5], a[2]);
		sqr64(r[6], r[7], a[3]);
		a += 4;
		r += 8;
		n -= 4;
	}
#endif
	while (n) {
		sqr64(r[0], r[1], a[0]);
		a++;
		r += 2;
		n--;
	}
}

#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

BN_ULONG
bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
	return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d));
}

#else

/* Divide h,l by d and return the result. */
/* I need to test this some more :-( */
BN_ULONG
bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
	int i, count = 2;

	if (d == 0)
		return (BN_MASK2);

	i = BN_num_bits_word(d);
	assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

	i = BN_BITS2 - i;
	if (h >= d)
		h -= d;

	if (i) {
		d <<= i;
		h = (h << i) | (l >> (BN_BITS2 - i));
		l <<= i;
	}
	dh = (d & BN_MASK2h) >> BN_BITS4;
	dl = (d & BN_MASK2l);
	for (;;) {
		if ((h >> BN_BITS4) == dh)
			q = BN_MASK2l;
		else
			q = h / dh;

		th = q * dh;
		tl = dl * q;
		for (;;) {
			t = h - th;
			if ((t & BN_MASK2h) ||
			    ((tl) <= (
			    (t << BN_BITS4) |
			    ((l & BN_MASK2h) >> BN_BITS4))))
				break;
			q--;
			th -= dh;
			tl -= dl;
		}
		t = (tl >> BN_BITS4);
		tl = (tl << BN_BITS4) & BN_MASK2h;
		th += t;

		if (l < tl)
			th++;
		l -= tl;
		if (h < th) {
			h += d;
			q--;
		}
		h -= th;

		if (--count == 0)
			break;

		ret = q << BN_BITS4;
		h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
		l = (l & BN_MASK2l) << BN_BITS4;
	}
	ret |= q;
	return (ret);
}
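
/*
 * Editorial note (not from the original source): both the BN_LLONG
 * one-liner above and this long-division fallback compute
 *
 *	floor(((h << BN_BITS2) | l) / d)
 *
 * under the assumption that the quotient fits in a single BN_ULONG
 * (i.e. h < d).  For example, with BN_BITS2 == 32,
 * bn_div_words(0x00000001, 0x00000000, 0x00000002) yields 0x80000000,
 * since (2^32 + 0) / 2 == 2^31.
 */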
#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */

#ifdef BN_LLONG
BN_ULONG
bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
	BN_ULLONG ll = 0;

	assert(n >= 0);
	if (n <= 0)
		return ((BN_ULONG)0);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (n & ~3) {
		ll += (BN_ULLONG)a[0] + b[0];
		r[0] = (BN_ULONG)ll & BN_MASK2;
		ll >>= BN_BITS2;
		ll += (BN_ULLONG)a[1] + b[1];
		r[1] = (BN_ULONG)ll & BN_MASK2;
		ll >>= BN_BITS2;
		ll += (BN_ULLONG)a[2] + b[2];
		r[2] = (BN_ULONG)ll & BN_MASK2;
		ll >>= BN_BITS2;
		ll += (BN_ULLONG)a[3] + b[3];
		r[3] = (BN_ULONG)ll & BN_MASK2;
		ll >>= BN_BITS2;
		a += 4;
		b += 4;
		r += 4;
		n -= 4;
	}
#endif
	while (n) {
		ll += (BN_ULLONG)a[0] + b[0];
		r[0] = (BN_ULONG)ll & BN_MASK2;
		ll >>= BN_BITS2;
		a++;
		b++;
		r++;
		n--;
	}
	return ((BN_ULONG)ll);
}
#else /* !BN_LLONG */
BN_ULONG
bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
	BN_ULONG c, l, t;

	assert(n >= 0);
	if (n <= 0)
		return ((BN_ULONG)0);

	c = 0;
#ifndef OPENSSL_SMALL_FOOTPRINT
	while (n & ~3) {
		t = a[0];
		t = (t + c) & BN_MASK2;
		c = (t < c);
		l = (t + b[0]) & BN_MASK2;
		c += (l < t);
		r[0] = l;
		t = a[1];
		t = (t + c) & BN_MASK2;
		c = (t < c);
		l = (t + b[1]) & BN_MASK2;
		c += (l < t);
		r[1] = l;
		t = a[2];
		t = (t + c) & BN_MASK2;
		c = (t < c);
		l = (t + b[2]) & BN_MASK2;
		c += (l < t);
		r[2] = l;
		t = a[3];
		t = (t + c) & BN_MASK2;
		c = (t < c);
		l = (t + b[3]) & BN_MASK2;
		c += (l < t);
		r[3] = l;
		a += 4;
		b += 4;
		r += 4;
		n -= 4;
	}
#endif
	while (n) {
		t = a[0];
		t = (t + c) & BN_MASK2;
		c = (t < c);
		l = (t + b[0]) & BN_MASK2;
		c += (l < t);
		r[0] = l;
		a++;
		b++;
		r++;
		n--;
	}
	return ((BN_ULONG)c);
}
#endif /* !BN_LLONG */

BN_ULONG
bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
	BN_ULONG t1, t2;
	int c = 0;

	assert(n >= 0);
	if (n <= 0)
		return ((BN_ULONG)0);

#ifndef OPENSSL_SMALL_FOOTPRINT
	while (n & ~3) {
		t1 = a[0];
		t2 = b[0];
		r[0] = (t1 - t2 - c) & BN_MASK2;
		if (t1 != t2)
			c = (t1 < t2);
		t1 = a[1];
		t2 = b[1];
		r[1] = (t1 - t2 - c) & BN_MASK2;
		if (t1 != t2)
			c = (t1 < t2);
		t1 = a[2];
		t2 = b[2];
		r[2] = (t1 - t2 - c) & BN_MASK2;
		if (t1 != t2)
			c = (t1 < t2);
		t1 = a[3];
		t2 = b[3];
		r[3] = (t1 - t2 - c) & BN_MASK2;
		if (t1 != t2)
			c = (t1 < t2);
		a += 4;
		b += 4;
		r += 4;
		n -= 4;
	}
#endif
	while (n) {
		t1 = a[0];
		t2 = b[0];
		r[0] = (t1 - t2 - c) & BN_MASK2;
		if (t1 != t2)
			c = (t1 < t2);
		a++;
		b++;
		r++;
		n--;
	}
	return (c);
}
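
/*
 * Editorial sketch (not part of the original file): bn_add_words() and
 * bn_sub_words() do schoolbook carry/borrow propagation over n-word
 * arrays stored least-significant word first, returning the final
 * carry/borrow.  With BN_BITS2 == 32 and n == 2, for example:
 *
 *	a = { 0xffffffff, 0x00000000 },  b = { 0x00000001, 0x00000000 }
 *	bn_add_words(r, a, b, 2) -> r = { 0x00000000, 0x00000001 }, returns 0
 *	bn_sub_words(r, a, b, 2) -> r = { 0xfffffffe, 0x00000000 }, returns 0
 *	bn_sub_words(r, b, a, 2) -> wraps modulo 2^64 and returns a borrow of 1
 */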

#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)

#undef bn_mul_comba8
#undef bn_mul_comba4
#undef bn_sqr_comba8
#undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
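
/*
 * Editorial illustration (not from the original source): (c2,c1,c0) is a
 * three-word column accumulator.  Each comba routine below sweeps the
 * product anti-diagonals: every a[i]*b[j] with i+j == k is added in via
 * mul_add_c(), the low word is stored in r[k], and the three words then
 * rotate roles for column k+1.  A hypothetical 2x2 comba product in the
 * same style would be:
 *
 *	c1 = c2 = c3 = 0;
 *	mul_add_c(a[0], b[0], c1, c2, c3);	r[0] = c1; c1 = 0;
 *	mul_add_c(a[0], b[1], c2, c3, c1);
 *	mul_add_c(a[1], b[0], c2, c3, c1);	r[1] = c2; c2 = 0;
 *	mul_add_c(a[1], b[1], c3, c1, c2);	r[2] = c3;
 *						r[3] = c1;
 *
 * which is exactly the pattern bn_mul_comba4()/bn_mul_comba8() follow.
 */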

#ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {		\
	BN_ULONG hi;				\
	BN_ULLONG t = (BN_ULLONG)(a)*(b);	\
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);			\
	hi = (BN_ULONG)Hw(t);			\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {		\
	BN_ULONG hi;				\
	BN_ULLONG t = (BN_ULLONG)(a)*(b);	\
	BN_ULLONG tt = t+c0;	/* no carry */	\
	c0 = (BN_ULONG)Lw(tt);			\
	hi = (BN_ULONG)Hw(tt);			\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);			\
	hi = (BN_ULONG)Hw(t);			\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {		\
	BN_ULONG hi;				\
	BN_ULLONG t = (BN_ULLONG)a[i]*a[i];	\
	t += c0;		/* no carry */	\
	c0 = (BN_ULONG)Lw(t);			\
	hi = (BN_ULONG)Hw(t);			\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi, tt;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; tt = hi+((c0<lo)?1:0);	\
	c1 += tt; c2 += (c1<tt)?1:0;		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {		\
	BN_ULONG ta = (a)[i];			\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,ta);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo = ta * tb;			\
	BN_ULONG hi = BN_UMULT_HIGH(ta,tb);	\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b), tt;	\
	BN_ULONG lo = ta * tb;			\
	BN_ULONG hi = BN_UMULT_HIGH(ta,tb);	\
	c0 += lo; tt = hi + ((c0<lo)?1:0);	\
	c1 += tt; c2 += (c1<tt)?1:0;		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {		\
	BN_ULONG ta = (a)[i];			\
	BN_ULONG lo = ta * ta;			\
	BN_ULONG hi = BN_UMULT_HIGH(ta,ta);	\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#else /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#define mul_add_c(a,b,c0,c1,c2) do {		\
	BN_ULONG lo = LBITS(a), hi = HBITS(a);	\
	BN_ULONG bl = LBITS(b), bh = HBITS(b);	\
	mul64(lo,hi,bl,bh);			\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {		\
	BN_ULONG tt;				\
	BN_ULONG lo = LBITS(a), hi = HBITS(a);	\
	BN_ULONG bl = LBITS(b), bh = HBITS(b);	\
	mul64(lo,hi,bl,bh);			\
	tt = hi;				\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++;	\
	c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++;	\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {		\
	BN_ULONG lo, hi;			\
	sqr64(lo,hi,(a)[i]);			\
	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
	} while(0)

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#endif /* !BN_LLONG */

void
bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
	BN_ULONG c1, c2, c3;

	c1 = 0;
	c2 = 0;
	c3 = 0;
	mul_add_c(a[0], b[0], c1, c2, c3);
	r[0] = c1;
	c1 = 0;
	mul_add_c(a[0], b[1], c2, c3, c1);
	mul_add_c(a[1], b[0], c2, c3, c1);
	r[1] = c2;
	c2 = 0;
	mul_add_c(a[2], b[0], c3, c1, c2);
	mul_add_c(a[1], b[1], c3, c1, c2);
	mul_add_c(a[0], b[2], c3, c1, c2);
	r[2] = c3;
	c3 = 0;
	mul_add_c(a[0], b[3], c1, c2, c3);
	mul_add_c(a[1], b[2], c1, c2, c3);
	mul_add_c(a[2], b[1], c1, c2, c3);
	mul_add_c(a[3], b[0], c1, c2, c3);
	r[3] = c1;
	c1 = 0;
	mul_add_c(a[4], b[0], c2, c3, c1);
	mul_add_c(a[3], b[1], c2, c3, c1);
	mul_add_c(a[2], b[2], c2, c3, c1);
	mul_add_c(a[1], b[3], c2, c3, c1);
	mul_add_c(a[0], b[4], c2, c3, c1);
	r[4] = c2;
	c2 = 0;
	mul_add_c(a[0], b[5], c3, c1, c2);
	mul_add_c(a[1], b[4], c3, c1, c2);
	mul_add_c(a[2], b[3], c3, c1, c2);
	mul_add_c(a[3], b[2], c3, c1, c2);
	mul_add_c(a[4], b[1], c3, c1, c2);
	mul_add_c(a[5], b[0], c3, c1, c2);
	r[5] = c3;
	c3 = 0;
	mul_add_c(a[6], b[0], c1, c2, c3);
	mul_add_c(a[5], b[1], c1, c2, c3);
	mul_add_c(a[4], b[2], c1, c2, c3);
	mul_add_c(a[3], b[3], c1, c2, c3);
	mul_add_c(a[2], b[4], c1, c2, c3);
	mul_add_c(a[1], b[5], c1, c2, c3);
	mul_add_c(a[0], b[6], c1, c2, c3);
	r[6] = c1;
	c1 = 0;
	mul_add_c(a[0], b[7], c2, c3, c1);
	mul_add_c(a[1], b[6], c2, c3, c1);
	mul_add_c(a[2], b[5], c2, c3, c1);
	mul_add_c(a[3], b[4], c2, c3, c1);
	mul_add_c(a[4], b[3], c2, c3, c1);
	mul_add_c(a[5], b[2], c2, c3, c1);
	mul_add_c(a[6], b[1], c2, c3, c1);
	mul_add_c(a[7], b[0], c2, c3, c1);
	r[7] = c2;
	c2 = 0;
	mul_add_c(a[7], b[1], c3, c1, c2);
	mul_add_c(a[6], b[2], c3, c1, c2);
	mul_add_c(a[5], b[3], c3, c1, c2);
	mul_add_c(a[4], b[4], c3, c1, c2);
	mul_add_c(a[3], b[5], c3, c1, c2);
	mul_add_c(a[2], b[6], c3, c1, c2);
	mul_add_c(a[1], b[7], c3, c1, c2);
	r[8] = c3;
	c3 = 0;
	mul_add_c(a[2], b[7], c1, c2, c3);
	mul_add_c(a[3], b[6], c1, c2, c3);
	mul_add_c(a[4], b[5], c1, c2, c3);
	mul_add_c(a[5], b[4], c1, c2, c3);
	mul_add_c(a[6], b[3], c1, c2, c3);
	mul_add_c(a[7], b[2], c1, c2, c3);
	r[9] = c1;
	c1 = 0;
	mul_add_c(a[7], b[3], c2, c3, c1);
	mul_add_c(a[6], b[4], c2, c3, c1);
	mul_add_c(a[5], b[5], c2, c3, c1);
	mul_add_c(a[4], b[6], c2, c3, c1);
	mul_add_c(a[3], b[7], c2, c3, c1);
	r[10] = c2;
	c2 = 0;
	mul_add_c(a[4], b[7], c3, c1, c2);
	mul_add_c(a[5], b[6], c3, c1, c2);
	mul_add_c(a[6], b[5], c3, c1, c2);
	mul_add_c(a[7], b[4], c3, c1, c2);
	r[11] = c3;
	c3 = 0;
	mul_add_c(a[7], b[5], c1, c2, c3);
	mul_add_c(a[6], b[6], c1, c2, c3);
	mul_add_c(a[5], b[7], c1, c2, c3);
	r[12] = c1;
	c1 = 0;
	mul_add_c(a[6], b[7], c2, c3, c1);
	mul_add_c(a[7], b[6], c2, c3, c1);
	r[13] = c2;
	c2 = 0;
	mul_add_c(a[7], b[7], c3, c1, c2);
	r[14] = c3;
	r[15] = c1;
}

void
bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
	BN_ULONG c1, c2, c3;

	c1 = 0;
	c2 = 0;
	c3 = 0;
	mul_add_c(a[0], b[0], c1, c2, c3);
	r[0] = c1;
	c1 = 0;
	mul_add_c(a[0], b[1], c2, c3, c1);
	mul_add_c(a[1], b[0], c2, c3, c1);
	r[1] = c2;
	c2 = 0;
	mul_add_c(a[2], b[0], c3, c1, c2);
	mul_add_c(a[1], b[1], c3, c1, c2);
	mul_add_c(a[0], b[2], c3, c1, c2);
	r[2] = c3;
	c3 = 0;
	mul_add_c(a[0], b[3], c1, c2, c3);
	mul_add_c(a[1], b[2], c1, c2, c3);
	mul_add_c(a[2], b[1], c1, c2, c3);
	mul_add_c(a[3], b[0], c1, c2, c3);
	r[3] = c1;
	c1 = 0;
	mul_add_c(a[3], b[1], c2, c3, c1);
	mul_add_c(a[2], b[2], c2, c3, c1);
	mul_add_c(a[1], b[3], c2, c3, c1);
	r[4] = c2;
	c2 = 0;
	mul_add_c(a[2], b[3], c3, c1, c2);
	mul_add_c(a[3], b[2], c3, c1, c2);
	r[5] = c3;
	c3 = 0;
	mul_add_c(a[3], b[3], c1, c2, c3);
	r[6] = c1;
	r[7] = c2;
}

void
bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
	BN_ULONG c1, c2, c3;

	c1 = 0;
	c2 = 0;
	c3 = 0;
	sqr_add_c(a, 0, c1, c2, c3);
	r[0] = c1;
	c1 = 0;
	sqr_add_c2(a, 1, 0, c2, c3, c1);
	r[1] = c2;
	c2 = 0;
	sqr_add_c(a, 1, c3, c1, c2);
	sqr_add_c2(a, 2, 0, c3, c1, c2);
	r[2] = c3;
	c3 = 0;
	sqr_add_c2(a, 3, 0, c1, c2, c3);
	sqr_add_c2(a, 2, 1, c1, c2, c3);
	r[3] = c1;
	c1 = 0;
	sqr_add_c(a, 2, c2, c3, c1);
	sqr_add_c2(a, 3, 1, c2, c3, c1);
	sqr_add_c2(a, 4, 0, c2, c3, c1);
	r[4] = c2;
	c2 = 0;
	sqr_add_c2(a, 5, 0, c3, c1, c2);
	sqr_add_c2(a, 4, 1, c3, c1, c2);
	sqr_add_c2(a, 3, 2, c3, c1, c2);
	r[5] = c3;
	c3 = 0;
	sqr_add_c(a, 3, c1, c2, c3);
	sqr_add_c2(a, 4, 2, c1, c2, c3);
	sqr_add_c2(a, 5, 1, c1, c2, c3);
	sqr_add_c2(a, 6, 0, c1, c2, c3);
	r[6] = c1;
	c1 = 0;
	sqr_add_c2(a, 7, 0, c2, c3, c1);
	sqr_add_c2(a, 6, 1, c2, c3, c1);
	sqr_add_c2(a, 5, 2, c2, c3, c1);
	sqr_add_c2(a, 4, 3, c2, c3, c1);
	r[7] = c2;
	c2 = 0;
	sqr_add_c(a, 4, c3, c1, c2);
	sqr_add_c2(a, 5, 3, c3, c1, c2);
	sqr_add_c2(a, 6, 2, c3, c1, c2);
	sqr_add_c2(a, 7, 1, c3, c1, c2);
	r[8] = c3;
	c3 = 0;
	sqr_add_c2(a, 7, 2, c1, c2, c3);
	sqr_add_c2(a, 6, 3, c1, c2, c3);
	sqr_add_c2(a, 5, 4, c1, c2, c3);
	r[9] = c1;
	c1 = 0;
	sqr_add_c(a, 5, c2, c3, c1);
	sqr_add_c2(a, 6, 4, c2, c3, c1);
	sqr_add_c2(a, 7, 3, c2, c3, c1);
	r[10] = c2;
	c2 = 0;
	sqr_add_c2(a, 7, 4, c3, c1, c2);
	sqr_add_c2(a, 6, 5, c3, c1, c2);
	r[11] = c3;
	c3 = 0;
	sqr_add_c(a, 6, c1, c2, c3);
	sqr_add_c2(a, 7, 5, c1, c2, c3);
	r[12] = c1;
	c1 = 0;
	sqr_add_c2(a, 7, 6, c2, c3, c1);
	r[13] = c2;
	c2 = 0;
	sqr_add_c(a, 7, c3, c1, c2);
	r[14] = c3;
	r[15] = c1;
}

void
bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
	BN_ULONG c1, c2, c3;

	c1 = 0;
	c2 = 0;
	c3 = 0;
	sqr_add_c(a, 0, c1, c2, c3);
	r[0] = c1;
	c1 = 0;
	sqr_add_c2(a, 1, 0, c2, c3, c1);
	r[1] = c2;
	c2 = 0;
	sqr_add_c(a, 1, c3, c1, c2);
	sqr_add_c2(a, 2, 0, c3, c1, c2);
	r[2] = c3;
	c3 = 0;
	sqr_add_c2(a, 3, 0, c1, c2, c3);
	sqr_add_c2(a, 2, 1, c1, c2, c3);
	r[3] = c1;
	c1 = 0;
	sqr_add_c(a, 2, c2, c3, c1);
	sqr_add_c2(a, 3, 1, c2, c3, c1);
	r[4] = c2;
	c2 = 0;
	sqr_add_c2(a, 3, 2, c3, c1, c2);
	r[5] = c3;
	c3 = 0;
	sqr_add_c(a, 3, c1, c2, c3);
	r[6] = c1;
	r[7] = c2;
}

#ifdef OPENSSL_NO_ASM
#ifdef OPENSSL_BN_ASM_MONT
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
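/*
 * Editorial note (an addition, not part of the original comment): in the
 * usual Montgomery notation this routine computes
 *
 *	rp[] = ap[] * bp[] * R^-1 mod np[],	R = 2^(num * BN_BITS2),
 *
 * where n0 = -np[0]^-1 mod 2^BN_BITS2 (only the first word of n0p is
 * used below).  Each iteration adds the word-sized multiple of np[] that
 * zeroes the low word of the accumulator, then shifts the accumulator
 * down by one word.
 */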
int
bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
	BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
	BN_ULONG mh;
#endif
	int i = 0, j;

#if 0	/* template for platform-specific implementation */
	if (ap == bp)
		return bn_sqr_mont(rp, ap, np, n0p, num);
#endif
	tp = reallocarray(NULL, num + 2, sizeof(BN_ULONG));
	if (tp == NULL)
		return 0;

	n0 = *n0p;

	c0 = 0;
	ml = bp[0];
#ifdef mul64
	mh = HBITS(ml);
	ml = LBITS(ml);
	for (j = 0; j < num; ++j)
		mul(tp[j], ap[j], ml, mh, c0);
#else
	for (j = 0; j < num; ++j)
		mul(tp[j], ap[j], ml, c0);
#endif

	tp[num] = c0;
	tp[num + 1] = 0;
	goto enter;

	for (i = 0; i < num; i++) {
		c0 = 0;
		ml = bp[i];
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		for (j = 0; j < num; ++j)
			mul_add(tp[j], ap[j], ml, mh, c0);
#else
		for (j = 0; j < num; ++j)
			mul_add(tp[j], ap[j], ml, c0);
#endif
		c1 = (tp[num] + c0) & BN_MASK2;
		tp[num] = c1;
		tp[num + 1] = (c1 < c0 ? 1 : 0);
enter:
		c1 = tp[0];
		ml = (c1 * n0) & BN_MASK2;
		c0 = 0;
#ifdef mul64
		mh = HBITS(ml);
		ml = LBITS(ml);
		mul_add(c1, np[0], ml, mh, c0);
#else
		mul_add(c1, ml, np[0], c0);
#endif
		for (j = 1; j < num; j++) {
			c1 = tp[j];
#ifdef mul64
			mul_add(c1, np[j], ml, mh, c0);
#else
			mul_add(c1, ml, np[j], c0);
#endif
			tp[j - 1] = c1 & BN_MASK2;
		}
		c1 = (tp[num] + c0) & BN_MASK2;
		tp[num - 1] = c1;
		tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
	}

	if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
		c0 = bn_sub_words(rp, tp, np, num);
		if (tp[num] != 0 || c0 == 0) {
			goto out;
		}
	}
	memcpy(rp, tp, num * sizeof(BN_ULONG));
out:
	freezero(tp, (num + 2) * sizeof(BN_ULONG));
	return 1;
}
#else
/*
 * Return value of 0 indicates that multiplication/convolution was not
 * performed to signal the caller to fall down to alternative/original
 * code-path.
 */
int
bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, int num)
{
	return 0;
}
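
/*
 * Editorial sketch (hypothetical caller, not from this file): the
 * zero-return contract above is meant to be used roughly as
 *
 *	if (!bn_mul_mont(rp, ap, bp, np, n0, num))
 *		fall back to the generic BN_mul()/BN_from_montgomery() path;
 *
 * so builds without an accelerated Montgomery word multiplier still
 * produce correct results.
 */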
#endif /* OPENSSL_BN_ASM_MONT */
#endif

#else /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply? */
#undef bn_sqr_comba4
void
bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
	BN_ULONG t[8];
	bn_sqr_normal(r, a, 4, t);
}

#undef bn_sqr_comba8
void
bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
	BN_ULONG t[16];
	bn_sqr_normal(r, a, 8, t);
}

void
bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
	r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
	r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
	r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
	r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

void
bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
	r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
	r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
	r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
	r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
	r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
	r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
	r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
	r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

#ifdef OPENSSL_NO_ASM
#ifdef OPENSSL_BN_ASM_MONT
int
bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
	BN_ULONG c0, c1, *tp, n0 = *n0p;
	int i = 0, j;

	tp = calloc(num + 2, sizeof(BN_ULONG));
	if (tp == NULL)
		return 0;

	for (i = 0; i < num; i++) {
		c0 = bn_mul_add_words(tp, ap, num, bp[i]);
		c1 = (tp[num] + c0) & BN_MASK2;
		tp[num] = c1;
		tp[num + 1] = (c1 < c0 ? 1 : 0);

		c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
		c1 = (tp[num] + c0) & BN_MASK2;
		tp[num] = c1;
		tp[num + 1] += (c1 < c0 ? 1 : 0);
		for (j = 0; j <= num; j++)
			tp[j] = tp[j + 1];
	}

	if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
		c0 = bn_sub_words(rp, tp, np, num);
		if (tp[num] != 0 || c0 == 0) {
			goto out;
		}
	}
	memcpy(rp, tp, num * sizeof(BN_ULONG));
out:
	freezero(tp, (num + 2) * sizeof(BN_ULONG));
	return 1;
}
#else
int
bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    const BN_ULONG *np, const BN_ULONG *n0, int num)
{
	return 0;
}
#endif /* OPENSSL_BN_ASM_MONT */
#endif

#endif /* !BN_MUL_COMBA */