usr/src/common/bignum/bignumimpl.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 /*
  27  * Configuration guide
  28  * -------------------
  29  *
  30  * There are 4 preprocessor symbols used to configure the bignum
  31  * implementation.  This file contains no logic to configure based on
  32  * processor; we leave that to the Makefiles to specify.
  33  *
  34  * USE_FLOATING_POINT
  35  *   Meaning: There is support for a fast floating-point implementation of
  36  *   Montgomery multiply.
  37  *
  38  * PSR_MUL
  39  *   Meaning: There are processor-specific versions of the low level
  40  *   functions to implement big_mul.  Those functions are: big_mul_set_vec,
  41  *   big_mul_add_vec, big_mul_vec, and big_sqr_vec.  PSR_MUL implies support
  42  *   for all 4 functions.  You cannot pick and choose which subset of these
  43  *   functions to support; that would lead to a rat's nest of #ifdefs.
  44  *
  45  * HWCAP
  46  *   Meaning: Call multiply support functions through a function pointer.
  47  *   On x86, there are multiple implementations for different hardware
  48  *   capabilities, such as MMX, SSE2, etc.  Tests are made at run-time, when
  49  *   a function is first used.  So, the support functions are called through
  50  *   a function pointer.  There is no need for that on Sparc, because there
  51  *   is only one implementation; support functions are called directly.
  52  *   Later, if there were some new VIS instruction, or something, and a
  53  *   run-time test were needed, rather than variant kernel modules and
  54  *   libraries, then HWCAP would be defined for Sparc, as well.
  55  *
  56  * UMUL64
  57  *   Meaning: It is safe to use generic C code that assumes the existence
  58  *   of a 32 x 32 --> 64 bit unsigned multiply.  If this is not defined,
  59  *   then the generic code for big_mul_add_vec() must necessarily be very slow,
  60  *   because it must fall back to using 16 x 16 --> 32 bit multiplication.
  61  *
  62  */
  63
  64
  65 #include <sys/types.h>
  66 #include "bignum.h"
  67
  68 #ifdef  _KERNEL
  69 #include <sys/ddi.h>
  70 #include <sys/mdesc.h>
  71 #include <sys/crypto/common.h>
  72
  73 #include <sys/kmem.h>
  74 #include <sys/param.h>
  75 #include <sys/sunddi.h>
  76
  77 #else
  78 #include <stdlib.h>
  79 #include <stdio.h>
  80 #include <assert.h>
  81 #define ASSERT  assert
  82 #endif  /* _KERNEL */
  83
  84 #ifdef __amd64
  85 #ifdef _KERNEL
  86 #include <sys/x86_archext.h>    /* cpuid_getvendor() */
  87 #include <sys/cpuvar.h>
  88 #else
  89 #include <sys/auxv.h>           /* getisax() */
  90 #endif  /* _KERNEL */
  91 #endif  /* __amd64 */
  92
  93
  94 #ifdef  _LP64 /* truncate 64-bit size_t to 32-bits */
  95 #define UI32(ui)        ((uint32_t)ui)
  96 #else /* size_t already 32-bits */
  97 #define UI32(ui)        (ui)
  98 #endif
  99
 100
 101 #ifdef  _KERNEL
 102 #define big_malloc(size)        kmem_alloc(size, KM_NOSLEEP)
 103 #define big_free(ptr, size)     kmem_free(ptr, size)
 104
 105 /*
 106  * big_realloc()
 107  * Allocate memory of newsize bytes and copy oldsize bytes
 108  * to the newly-allocated memory, then free the
 109  * previously-allocated memory.
 110  * Note: newsize must be > oldsize
 111  */
 112 void *
 113 big_realloc(void *from, size_t oldsize, size_t newsize)
 114 {
 115         void *rv;
 116
 117         rv = kmem_alloc(newsize, KM_NOSLEEP);
 118         if (rv != NULL)
 119                 bcopy(from, rv, oldsize);
 120         kmem_free(from, oldsize);
 121         return (rv);
 122 }
 123
 124 #else   /* _KERNEL */
 125
 126 #ifndef MALLOC_DEBUG
 127
 128 #define big_malloc(size)        malloc(size)
 129 #define big_free(ptr, size)     free(ptr)
 130
 131 #else
 132
 133 void
 134 big_free(void *ptr, size_t size)
 135 {
 136         printf("freed %d bytes at %p\n", size, ptr);
 137         free(ptr);
 138 }
 139
 140 void *
 141 big_malloc(size_t size)
 142 {
 143         void *rv;
 144         rv = malloc(size);
 145         printf("malloced %d bytes, addr:%p\n", size, rv);
 146         return (rv);
 147 }
 148 #endif /* MALLOC_DEBUG */
 149
 150 #define big_realloc(x, y, z) realloc((x), (z))
 151
 152
 153 /*
 154  * printbignum()
 155  * Print a BIGNUM type to stdout.
 156  */
 157 void
 158 printbignum(char *aname, BIGNUM *a)
 159 {
 160         int i;
 161
 162         (void) printf("\n%s\n%d\n", aname, a->sign*a->len);
 163         for (i = a->len - 1; i >= 0; i--) {
 164 #ifdef BIGNUM_CHUNK_32
 165                 (void) printf("%08x ", a->value[i]);
 166                 if (((i & (BITSINBYTE - 1)) == 0) && (i != 0)) {
 167                         (void) printf("\n");
 168                 }
 169 #else
 170                 (void) printf("%08x %08x ", (uint32_t)((a->value[i]) >> 32),
 171                     (uint32_t)((a->value[i]) & 0xffffffff));
 172                 if (((i & 3) == 0) && (i != 0)) { /* end of this chunk */
 173                         (void) printf("\n");
 174                 }
 175 #endif
 176         }
 177         (void) printf("\n");
 178 }
 179
 180 #endif  /* _KERNEL */
 181
 182
 183 #ifdef  __amd64
 184 /*
 185  * Return 1 if executing on Intel, otherwise 0 (e.g., AMD64).
 186  * Cache the result, as the CPU can't change.
 187  *
 188  * Note: the userland version uses getisax() and checks for an AMD-64-only
 189  * feature.  The kernel version uses cpuid_getvendor().
 190  */
 191 static int
 192 bignum_on_intel(void)
 193 {
 194         static int      cached_result = -1;
 195
 196         if (cached_result == -1) { /* first time */
 197 #ifdef _KERNEL
 198                 cached_result = (cpuid_getvendor(CPU) == X86_VENDOR_Intel);
 199 #else
 200                 uint_t  ui;
 201
 202                 (void) getisax(&ui, 1);
 203                 cached_result = ((ui & AV_386_AMD_MMX) == 0);
 204 #endif  /* _KERNEL */
 205         }
 206
 207         return (cached_result);
 208 }
 209 #endif  /* __amd64 */
 210
 211
 212 /*
 213  * big_init()
 214  * Initialize and allocate memory for a BIGNUM type.
 215  *
 216  * big_init(number, size) is equivalent to big_init1(number, size, NULL, 0)
 217  *
 218  * Note: call big_finish() to free memory allocated by big_init().
 219  *
 220  * Input:
 221  * number       Uninitialized memory for BIGNUM
 222  * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
 223  *
 224  * Output:
 225  * number       Initialized BIGNUM
 226  *
 227  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
 228  */
 229 BIG_ERR_CODE
 230 big_init(BIGNUM *number, int size)
 231 {
 232         number->value = big_malloc(BIGNUM_WORDSIZE * size);
 233         if (number->value == NULL) {
 234                 return (BIG_NO_MEM);
 235         }
 236         number->size = size;
 237         number->len = 0;
 238         number->sign = 1;
 239         number->malloced = 1;
 240         return (BIG_OK);
 241 }
 242
 243
 244 /*
 245  * big_init1()
 246  * Initialize and, if needed, allocate memory for a BIGNUM type.
 247  * Use the buffer passed, buf, if any, instad of allocating memory
 248  * if it's at least "size" bytes.
 249  *
 250  * Note: call big_finish() to free memory allocated by big_init().
 251  *
 252  * Input:
 253  * number       Uninitialized memory for BIGNUM
 254  * size         Minimum size, in BIG_CHUNK_SIZE-bit words, required for BIGNUM
 255  * buf          Buffer for storing a BIGNUM.
 256  *              If NULL, big_init1() will allocate a buffer
 257  * bufsize      Size, in BIG_CHUNK_SIZE_bit words, of buf
 258  *
 259  * Output:
 260  * number       Initialized BIGNUM
 261  *
 262  * Return BIG_OK on success or BIG_NO_MEM for an allocation error.
 263  */
 264 BIG_ERR_CODE
 265 big_init1(BIGNUM *number, int size, BIG_CHUNK_TYPE *buf, int bufsize)
 266 {
 267         if ((buf == NULL) || (size > bufsize)) {
 268                 number->value = big_malloc(BIGNUM_WORDSIZE * size);
 269                 if (number->value == NULL) {
 270                         return (BIG_NO_MEM);
 271                 }
 272                 number->size = size;
 273                 number->malloced = 1;
 274         } else {
 275                 number->value = buf;
 276                 number->size = bufsize;
 277                 number->malloced = 0;
 278         }
 279         number->len = 0;
 280         number->sign = 1;
 281
 282         return (BIG_OK);
 283 }
 284
 285
 286 /*
 287  * big_finish()
 288  * Free memory, if any, allocated by big_init() or big_init1().
 289  */
 290 void
 291 big_finish(BIGNUM *number)
 292 {
 293         if (number->malloced == 1) {
 294                 big_free(number->value, BIGNUM_WORDSIZE * number->size);
 295                 number->malloced = 0;
 296         }
 297 }
 298
 299
 300 /*
 301  * bn->size should be at least
 302  * (len + BIGNUM_WORDSIZE - 1) / BIGNUM_WORDSIZE bytes
 303  * converts from byte-big-endian format to bignum format (words in
 304  * little endian order, but bytes within the words big endian)
 305  */
 306 void
 307 bytestring2bignum(BIGNUM *bn, uchar_t *kn, size_t len)
 308 {
 309         int             i, j;
 310         uint32_t        offs;
 311         const uint32_t  slen = UI32(len);
 312         BIG_CHUNK_TYPE  word;
 313         uchar_t         *knwordp;
 314
 315         if (slen == 0) {
 316                 bn->len = 1;
 317                 bn->value[0] = 0;
 318                 return;
 319         }
 320
 321         offs = slen % BIGNUM_WORDSIZE;
 322         bn->len = slen / BIGNUM_WORDSIZE;
 323
 324         for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
 325                 knwordp = &(kn[slen - BIGNUM_WORDSIZE * (i + 1)]);
 326                 word = knwordp[0];
 327                 for (j = 1; j < BIGNUM_WORDSIZE; j++) {
 328                         word = (word << BITSINBYTE) + knwordp[j];
 329                 }
 330                 bn->value[i] = word;
 331         }
 332         if (offs > 0) {
 333                 word = kn[0];
 334                 for (i = 1; i < offs; i++) word = (word << BITSINBYTE) + kn[i];
 335                 bn->value[bn->len++] = word;
 336         }
 337         while ((bn->len > 1) && (bn->value[bn->len - 1] == 0)) {
 338                 bn->len --;
 339         }
 340 }
 341
 342
 343 /*
 344  * copies the least significant len bytes if
 345  * len < bn->len * BIGNUM_WORDSIZE
 346  * converts from bignum format to byte-big-endian format.
 347  * bignum format is words of type  BIG_CHUNK_TYPE in little endian order.
 348  */
 349 void
 350 bignum2bytestring(uchar_t *kn, BIGNUM *bn, size_t len)
 351 {
 352         int             i, j;
 353         uint32_t        offs;
 354         const uint32_t  slen = UI32(len);
 355         BIG_CHUNK_TYPE  word;
 356
 357         if (len < BIGNUM_WORDSIZE * bn->len) {
 358                 for (i = 0; i < slen / BIGNUM_WORDSIZE; i++) {
 359                         word = bn->value[i];
 360                         for (j = 0; j < BIGNUM_WORDSIZE; j++) {
 361                                 kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
 362                                     word & 0xff;
 363                                 word = word >> BITSINBYTE;
 364                         }
 365                 }
 366                 offs = slen % BIGNUM_WORDSIZE;
 367                 if (offs > 0) {
 368                         word = bn->value[slen / BIGNUM_WORDSIZE];
 369                         for (i =  slen % BIGNUM_WORDSIZE; i > 0; i --) {
 370                                 kn[i - 1] = word & 0xff;
 371                                 word = word >> BITSINBYTE;
 372                         }
 373                 }
 374         } else {
 375                 for (i = 0; i < bn->len; i++) {
 376                         word = bn->value[i];
 377                         for (j = 0; j < BIGNUM_WORDSIZE; j++) {
 378                                 kn[slen - BIGNUM_WORDSIZE * i - j - 1] =
 379                                     word & 0xff;
 380                                 word = word >> BITSINBYTE;
 381                         }
 382                 }
 383                 for (i = 0; i < slen - BIGNUM_WORDSIZE * bn->len; i++) {
 384                         kn[i] = 0;
 385                 }
 386         }
 387 }
 388
 389
 390 int
 391 big_bitlength(BIGNUM *a)
 392 {
 393         int             l = 0, b = 0;
 394         BIG_CHUNK_TYPE  c;
 395
 396         l = a->len - 1;
 397         while ((l > 0) && (a->value[l] == 0)) {
 398                 l--;
 399         }
 400         b = BIG_CHUNK_SIZE;
 401         c = a->value[l];
 402         while ((b > 1) && ((c & BIG_CHUNK_HIGHBIT) == 0)) {
 403                 c = c << 1;
 404                 b--;
 405         }
 406
 407         return (l * BIG_CHUNK_SIZE + b);
 408 }
 409
 410
 411 /*
 412  * big_copy()
 413  * Copy BIGNUM src to dest, allocating memory if needed.
 414  */
 415 BIG_ERR_CODE
 416 big_copy(BIGNUM *dest, BIGNUM *src)
 417 {
 418         BIG_CHUNK_TYPE  *newptr;
 419         int             i, len;
 420
 421         len = src->len;
 422         while ((len > 1) && (src->value[len - 1] == 0)) {
 423                 len--;
 424         }
 425         src->len = len;
 426         if (dest->size < len) {
 427                 if (dest->malloced == 1) {
 428                         newptr = (BIG_CHUNK_TYPE *)big_realloc(dest->value,
 429                             BIGNUM_WORDSIZE * dest->size,
 430                             BIGNUM_WORDSIZE * len);
 431                 } else {
 432                         newptr = (BIG_CHUNK_TYPE *)
 433                             big_malloc(BIGNUM_WORDSIZE * len);
 434                         if (newptr != NULL) {
 435                                 dest->malloced = 1;
 436                         }
 437                 }
 438                 if (newptr == NULL) {
 439                         return (BIG_NO_MEM);
 440                 }
 441                 dest->value = newptr;
 442                 dest->size = len;
 443         }
 444         dest->len = len;
 445         dest->sign = src->sign;
 446         for (i = 0; i < len; i++) {
 447                 dest->value[i] = src->value[i];
 448         }
 449
 450         return (BIG_OK);
 451 }
 452
 453
 454 /*
 455  * big_extend()
 456  * Allocate memory to extend BIGNUM number to size bignum chunks,
 457  * if not at least that size already.
 458  */
 459 BIG_ERR_CODE
 460 big_extend(BIGNUM *number, int size)
 461 {
 462         BIG_CHUNK_TYPE  *newptr;
 463         int             i;
 464
 465         if (number->size >= size)
 466                 return (BIG_OK);
 467         if (number->malloced) {
 468                 number->value = big_realloc(number->value,
 469                     BIGNUM_WORDSIZE * number->size,
 470                     BIGNUM_WORDSIZE * size);
 471         } else {
 472                 newptr = big_malloc(BIGNUM_WORDSIZE * size);
 473                 if (newptr != NULL) {
 474                         for (i = 0; i < number->size; i++) {
 475                                 newptr[i] = number->value[i];
 476                         }
 477                 }
 478                 number->value = newptr;
 479         }
 480
 481         if (number->value == NULL) {
 482                 return (BIG_NO_MEM);
 483         }
 484
 485         number->size = size;
 486         number->malloced = 1;
 487         return (BIG_OK);
 488 }
 489
 490
 491 /* returns 1 if n == 0 */
 492 int
 493 big_is_zero(BIGNUM *n)
 494 {
 495         int     i, result;
 496
 497         result = 1;
 498         for (i = 0; i < n->len; i++) {
 499                 if (n->value[i] != 0) {
 500                         result = 0;
 501                 }
 502         }
 503         return (result);
 504 }
 505
 506
 507 BIG_ERR_CODE
 508 big_add_abs(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
 509 {
 510         int             i, shorter, longer;
 511         BIG_CHUNK_TYPE  cy, ai;
 512         BIG_CHUNK_TYPE  *r, *a, *b, *c;
 513         BIG_ERR_CODE    err;
 514         BIGNUM          *longerarg;
 515
 516         if (aa->len > bb->len) {
 517                 shorter = bb->len;
 518                 longer = aa->len;
 519                 longerarg = aa;
 520         } else {
 521                 shorter = aa->len;
 522                 longer = bb->len;
 523                 longerarg = bb;
 524         }
 525         if (result->size < longer + 1) {
 526                 err = big_extend(result, longer + 1);
 527                 if (err != BIG_OK) {
 528                         return (err);
 529                 }
 530         }
 531
 532         r = result->value;
 533         a = aa->value;
 534         b = bb->value;
 535         c = longerarg->value;
 536         cy = 0;
 537         for (i = 0; i < shorter; i++) {
 538                 ai = a[i];
 539                 r[i] = ai + b[i] + cy;
 540                 if (r[i] > ai) {
 541                         cy = 0;
 542                 } else if (r[i] < ai) {
 543                         cy = 1;
 544                 }
 545         }
 546         for (; i < longer; i++) {
 547                 ai = c[i];
 548                 r[i] = ai + cy;
 549                 if (r[i] >= ai) {
 550                         cy = 0;
 551                 }
 552         }
 553         if (cy == 1) {
 554                 r[i] = cy;
 555                 result->len = longer + 1;
 556         } else {
 557                 result->len = longer;
 558         }
 559         result->sign = 1;
 560         return (BIG_OK);
 561 }
 562
 563
 564 /* caller must make sure that result has at least len words allocated */
 565 void
 566 big_sub_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, BIG_CHUNK_TYPE *b, int len)
 567 {
 568         int             i;
 569         BIG_CHUNK_TYPE  cy, ai;
 570
 571         cy = 1;
 572         for (i = 0; i < len; i++) {
 573                 ai = a[i];
 574                 r[i] = ai + (~b[i]) + cy;
 575                 if (r[i] > ai) {
 576                         cy = 0;
 577                 } else if (r[i] < ai) {
 578                         cy = 1;
 579                 }
 580         }
 581 }
 582
 583
 584 /* result=aa-bb  it is assumed that aa>=bb */
 585 BIG_ERR_CODE
 586 big_sub_pos(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
 587 {
 588         int             i, shorter;
 589         BIG_CHUNK_TYPE  cy = 1, ai;
 590         BIG_CHUNK_TYPE  *r, *a, *b;
 591         BIG_ERR_CODE    err = BIG_OK;
 592
 593         if (aa->len > bb->len) {
 594                 shorter = bb->len;
 595         } else {
 596                 shorter = aa->len;
 597         }
 598         if (result->size < aa->len) {
 599                 err = big_extend(result, aa->len);
 600                 if (err != BIG_OK) {
 601                         return (err);
 602                 }
 603         }
 604
 605         r = result->value;
 606         a = aa->value;
 607         b = bb->value;
 608         result->len = aa->len;
 609         cy = 1;
 610         for (i = 0; i < shorter; i++) {
 611                 ai = a[i];
 612                 r[i] = ai + (~b[i]) + cy;
 613                 if (r[i] > ai) {
 614                         cy = 0;
 615                 } else if (r[i] < ai) {
 616                         cy = 1;
 617                 }
 618         }
 619         for (; i < aa->len; i++) {
 620                 ai = a[i];
 621                 r[i] = ai + (~0) + cy;
 622                 if (r[i] < ai) {
 623                         cy = 1;
 624                 }
 625         }
 626         result->sign = 1;
 627
 628         if (cy == 0) {
 629                 return (BIG_INVALID_ARGS);
 630         } else {
 631                 return (BIG_OK);
 632         }
 633 }
 634
 635
 636 /* returns -1 if |aa|<|bb|, 0 if |aa|==|bb| 1 if |aa|>|bb| */
 637 int
 638 big_cmp_abs(BIGNUM *aa, BIGNUM *bb)
 639 {
 640         int     i;
 641
 642         if (aa->len > bb->len) {
 643                 for (i = aa->len - 1; i > bb->len - 1; i--) {
 644                         if (aa->value[i] > 0) {
 645                                 return (1);
 646                         }
 647                 }
 648         } else if (aa->len < bb->len) {
 649                 for (i = bb->len - 1; i > aa->len - 1; i--) {
 650                         if (bb->value[i] > 0) {
 651                                 return (-1);
 652                         }
 653                 }
 654         } else {
 655                 i = aa->len - 1;
 656         }
 657         for (; i >= 0; i--) {
 658                 if (aa->value[i] > bb->value[i]) {
 659                         return (1);
 660                 } else if (aa->value[i] < bb->value[i]) {
 661                         return (-1);
 662                 }
 663         }
 664
 665         return (0);
 666 }
 667
 668
 669 BIG_ERR_CODE
 670 big_sub(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
 671 {
 672         BIG_ERR_CODE    err;
 673
 674         if ((bb->sign == -1) && (aa->sign == 1)) {
 675                 if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
 676                         return (err);
 677                 }
 678                 result->sign = 1;
 679         } else if ((aa->sign == -1) && (bb->sign == 1)) {
 680                 if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
 681                         return (err);
 682                 }
 683                 result->sign = -1;
 684         } else if ((aa->sign == 1) && (bb->sign == 1)) {
 685                 if (big_cmp_abs(aa, bb) >= 0) {
 686                         if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
 687                                 return (err);
 688                         }
 689                         result->sign = 1;
 690                 } else {
 691                         if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
 692                                 return (err);
 693                         }
 694                         result->sign = -1;
 695                 }
 696         } else {
 697                 if (big_cmp_abs(aa, bb) >= 0) {
 698                         if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
 699                                 return (err);
 700                         }
 701                         result->sign = -1;
 702                 } else {
 703                         if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
 704                                 return (err);
 705                         }
 706                         result->sign = 1;
 707                 }
 708         }
 709         return (BIG_OK);
 710 }
 711
 712
 713 BIG_ERR_CODE
 714 big_add(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
 715 {
 716         BIG_ERR_CODE    err;
 717
 718         if ((bb->sign == -1) && (aa->sign == -1)) {
 719                 if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
 720                         return (err);
 721                 }
 722                 result->sign = -1;
 723         } else if ((aa->sign == 1) && (bb->sign == 1)) {
 724                 if ((err = big_add_abs(result, aa, bb)) != BIG_OK) {
 725                         return (err);
 726                 }
 727                 result->sign = 1;
 728         } else if ((aa->sign == 1) && (bb->sign == -1)) {
 729                 if (big_cmp_abs(aa, bb) >= 0) {
 730                         if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
 731                                 return (err);
 732                         }
 733                         result->sign = 1;
 734                 } else {
 735                         if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
 736                                 return (err);
 737                         }
 738                         result->sign = -1;
 739                 }
 740         } else {
 741                 if (big_cmp_abs(aa, bb) >= 0) {
 742                         if ((err = big_sub_pos(result, aa, bb)) != BIG_OK) {
 743                                 return (err);
 744                         }
 745                         result->sign = -1;
 746                 } else {
 747                         if ((err = big_sub_pos(result, bb, aa)) != BIG_OK) {
 748                                 return (err);
 749                         }
 750                         result->sign = 1;
 751                 }
 752         }
 753         return (BIG_OK);
 754 }
 755
 756
 757 /* result = aa/2 */
 758 BIG_ERR_CODE
 759 big_half_pos(BIGNUM *result, BIGNUM *aa)
 760 {
 761         BIG_ERR_CODE    err;
 762         int             i;
 763         BIG_CHUNK_TYPE  cy, cy1;
 764         BIG_CHUNK_TYPE  *a, *r;
 765
 766         if (result->size < aa->len) {
 767                 err = big_extend(result, aa->len);
 768                 if (err != BIG_OK) {
 769                         return (err);
 770                 }
 771         }
 772
 773         result->len = aa->len;
 774         a = aa->value;
 775         r = result->value;
 776         cy = 0;
 777         for (i = aa->len - 1; i >= 0; i--) {
 778                 cy1 = a[i] << (BIG_CHUNK_SIZE - 1);
 779                 r[i] = (cy | (a[i] >> 1));
 780                 cy = cy1;
 781         }
 782         if (r[result->len - 1] == 0) {
 783                 result->len--;
 784         }
 785
 786         return (BIG_OK);
 787 }
 788
 789 /* result  =  aa*2 */
 790 BIG_ERR_CODE
 791 big_double(BIGNUM *result, BIGNUM *aa)
 792 {
 793         BIG_ERR_CODE    err;
 794         int             i, rsize;
 795         BIG_CHUNK_TYPE  cy, cy1;
 796         BIG_CHUNK_TYPE  *a, *r;
 797
 798         if ((aa->len > 0) &&
 799             ((aa->value[aa->len - 1] & BIG_CHUNK_HIGHBIT) != 0)) {
 800                 rsize = aa->len + 1;
 801         } else {
 802                 rsize = aa->len;
 803         }
 804
 805         if (result->size < rsize) {
 806                 err = big_extend(result, rsize);
 807                 if (err != BIG_OK)
 808                         return (err);
 809         }
 810
 811         a = aa->value;
 812         r = result->value;
 813         if (rsize == aa->len + 1) {
 814                 r[rsize - 1] = 1;
 815         }
 816         cy = 0;
 817         for (i = 0; i < aa->len; i++) {
 818                 cy1 = a[i] >> (BIG_CHUNK_SIZE - 1);
 819                 r[i] = (cy | (a[i] << 1));
 820                 cy = cy1;
 821         }
 822         result->len = rsize;
 823         return (BIG_OK);
 824 }
 825
 826
 827 /*
 828  * returns aa mod b, aa must be nonneg, b must be a max
 829  * (BIG_CHUNK_SIZE / 2)-bit integer
 830  */
 831 static uint32_t
 832 big_modhalf_pos(BIGNUM *aa, uint32_t b)
 833 {
 834         int             i;
 835         BIG_CHUNK_TYPE  rem;
 836
 837         if (aa->len == 0) {
 838                 return (0);
 839         }
 840         rem = aa->value[aa->len - 1] % b;
 841         for (i = aa->len - 2; i >= 0; i--) {
 842                 rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
 843                     (aa->value[i] >> (BIG_CHUNK_SIZE / 2))) % b;
 844                 rem = ((rem << (BIG_CHUNK_SIZE / 2)) |
 845                     (aa->value[i] & BIG_CHUNK_LOWHALFBITS)) % b;
 846         }
 847
 848         return ((uint32_t)rem);
 849 }
 850
 851
 852 /*
 853  * result = aa - (2^BIG_CHUNK_SIZE)^lendiff * bb
 854  * result->size should be at least aa->len at entry
 855  * aa, bb, and result should be positive
 856  */
 857 void
 858 big_sub_pos_high(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
 859 {
 860         int i, lendiff;
 861         BIGNUM res1, aa1;
 862
 863         lendiff = aa->len - bb->len;
 864         res1.size = result->size - lendiff;
 865         res1.malloced = 0;
 866         res1.value = result->value + lendiff;
 867         aa1.size = aa->size - lendiff;
 868         aa1.value = aa->value + lendiff;
 869         aa1.len = bb->len;
 870         aa1.sign = 1;
 871         (void) big_sub_pos(&res1, &aa1, bb);
 872         if (result->value != aa->value) {
 873                 for (i = 0; i < lendiff; i++) {
 874                         result->value[i] = aa->value[i];
 875                 }
 876         }
 877         result->len = aa->len;
 878 }
 879
 880
 881 /*
 882  * returns 1, 0, or -1 depending on whether |aa| > , ==, or <
 883  *                                      (2^BIG_CHUNK_SIZE)^lendiff * |bb|
 884  * aa->len should be >= bb->len
 885  */
 886 int
 887 big_cmp_abs_high(BIGNUM *aa, BIGNUM *bb)
 888 {
 889         int             lendiff;
 890         BIGNUM          aa1;
 891
 892         lendiff = aa->len - bb->len;
 893         aa1.len = bb->len;
 894         aa1.size = aa->size - lendiff;
 895         aa1.malloced = 0;
 896         aa1.value = aa->value + lendiff;
 897         return (big_cmp_abs(&aa1, bb));
 898 }
 899
 900
 901 /*
 902  * result = aa * b where b is a max. (BIG_CHUNK_SIZE / 2)-bit positive integer.
 903  * result should have enough space allocated.
 904  */
 905 static void
 906 big_mulhalf_low(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
 907 {
 908         int             i;
 909         BIG_CHUNK_TYPE  t1, t2, ai, cy;
 910         BIG_CHUNK_TYPE  *a, *r;
 911
 912         a = aa->value;
 913         r = result->value;
 914         cy = 0;
 915         for (i = 0; i < aa->len; i++) {
 916                 ai = a[i];
 917                 t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
 918                 t2 = (ai >> (BIG_CHUNK_SIZE / 2)) * b +
 919                     (t1 >> (BIG_CHUNK_SIZE / 2));
 920                 r[i] = (t1 & BIG_CHUNK_LOWHALFBITS) |
 921                     (t2 << (BIG_CHUNK_SIZE / 2));
 922                 cy = t2 >> (BIG_CHUNK_SIZE / 2);
 923         }
 924         r[i] = cy;
 925         result->len = aa->len + 1;
 926         result->sign = aa->sign;
 927 }
 928
 929
 930 /*
 931  * result = aa * b * 2^(BIG_CHUNK_SIZE / 2) where b is a max.
 932  * (BIG_CHUNK_SIZE / 2)-bit positive integer.
 933  * result should have enough space allocated.
 934  */
 935 static void
 936 big_mulhalf_high(BIGNUM *result, BIGNUM *aa, BIG_CHUNK_TYPE b)
 937 {
 938         int             i;
 939         BIG_CHUNK_TYPE  t1, t2, ai, cy, ri;
 940         BIG_CHUNK_TYPE  *a, *r;
 941
 942         a = aa->value;
 943         r = result->value;
 944         cy = 0;
 945         ri = 0;
 946         for (i = 0; i < aa->len; i++) {
 947                 ai = a[i];
 948                 t1 = (ai & BIG_CHUNK_LOWHALFBITS) * b + cy;
 949                 t2 = (ai >>  (BIG_CHUNK_SIZE / 2)) * b +
 950                     (t1 >>  (BIG_CHUNK_SIZE / 2));
 951                 r[i] = (t1 <<  (BIG_CHUNK_SIZE / 2)) + ri;
 952                 ri = t2 & BIG_CHUNK_LOWHALFBITS;
 953                 cy = t2 >> (BIG_CHUNK_SIZE / 2);
 954         }
 955         r[i] = (cy <<  (BIG_CHUNK_SIZE / 2)) + ri;
 956         result->len = aa->len + 1;
 957         result->sign = aa->sign;
 958 }
 959
 960
 961 /* it is assumed that result->size is big enough */
 962 void
 963 big_shiftleft(BIGNUM *result, BIGNUM *aa, int offs)
 964 {
 965         int             i;
 966         BIG_CHUNK_TYPE  cy, ai;
 967
 968         if (offs == 0) {
 969                 if (result != aa) {
 970                         (void) big_copy(result, aa);
 971                 }
 972                 return;
 973         }
 974         cy = 0;
 975         for (i = 0; i < aa->len; i++) {
 976                 ai = aa->value[i];
 977                 result->value[i] = (ai << offs) | cy;
 978                 cy = ai >> (BIG_CHUNK_SIZE - offs);
 979         }
 980         if (cy != 0) {
 981                 result->len = aa->len + 1;
 982                 result->value[result->len - 1] = cy;
 983         } else {
 984                 result->len = aa->len;
 985         }
 986         result->sign = aa->sign;
 987 }
 988
 989
 990 /* it is assumed that result->size is big enough */
 991 void
 992 big_shiftright(BIGNUM *result, BIGNUM *aa, int offs)
 993 {
 994         int              i;
 995         BIG_CHUNK_TYPE  cy, ai;
 996
 997         if (offs == 0) {
 998                 if (result != aa) {
 999                         (void) big_copy(result, aa);
1000                 }
1001                 return;
1002         }
1003         cy = aa->value[0] >> offs;
1004         for (i = 1; i < aa->len; i++) {
1005                 ai = aa->value[i];
1006                 result->value[i - 1] = (ai << (BIG_CHUNK_SIZE - offs)) | cy;
1007                 cy = ai >> offs;
1008         }
1009         result->len = aa->len;
1010         result->value[result->len - 1] = cy;
1011         result->sign = aa->sign;
1012 }
1013
1014
1015 /*
1016  * result = aa/bb   remainder = aa mod bb
1017  * it is assumed that aa and bb are positive
1018  */
1019 BIG_ERR_CODE
1020 big_div_pos(BIGNUM *result, BIGNUM *remainder, BIGNUM *aa, BIGNUM *bb)
1021 {
1022         BIG_ERR_CODE    err = BIG_OK;
1023         int             i, alen, blen, tlen, rlen, offs;
1024         BIG_CHUNK_TYPE  higha, highb, coeff;
1025         BIG_CHUNK_TYPE  *a, *b;
1026         BIGNUM          bbhigh, bblow, tresult, tmp1, tmp2;
1027         BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
1028         BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
1029         BIG_CHUNK_TYPE  tresultvalue[BIGTMPSIZE];
1030         BIG_CHUNK_TYPE  bblowvalue[BIGTMPSIZE];
1031         BIG_CHUNK_TYPE  bbhighvalue[BIGTMPSIZE];
1032
1033         a = aa->value;
1034         b = bb->value;
1035         alen = aa->len;
1036         blen = bb->len;
1037         while ((alen > 1) && (a[alen - 1] == 0)) {
1038                 alen = alen - 1;
1039         }
1040         aa->len = alen;
1041         while ((blen > 1) && (b[blen - 1] == 0)) {
1042                 blen = blen - 1;
1043         }
1044         bb->len = blen;
1045         if ((blen == 1) && (b[0] == 0)) {
1046                 return (BIG_DIV_BY_0);
1047         }
1048
1049         if (big_cmp_abs(aa, bb) < 0) {
1050                 if ((remainder != NULL) &&
1051                     ((err = big_copy(remainder, aa)) != BIG_OK)) {
1052                         return (err);
1053                 }
1054                 if (result != NULL) {
1055                         result->len = 1;
1056                         result->sign = 1;
1057                         result->value[0] = 0;
1058                 }
1059                 return (BIG_OK);
1060         }
1061
1062         if ((err = big_init1(&bblow, blen + 1,
1063             bblowvalue, arraysize(bblowvalue))) != BIG_OK)
1064                 return (err);
1065
1066         if ((err = big_init1(&bbhigh, blen + 1,
1067             bbhighvalue, arraysize(bbhighvalue))) != BIG_OK)
1068                 goto ret1;
1069
1070         if ((err = big_init1(&tmp1, alen + 2,
1071             tmp1value, arraysize(tmp1value))) != BIG_OK)
1072                 goto ret2;
1073
1074         if ((err = big_init1(&tmp2, blen + 2,
1075             tmp2value, arraysize(tmp2value))) != BIG_OK)
1076                 goto ret3;
1077
1078         if ((err = big_init1(&tresult, alen - blen + 2,
1079             tresultvalue, arraysize(tresultvalue))) != BIG_OK)
1080                 goto ret4;
1081
1082         offs = 0;
1083         highb = b[blen - 1];
1084         if (highb >= (BIG_CHUNK_HALF_HIGHBIT << 1)) {
1085                 highb = highb >> (BIG_CHUNK_SIZE / 2);
1086                 offs = (BIG_CHUNK_SIZE / 2);
1087         }
1088         while ((highb & BIG_CHUNK_HALF_HIGHBIT) == 0) {
1089                 highb = highb << 1;
1090                 offs++;
1091         }
1092
1093         big_shiftleft(&bblow, bb, offs);
1094
1095         if (offs <= (BIG_CHUNK_SIZE / 2 - 1)) {
1096                 big_shiftleft(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1097         } else {
1098                 big_shiftright(&bbhigh, &bblow, BIG_CHUNK_SIZE / 2);
1099         }
1100         if (bbhigh.value[bbhigh.len - 1] == 0) {
1101                 bbhigh.len--;
1102         } else {
1103                 bbhigh.value[bbhigh.len] = 0;
1104         }
1105
1106         highb = bblow.value[bblow.len - 1];
1107
1108         big_shiftleft(&tmp1, aa, offs);
1109         rlen = tmp1.len - bblow.len + 1;
1110         tresult.len = rlen;
1111
1112         tmp1.len++;
1113         tlen = tmp1.len;
1114         tmp1.value[tmp1.len - 1] = 0;
1115         for (i = 0; i < rlen; i++) {
1116                 higha = (tmp1.value[tlen - 1] << (BIG_CHUNK_SIZE / 2)) +
1117                     (tmp1.value[tlen - 2] >> (BIG_CHUNK_SIZE / 2));
1118                 coeff = higha / (highb + 1);
1119                 big_mulhalf_high(&tmp2, &bblow, coeff);
1120                 big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1121                 bbhigh.len++;
1122                 while (tmp1.value[tlen - 1] > 0) {
1123                         big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1124                         coeff++;
1125                 }
1126                 bbhigh.len--;
1127                 tlen--;
1128                 tmp1.len--;
1129                 while (big_cmp_abs_high(&tmp1, &bbhigh) >= 0) {
1130                         big_sub_pos_high(&tmp1, &tmp1, &bbhigh);
1131                         coeff++;
1132                 }
1133                 tresult.value[rlen - i - 1] = coeff << (BIG_CHUNK_SIZE / 2);
1134                 higha = tmp1.value[tlen - 1];
1135                 coeff = higha / (highb + 1);
1136                 big_mulhalf_low(&tmp2, &bblow, coeff);
1137                 tmp2.len--;
1138                 big_sub_pos_high(&tmp1, &tmp1, &tmp2);
1139                 while (big_cmp_abs_high(&tmp1, &bblow) >= 0) {
1140                         big_sub_pos_high(&tmp1, &tmp1, &bblow);
1141                         coeff++;
1142                 }
1143                 tresult.value[rlen - i - 1] =
1144                     tresult.value[rlen - i - 1] + coeff;
1145         }
1146
1147         big_shiftright(&tmp1, &tmp1, offs);
1148
1149         err = BIG_OK;
1150
1151         if ((remainder != NULL) &&
1152             ((err = big_copy(remainder, &tmp1)) != BIG_OK))
1153                 goto ret;
1154
1155         if (result != NULL)
1156                 err = big_copy(result, &tresult);
1157
1158 ret:
1159         big_finish(&tresult);
1160 ret4:
1161         big_finish(&tmp1);
1162 ret3:
1163         big_finish(&tmp2);
1164 ret2:
1165         big_finish(&bbhigh);
1166 ret1:
1167         big_finish(&bblow);
1168         return (err);
1169 }
1170
1171
1172 /*
1173  * If there is no processor-specific integer implementation of
1174  * the lower level multiply functions, then this code is provided
1175  * for big_mul_set_vec(), big_mul_add_vec(), big_mul_vec() and
1176  * big_sqr_vec().
1177  *
1178  * There are two generic implementations.  One that assumes that
1179  * there is hardware and C compiler support for a 32 x 32 --> 64
1180  * bit unsigned multiply, but otherwise is not specific to any
1181  * processor, platform, or ISA.
1182  *
1183  * The other makes very few assumptions about hardware capabilities.
1184  * It does not even assume that there is any implementation of a
1185  * 32 x 32 --> 64 bit multiply that is accessible to C code and
1186  * appropriate to use.  It falls constructs 32 x 32 --> 64 bit
1187  * multiplies from 16 x 16 --> 32 bit multiplies.
1188  *
1189  */
1190
1191 #if !defined(PSR_MUL)
1192
1193 #ifdef UMUL64
1194
1195 #if (BIG_CHUNK_SIZE == 32)
1196
1197 #define UNROLL8
1198
1199 #define MUL_SET_VEC_ROUND_PREFETCH(R) \
1200         p = pf * d; \
1201         pf = (uint64_t)a[R + 1]; \
1202         t = p + cy; \
1203         r[R] = (uint32_t)t; \
1204         cy = t >> 32
1205
1206 #define MUL_SET_VEC_ROUND_NOPREFETCH(R) \
1207         p = pf * d; \
1208         t = p + cy; \
1209         r[R] = (uint32_t)t; \
1210         cy = t >> 32
1211
1212 #define MUL_ADD_VEC_ROUND_PREFETCH(R) \
1213         t = (uint64_t)r[R]; \
1214         p = pf * d; \
1215         pf = (uint64_t)a[R + 1]; \
1216         t = p + t + cy; \
1217         r[R] = (uint32_t)t; \
1218         cy = t >> 32
1219
1220 #define MUL_ADD_VEC_ROUND_NOPREFETCH(R) \
1221         t = (uint64_t)r[R]; \
1222         p = pf * d; \
1223         t = p + t + cy; \
1224         r[R] = (uint32_t)t; \
1225         cy = t >> 32
1226
1227 #ifdef UNROLL8
1228
1229 #define UNROLL 8
1230
1231 /*
1232  * r = a * b
1233  * where r and a are vectors; b is a single 32-bit digit
1234  */
1235
1236 uint32_t
1237 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1238 {
1239         uint64_t d, pf, p, t, cy;
1240
1241         if (len == 0)
1242                 return (0);
1243         cy = 0;
1244         d = (uint64_t)b;
1245         pf = (uint64_t)a[0];
1246         while (len > UNROLL) {
1247                 MUL_SET_VEC_ROUND_PREFETCH(0);
1248                 MUL_SET_VEC_ROUND_PREFETCH(1);
1249                 MUL_SET_VEC_ROUND_PREFETCH(2);
1250                 MUL_SET_VEC_ROUND_PREFETCH(3);
1251                 MUL_SET_VEC_ROUND_PREFETCH(4);
1252                 MUL_SET_VEC_ROUND_PREFETCH(5);
1253                 MUL_SET_VEC_ROUND_PREFETCH(6);
1254                 MUL_SET_VEC_ROUND_PREFETCH(7);
1255                 r += UNROLL;
1256                 a += UNROLL;
1257                 len -= UNROLL;
1258         }
1259         if (len == UNROLL) {
1260                 MUL_SET_VEC_ROUND_PREFETCH(0);
1261                 MUL_SET_VEC_ROUND_PREFETCH(1);
1262                 MUL_SET_VEC_ROUND_PREFETCH(2);
1263                 MUL_SET_VEC_ROUND_PREFETCH(3);
1264                 MUL_SET_VEC_ROUND_PREFETCH(4);
1265                 MUL_SET_VEC_ROUND_PREFETCH(5);
1266                 MUL_SET_VEC_ROUND_PREFETCH(6);
1267                 MUL_SET_VEC_ROUND_NOPREFETCH(7);
1268                 return ((uint32_t)cy);
1269         }
1270         while (len > 1) {
1271                 MUL_SET_VEC_ROUND_PREFETCH(0);
1272                 ++r;
1273                 ++a;
1274                 --len;
1275         }
1276         if (len > 0) {
1277                 MUL_SET_VEC_ROUND_NOPREFETCH(0);
1278         }
1279         return ((uint32_t)cy);
1280 }
1281
1282 /*
1283  * r += a * b
1284  * where r and a are vectors; b is a single 32-bit digit
1285  */
1286
1287 uint32_t
1288 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t b)
1289 {
1290         uint64_t d, pf, p, t, cy;
1291
1292         if (len == 0)
1293                 return (0);
1294         cy = 0;
1295         d = (uint64_t)b;
1296         pf = (uint64_t)a[0];
1297         while (len > 8) {
1298                 MUL_ADD_VEC_ROUND_PREFETCH(0);
1299                 MUL_ADD_VEC_ROUND_PREFETCH(1);
1300                 MUL_ADD_VEC_ROUND_PREFETCH(2);
1301                 MUL_ADD_VEC_ROUND_PREFETCH(3);
1302                 MUL_ADD_VEC_ROUND_PREFETCH(4);
1303                 MUL_ADD_VEC_ROUND_PREFETCH(5);
1304                 MUL_ADD_VEC_ROUND_PREFETCH(6);
1305                 MUL_ADD_VEC_ROUND_PREFETCH(7);
1306                 r += 8;
1307                 a += 8;
1308                 len -= 8;
1309         }
1310         if (len == 8) {
1311                 MUL_ADD_VEC_ROUND_PREFETCH(0);
1312                 MUL_ADD_VEC_ROUND_PREFETCH(1);
1313                 MUL_ADD_VEC_ROUND_PREFETCH(2);
1314                 MUL_ADD_VEC_ROUND_PREFETCH(3);
1315                 MUL_ADD_VEC_ROUND_PREFETCH(4);
1316                 MUL_ADD_VEC_ROUND_PREFETCH(5);
1317                 MUL_ADD_VEC_ROUND_PREFETCH(6);
1318                 MUL_ADD_VEC_ROUND_NOPREFETCH(7);
1319                 return ((uint32_t)cy);
1320         }
1321         while (len > 1) {
1322                 MUL_ADD_VEC_ROUND_PREFETCH(0);
1323                 ++r;
1324                 ++a;
1325                 --len;
1326         }
1327         if (len > 0) {
1328                 MUL_ADD_VEC_ROUND_NOPREFETCH(0);
1329         }
1330         return ((uint32_t)cy);
1331 }
1332 #endif /* UNROLL8 */
1333
1334 void
1335 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1336 {
1337         uint32_t        *tr, *ta;
1338         int             tlen, row, col;
1339         uint64_t        p, s, t, t2, cy;
1340         uint32_t        d;
1341
1342         tr = r + 1;
1343         ta = a;
1344         tlen = len - 1;
1345         tr[tlen] = big_mul_set_vec(tr, ta + 1, tlen, ta[0]);
1346         while (--tlen > 0) {
1347                 tr += 2;
1348                 ++ta;
1349                 tr[tlen] = big_mul_add_vec(tr, ta + 1, tlen, ta[0]);
1350         }
1351         s = (uint64_t)a[0];
1352         s = s * s;
1353         r[0] = (uint32_t)s;
1354         cy = s >> 32;
1355         p = ((uint64_t)r[1] << 1) + cy;
1356         r[1] = (uint32_t)p;
1357         cy = p >> 32;
1358         row = 1;
1359         col = 2;
1360         while (row < len) {
1361                 s = (uint64_t)a[row];
1362                 s = s * s;
1363                 p = (uint64_t)r[col] << 1;
1364                 t = p + s;
1365                 d = (uint32_t)t;
1366                 t2 = (uint64_t)d + cy;
1367                 r[col] = (uint32_t)t2;
1368                 cy = (t >> 32) + (t2 >> 32);
1369                 if (row == len - 1)
1370                         break;
1371                 p = ((uint64_t)r[col + 1] << 1) + cy;
1372                 r[col + 1] = (uint32_t)p;
1373                 cy = p >> 32;
1374                 ++row;
1375                 col += 2;
1376         }
1377         r[col + 1] = (uint32_t)cy;
1378 }
1379
1380 #else /* BIG_CHUNK_SIZE == 64 */
1381
1382 /*
1383  * r = r + a * digit, r and a are vectors of length len
1384  * returns the carry digit
1385  */
1386 BIG_CHUNK_TYPE
1387 big_mul_add_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1388     BIG_CHUNK_TYPE digit)
1389 {
1390         BIG_CHUNK_TYPE  cy, cy1, retcy, dlow, dhigh;
1391         int             i;
1392
1393         cy1 = 0;
1394         dlow = digit & BIG_CHUNK_LOWHALFBITS;
1395         dhigh = digit >> (BIG_CHUNK_SIZE / 2);
1396         for (i = 0; i < len; i++) {
1397                 cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1398                     dlow * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1399                     (r[i] & BIG_CHUNK_LOWHALFBITS);
1400                 cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1401                     dlow * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1402                     (r[i] >> (BIG_CHUNK_SIZE / 2));
1403                 r[i] = (cy & BIG_CHUNK_LOWHALFBITS) |
1404                     (cy1 << (BIG_CHUNK_SIZE / 2));
1405         }
1406         retcy = cy1 >> (BIG_CHUNK_SIZE / 2);
1407
1408         cy1 = r[0] & BIG_CHUNK_LOWHALFBITS;
1409         for (i = 0; i < len - 1; i++) {
1410                 cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1411                     dhigh * (a[i] & BIG_CHUNK_LOWHALFBITS) +
1412                     (r[i] >> (BIG_CHUNK_SIZE / 2));
1413                 r[i] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1414                     (cy << (BIG_CHUNK_SIZE / 2));
1415                 cy1 = (cy >> (BIG_CHUNK_SIZE / 2)) +
1416                     dhigh * (a[i] >> (BIG_CHUNK_SIZE / 2)) +
1417                     (r[i + 1] & BIG_CHUNK_LOWHALFBITS);
1418         }
1419         cy = (cy1 >> (BIG_CHUNK_SIZE / 2)) +
1420             dhigh * (a[len - 1] & BIG_CHUNK_LOWHALFBITS) +
1421             (r[len - 1] >> (BIG_CHUNK_SIZE / 2));
1422         r[len - 1] = (cy1 & BIG_CHUNK_LOWHALFBITS) |
1423             (cy << (BIG_CHUNK_SIZE / 2));
1424         retcy = (cy >> (BIG_CHUNK_SIZE / 2)) +
1425             dhigh * (a[len - 1] >> (BIG_CHUNK_SIZE / 2)) + retcy;
1426
1427         return (retcy);
1428 }
1429
1430
1431 /*
1432  * r = a * digit, r and a are vectors of length len
1433  * returns the carry digit
1434  */
1435 BIG_CHUNK_TYPE
1436 big_mul_set_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len,
1437     BIG_CHUNK_TYPE digit)
1438 {
1439         int     i;
1440
1441         ASSERT(r != a);
1442         for (i = 0; i < len; i++) {
1443                 r[i] = 0;
1444         }
1445         return (big_mul_add_vec(r, a, len, digit));
1446 }
1447
1448 void
1449 big_sqr_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int len)
1450 {
1451         int i;
1452
1453         ASSERT(r != a);
1454         r[len] = big_mul_set_vec(r, a, len, a[0]);
1455         for (i = 1; i < len; ++i)
1456                 r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1457 }
1458
1459 #endif /* BIG_CHUNK_SIZE == 32/64 */
1460
1461
1462 #else /* ! UMUL64 */
1463
1464 #if (BIG_CHUNK_SIZE != 32)
1465 #error "Don't use 64-bit chunks without defining UMUL64"
1466 #endif
1467
1468
1469 /*
1470  * r = r + a * digit, r and a are vectors of length len
1471  * returns the carry digit
1472  */
1473 uint32_t
1474 big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1475 {
1476         uint32_t cy, cy1, retcy, dlow, dhigh;
1477         int i;
1478
1479         cy1 = 0;
1480         dlow = digit & 0xffff;
1481         dhigh = digit >> 16;
1482         for (i = 0; i < len; i++) {
1483                 cy = (cy1 >> 16) + dlow * (a[i] & 0xffff) + (r[i] & 0xffff);
1484                 cy1 = (cy >> 16) + dlow * (a[i]>>16) + (r[i] >> 16);
1485                 r[i] = (cy & 0xffff) | (cy1 << 16);
1486         }
1487         retcy = cy1 >> 16;
1488
1489         cy1 = r[0] & 0xffff;
1490         for (i = 0; i < len - 1; i++) {
1491                 cy = (cy1 >> 16) + dhigh * (a[i] & 0xffff) + (r[i] >> 16);
1492                 r[i] = (cy1 & 0xffff) | (cy << 16);
1493                 cy1 = (cy >> 16) + dhigh * (a[i] >> 16) + (r[i + 1] & 0xffff);
1494         }
1495         cy = (cy1 >> 16) + dhigh * (a[len - 1] & 0xffff) + (r[len - 1] >> 16);
1496         r[len - 1] = (cy1 & 0xffff) | (cy << 16);
1497         retcy = (cy >> 16) + dhigh * (a[len - 1] >> 16) + retcy;
1498
1499         return (retcy);
1500 }
1501
1502
1503 /*
1504  * r = a * digit, r and a are vectors of length len
1505  * returns the carry digit
1506  */
1507 uint32_t
1508 big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1509 {
1510         int     i;
1511
1512         ASSERT(r != a);
1513         for (i = 0; i < len; i++) {
1514                 r[i] = 0;
1515         }
1516
1517         return (big_mul_add_vec(r, a, len, digit));
1518 }
1519
1520 void
1521 big_sqr_vec(uint32_t *r, uint32_t *a, int len)
1522 {
1523         int i;
1524
1525         ASSERT(r != a);
1526         r[len] = big_mul_set_vec(r, a, len, a[0]);
1527         for (i = 1; i < len; ++i)
1528                 r[len + i] = big_mul_add_vec(r + i, a, len, a[i]);
1529 }
1530
1531 #endif /* UMUL64 */
1532
1533 void
1534 big_mul_vec(BIG_CHUNK_TYPE *r, BIG_CHUNK_TYPE *a, int alen,
1535     BIG_CHUNK_TYPE *b, int blen)
1536 {
1537         int i;
1538
1539         r[alen] = big_mul_set_vec(r, a, alen, b[0]);
1540         for (i = 1; i < blen; ++i)
1541                 r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
1542 }
1543
1544
1545 #endif /* ! PSR_MUL */
1546
1547
1548 /*
1549  * result = aa * bb  result->value should be big enough to hold the result
1550  *
1551  * Implementation: Standard grammar school algorithm
1552  *
1553  */
1554 BIG_ERR_CODE
1555 big_mul(BIGNUM *result, BIGNUM *aa, BIGNUM *bb)
1556 {
1557         BIGNUM          tmp1;
1558         BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
1559         BIG_CHUNK_TYPE  *r, *t, *a, *b;
1560         BIG_ERR_CODE    err;
1561         int             i, alen, blen, rsize, sign, diff;
1562
1563         if (aa == bb) {
1564                 diff = 0;
1565         } else {
1566                 diff = big_cmp_abs(aa, bb);
1567                 if (diff < 0) {
1568                         BIGNUM *tt;
1569                         tt = aa;
1570                         aa = bb;
1571                         bb = tt;
1572                 }
1573         }
1574         a = aa->value;
1575         b = bb->value;
1576         alen = aa->len;
1577         blen = bb->len;
1578         while ((alen > 1) && (a[alen - 1] == 0)) {
1579                 alen--;
1580         }
1581         aa->len = alen;
1582         while ((blen > 1) && (b[blen - 1] == 0)) {
1583                 blen--;
1584         }
1585         bb->len = blen;
1586
1587         rsize = alen + blen;
1588         ASSERT(rsize > 0);
1589         if (result->size < rsize) {
1590                 err = big_extend(result, rsize);
1591                 if (err != BIG_OK) {
1592                         return (err);
1593                 }
1594                 /* aa or bb might be an alias to result */
1595                 a = aa->value;
1596                 b = bb->value;
1597         }
1598         r = result->value;
1599
1600         if (((alen == 1) && (a[0] == 0)) || ((blen == 1) && (b[0] == 0))) {
1601                 result->len = 1;
1602                 result->sign = 1;
1603                 r[0] = 0;
1604                 return (BIG_OK);
1605         }
1606         sign = aa->sign * bb->sign;
1607         if ((alen == 1) && (a[0] == 1)) {
1608                 for (i = 0; i < blen; i++) {
1609                         r[i] = b[i];
1610                 }
1611                 result->len = blen;
1612                 result->sign = sign;
1613                 return (BIG_OK);
1614         }
1615         if ((blen == 1) && (b[0] == 1)) {
1616                 for (i = 0; i < alen; i++) {
1617                         r[i] = a[i];
1618                 }
1619                 result->len = alen;
1620                 result->sign = sign;
1621                 return (BIG_OK);
1622         }
1623
1624         if ((err = big_init1(&tmp1, rsize,
1625             tmp1value, arraysize(tmp1value))) != BIG_OK) {
1626                 return (err);
1627         }
1628         t = tmp1.value;
1629
1630         for (i = 0; i < rsize; i++) {
1631                 t[i] = 0;
1632         }
1633
1634         if (diff == 0 && alen > 2) {
1635                 BIG_SQR_VEC(t, a, alen);
1636         } else if (blen > 0) {
1637                 BIG_MUL_VEC(t, a, alen, b, blen);
1638         }
1639
1640         if (t[rsize - 1] == 0) {
1641                 tmp1.len = rsize - 1;
1642         } else {
1643                 tmp1.len = rsize;
1644         }
1645
1646         err = big_copy(result, &tmp1);
1647
1648         result->sign = sign;
1649
1650         big_finish(&tmp1);
1651
1652         return (err);
1653 }
1654
1655
1656 /*
1657  * big_mont_mul()
1658  * Montgomery multiplication.
1659  *
1660  * Caller must ensure that  a < n,  b < n,  ret->size >=  2 * n->len + 1,
1661  * and that ret is not n.
1662  */
1663 BIG_ERR_CODE
1664 big_mont_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, BIGNUM *n, BIG_CHUNK_TYPE n0)
1665 {
1666         int             i, j, nlen, needsubtract;
1667         BIG_CHUNK_TYPE  *nn, *rr, *rrplusi;
1668         BIG_CHUNK_TYPE  digit, c;
1669         BIG_ERR_CODE    err;
1670 #ifdef  __amd64
1671 #define BIG_CPU_UNKNOWN 0
1672 #define BIG_CPU_AMD     1
1673 #define BIG_CPU_INTEL   2
1674         static int      big_cpu = BIG_CPU_UNKNOWN;
1675         BIG_CHUNK_TYPE  carry[BIGTMPSIZE];
1676
1677         if (big_cpu == BIG_CPU_UNKNOWN) {
1678                 big_cpu = 1 + bignum_on_intel();
1679         }
1680 #endif  /* __amd64 */
1681
1682         nlen = n->len;
1683         nn = n->value;
1684
1685         rr = ret->value;
1686
1687         if ((err = big_mul(ret, a, b)) != BIG_OK) {
1688                 return (err);
1689         }
1690
1691         rr = ret->value;
1692         for (i = ret->len; i < 2 * nlen + 1; i++) {
1693                 rr[i] = 0;
1694         }
1695
1696 #ifdef  __amd64 /* pipelining optimization for Intel 64, but not AMD64 */
1697         if ((big_cpu == BIG_CPU_INTEL) && (nlen <= BIGTMPSIZE)) {
1698                 /*
1699                  * Perform the following in two for loops to reduce the
1700                  * dependency between computing the carryover bits with
1701                  * BIG_MUL_ADD_VEC() and adding them, thus improving pipelining.
1702                  */
1703                 for (i = 0; i < nlen; i++) {
1704                         rrplusi = rr + i;
1705                         digit = *rrplusi * n0;
1706                         carry[i] = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1707                 }
1708                 for (i = 0; i < nlen; i++) {
1709                         j = i + nlen;
1710                         rr[j] += carry[i];
1711                         while (rr[j] < carry[i]) {
1712                                 rr[++j] += 1;
1713                                 carry[i] = 1;
1714                         }
1715                 }
1716         } else
1717 #endif  /* __amd64 */
1718         { /* no pipelining optimization */
1719                 for (i = 0; i < nlen; i++) {
1720                         rrplusi = rr + i;
1721                         digit = *rrplusi * n0;
1722                         c = BIG_MUL_ADD_VEC(rrplusi, nn, nlen, digit);
1723                         j = i + nlen;
1724                         rr[j] += c;
1725                         while (rr[j] < c) {
1726                                 rr[++j] += 1;
1727                                 c = 1;
1728                         }
1729                 }
1730         }
1731
1732         needsubtract = 0;
1733         if ((rr[2 * nlen]  != 0))
1734                 needsubtract = 1;
1735         else {
1736                 for (i = 2 * nlen - 1; i >= nlen; i--) {
1737                         if (rr[i] > nn[i - nlen]) {
1738                                 needsubtract = 1;
1739                                 break;
1740                         } else if (rr[i] < nn[i - nlen]) {
1741                                 break;
1742                         }
1743                 }
1744         }
1745         if (needsubtract)
1746                 big_sub_vec(rr, rr + nlen, nn, nlen);
1747         else {
1748                 for (i = 0; i < nlen; i++) {
1749                         rr[i] = rr[i + nlen];
1750                 }
1751         }
1752
1753         /* Remove leading zeros, but keep at least 1 digit: */
1754         for (i = nlen - 1; (i > 0) && (rr[i] == 0); i--)
1755                 ;
1756         ret->len = i + 1;
1757
1758         return (BIG_OK);
1759 }
1760
1761
1762 BIG_CHUNK_TYPE
1763 big_n0(BIG_CHUNK_TYPE n)
1764 {
1765         int             i;
1766         BIG_CHUNK_TYPE  result, tmp;
1767
1768         result = 0;
1769         tmp = BIG_CHUNK_ALLBITS;
1770         for (i = 0; i < BIG_CHUNK_SIZE; i++) {
1771                 if ((tmp & 1) == 1) {
1772                         result = (result >> 1) | BIG_CHUNK_HIGHBIT;
1773                         tmp = tmp - n;
1774                 } else {
1775                         result = (result >> 1);
1776                 }
1777                 tmp = tmp >> 1;
1778         }
1779
1780         return (result);
1781 }
1782
1783
1784 int
1785 big_numbits(BIGNUM *n)
1786 {
1787         int             i, j;
1788         BIG_CHUNK_TYPE  t;
1789
1790         for (i = n->len - 1; i > 0; i--) {
1791                 if (n->value[i] != 0) {
1792                         break;
1793                 }
1794         }
1795         t = n->value[i];
1796         for (j = BIG_CHUNK_SIZE; j > 0; j--) {
1797                 if ((t & BIG_CHUNK_HIGHBIT) == 0) {
1798                         t = t << 1;
1799                 } else {
1800                         return (BIG_CHUNK_SIZE * i + j);
1801                 }
1802         }
1803         return (0);
1804 }
1805
1806
1807 /* caller must make sure that a < n */
1808 BIG_ERR_CODE
1809 big_mont_rr(BIGNUM *result, BIGNUM *n)
1810 {
1811         BIGNUM          rr;
1812         BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
1813         int             len, i;
1814         BIG_ERR_CODE    err;
1815
1816         rr.malloced = 0;
1817         len = n->len;
1818
1819         if ((err = big_init1(&rr, 2 * len + 1,
1820             rrvalue, arraysize(rrvalue))) != BIG_OK) {
1821                 return (err);
1822         }
1823
1824         for (i = 0; i < 2 * len; i++) {
1825                 rr.value[i] = 0;
1826         }
1827         rr.value[2 * len] = 1;
1828         rr.len = 2 * len + 1;
1829         if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1830                 goto ret;
1831         }
1832         err = big_copy(result, &rr);
1833 ret:
1834         big_finish(&rr);
1835         return (err);
1836 }
1837
1838
1839 /* caller must make sure that a < n */
1840 BIG_ERR_CODE
1841 big_mont_conv(BIGNUM *result, BIGNUM *a, BIGNUM *n, BIG_CHUNK_TYPE n0,
1842     BIGNUM *n_rr)
1843 {
1844         BIGNUM          rr;
1845         BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
1846         int             len, i;
1847         BIG_ERR_CODE    err;
1848
1849         rr.malloced = 0;
1850         len = n->len;
1851
1852         if ((err = big_init1(&rr, 2 * len + 1, rrvalue, arraysize(rrvalue)))
1853             != BIG_OK) {
1854                 return (err);
1855         }
1856
1857         if (n_rr == NULL) {
1858                 for (i = 0; i < 2 * len; i++) {
1859                         rr.value[i] = 0;
1860                 }
1861                 rr.value[2 * len] = 1;
1862                 rr.len = 2 * len + 1;
1863                 if ((err = big_div_pos(NULL, &rr, &rr, n)) != BIG_OK) {
1864                         goto ret;
1865                 }
1866                 n_rr = &rr;
1867         }
1868
1869         if ((err = big_mont_mul(&rr, n_rr, a, n, n0)) != BIG_OK) {
1870                 goto ret;
1871         }
1872         err = big_copy(result, &rr);
1873
1874 ret:
1875         big_finish(&rr);
1876         return (err);
1877 }
1878
1879
1880 #ifdef  USE_FLOATING_POINT
1881 #define big_modexp_ncp_float    big_modexp_ncp_sw
1882 #else
1883 #define big_modexp_ncp_int      big_modexp_ncp_sw
1884 #endif
1885
1886 #define MAX_EXP_BIT_GROUP_SIZE 6
1887 #define APOWERS_MAX_SIZE (1 << (MAX_EXP_BIT_GROUP_SIZE - 1))
1888
1889 /* ARGSUSED */
1890 static BIG_ERR_CODE
1891 big_modexp_ncp_int(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
1892     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
1893
1894 {
1895         BIGNUM          apowers[APOWERS_MAX_SIZE];
1896         BIGNUM          tmp1;
1897         BIG_CHUNK_TYPE  tmp1value[BIGTMPSIZE];
1898         int             i, j, k, l, m, p;
1899         uint32_t        bit, bitind, bitcount, groupbits, apowerssize;
1900         uint32_t        nbits;
1901         BIG_ERR_CODE    err;
1902
1903         nbits = big_numbits(e);
1904         if (nbits < 50) {
1905                 groupbits = 1;
1906                 apowerssize = 1;
1907         } else {
1908                 groupbits = MAX_EXP_BIT_GROUP_SIZE;
1909                 apowerssize = 1 << (groupbits - 1);
1910         }
1911
1912
1913         if ((err = big_init1(&tmp1, 2 * n->len + 1,
1914             tmp1value, arraysize(tmp1value))) != BIG_OK) {
1915                 return (err);
1916         }
1917
1918         /* clear the malloced bit to help cleanup */
1919         for (i = 0; i < apowerssize; i++) {
1920                 apowers[i].malloced = 0;
1921         }
1922
1923         for (i = 0; i < apowerssize; i++) {
1924                 if ((err = big_init1(&(apowers[i]), n->len, NULL, 0)) !=
1925                     BIG_OK) {
1926                         goto ret;
1927                 }
1928         }
1929
1930         (void) big_copy(&(apowers[0]), ma);
1931
1932         if ((err = big_mont_mul(&tmp1, ma, ma, n, n0)) != BIG_OK) {
1933                 goto ret;
1934         }
1935         (void) big_copy(ma, &tmp1);
1936
1937         for (i = 1; i < apowerssize; i++) {
1938                 if ((err = big_mont_mul(&tmp1, ma,
1939                     &(apowers[i - 1]), n, n0)) != BIG_OK) {
1940                         goto ret;
1941                 }
1942                 (void) big_copy(&apowers[i], &tmp1);
1943         }
1944
1945         bitind = nbits % BIG_CHUNK_SIZE;
1946         k = 0;
1947         l = 0;
1948         p = 0;
1949         bitcount = 0;
1950         for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
1951                 for (j = bitind - 1; j >= 0; j--) {
1952                         bit = (e->value[i] >> j) & 1;
1953                         if ((bitcount == 0) && (bit == 0)) {
1954                                 if ((err = big_mont_mul(tmp,
1955                                     tmp, tmp, n, n0)) != BIG_OK) {
1956                                         goto ret;
1957                                 }
1958                         } else {
1959                                 bitcount++;
1960                                 p = p * 2 + bit;
1961                                 if (bit == 1) {
1962                                         k = k + l + 1;
1963                                         l = 0;
1964                                 } else {
1965                                         l++;
1966                                 }
1967                                 if (bitcount == groupbits) {
1968                                         for (m = 0; m < k; m++) {
1969                                                 if ((err = big_mont_mul(tmp,
1970                                                     tmp, tmp, n, n0)) !=
1971                                                     BIG_OK) {
1972                                                         goto ret;
1973                                                 }
1974                                         }
1975                                         if ((err = big_mont_mul(tmp, tmp,
1976                                             &(apowers[p >> (l + 1)]),
1977                                             n, n0)) != BIG_OK) {
1978                                                 goto ret;
1979                                         }
1980                                         for (m = 0; m < l; m++) {
1981                                                 if ((err = big_mont_mul(tmp,
1982                                                     tmp, tmp, n, n0)) !=
1983                                                     BIG_OK) {
1984                                                         goto ret;
1985                                                 }
1986                                         }
1987                                         k = 0;
1988                                         l = 0;
1989                                         p = 0;
1990                                         bitcount = 0;
1991                                 }
1992                         }
1993                 }
1994                 bitind = BIG_CHUNK_SIZE;
1995         }
1996
1997         for (m = 0; m < k; m++) {
1998                 if ((err = big_mont_mul(tmp, tmp, tmp, n, n0)) != BIG_OK) {
1999                         goto ret;
2000                 }
2001         }
2002         if (p != 0) {
2003                 if ((err = big_mont_mul(tmp, tmp,
2004                     &(apowers[p >> (l + 1)]), n, n0)) != BIG_OK) {
2005                         goto ret;
2006                 }
2007         }
2008         for (m = 0; m < l; m++) {
2009                 if ((err = big_mont_mul(result, tmp, tmp, n, n0)) != BIG_OK) {
2010                         goto ret;
2011                 }
2012         }
2013
2014 ret:
2015         for (i = apowerssize - 1; i >= 0; i--) {
2016                 big_finish(&(apowers[i]));
2017         }
2018         big_finish(&tmp1);
2019
2020         return (err);
2021 }
2022
2023
2024 #ifdef USE_FLOATING_POINT
2025
2026 #ifdef _KERNEL
2027
2028 #include <sys/sysmacros.h>
2029 #include <sys/regset.h>
2030 #include <sys/fpu/fpusystm.h>
2031
2032 /* the alignment for block stores to save fp registers */
2033 #define FPR_ALIGN       (64)
2034
2035 extern void big_savefp(kfpu_t *);
2036 extern void big_restorefp(kfpu_t *);
2037
2038 #endif /* _KERNEL */
2039
2040 /*
2041  * This version makes use of floating point for performance
2042  */
2043 static BIG_ERR_CODE
2044 big_modexp_ncp_float(BIGNUM *result, BIGNUM *ma, BIGNUM *e, BIGNUM *n,
2045     BIGNUM *tmp, BIG_CHUNK_TYPE n0)
2046 {
2047
2048         int             i, j, k, l, m, p;
2049         uint32_t        bit, bitind, bitcount, nlen;
2050         double          dn0;
2051         double          *dn, *dt, *d16r, *d32r;
2052         uint32_t        *nint, *prod;
2053         double          *apowers[APOWERS_MAX_SIZE];
2054         uint32_t        nbits, groupbits, apowerssize;
2055         BIG_ERR_CODE    err = BIG_OK;
2056
2057 #ifdef _KERNEL
2058         uint8_t fpua[sizeof (kfpu_t) + FPR_ALIGN];
2059         kfpu_t *fpu;
2060
2061 #ifdef DEBUG
2062         if (!fpu_exists)
2063                 return (BIG_GENERAL_ERR);
2064 #endif
2065
2066         fpu =  (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, FPR_ALIGN);
2067         big_savefp(fpu);
2068
2069 #endif /* _KERNEL */
2070
2071         nbits = big_numbits(e);
2072         if (nbits < 50) {
2073                 groupbits = 1;
2074                 apowerssize = 1;
2075         } else {
2076                 groupbits = MAX_EXP_BIT_GROUP_SIZE;
2077                 apowerssize = 1 << (groupbits - 1);
2078         }
2079
2080         nlen = (BIG_CHUNK_SIZE / 32) * n->len;
2081         dn0 = (double)(n0 & 0xffff);
2082
2083         dn = dt = d16r = d32r = NULL;
2084         nint = prod = NULL;
2085         for (i = 0; i < apowerssize; i++) {
2086                 apowers[i] = NULL;
2087         }
2088
2089         if ((dn = big_malloc(nlen * sizeof (double))) == NULL) {
2090                 err = BIG_NO_MEM;
2091                 goto ret;
2092         }
2093         if ((dt = big_malloc((4 * nlen + 2) * sizeof (double))) == NULL) {
2094                 err = BIG_NO_MEM;
2095                 goto ret;
2096         }
2097         if ((nint = big_malloc(nlen * sizeof (uint32_t))) == NULL) {
2098                 err = BIG_NO_MEM;
2099                 goto ret;
2100         }
2101         if ((prod = big_malloc((nlen + 1) * sizeof (uint32_t))) == NULL) {
2102                 err = BIG_NO_MEM;
2103                 goto ret;
2104         }
2105         if ((d16r = big_malloc((2 * nlen + 1) * sizeof (double))) == NULL) {
2106                 err = BIG_NO_MEM;
2107                 goto ret;
2108         }
2109         if ((d32r = big_malloc(nlen * sizeof (double))) == NULL) {
2110                 err = BIG_NO_MEM;
2111                 goto ret;
2112         }
2113         for (i = 0; i < apowerssize; i++) {
2114                 if ((apowers[i] = big_malloc((2 * nlen + 1) *
2115                     sizeof (double))) == NULL) {
2116                         err = BIG_NO_MEM;
2117                         goto ret;
2118                 }
2119         }
2120
2121 #if (BIG_CHUNK_SIZE == 32)
2122         for (i = 0; i < ma->len; i++) {
2123                 nint[i] = ma->value[i];
2124         }
2125         for (; i < nlen; i++) {
2126                 nint[i] = 0;
2127         }
2128 #else
2129         for (i = 0; i < ma->len; i++) {
2130                 nint[2 * i] = (uint32_t)(ma->value[i] & 0xffffffffULL);
2131                 nint[2 * i + 1] = (uint32_t)(ma->value[i] >> 32);
2132         }
2133         for (i = ma->len * 2; i < nlen; i++) {
2134                 nint[i] = 0;
2135         }
2136 #endif
2137         conv_i32_to_d32_and_d16(d32r, apowers[0], nint, nlen);
2138
2139 #if (BIG_CHUNK_SIZE == 32)
2140         for (i = 0; i < n->len; i++) {
2141                 nint[i] = n->value[i];
2142         }
2143         for (; i < nlen; i++) {
2144                 nint[i] = 0;
2145         }
2146 #else
2147         for (i = 0; i < n->len; i++) {
2148                 nint[2 * i] = (uint32_t)(n->value[i] & 0xffffffffULL);
2149                 nint[2 * i + 1] = (uint32_t)(n->value[i] >> 32);
2150         }
2151         for (i = n->len * 2; i < nlen; i++) {
2152                 nint[i] = 0;
2153         }
2154 #endif
2155         conv_i32_to_d32(dn, nint, nlen);
2156
2157         mont_mulf_noconv(prod, d32r, apowers[0], dt, dn, nint, nlen, dn0);
2158         conv_i32_to_d32(d32r, prod, nlen);
2159         for (i = 1; i < apowerssize; i++) {
2160                 mont_mulf_noconv(prod, d32r, apowers[i - 1],
2161                     dt, dn, nint, nlen, dn0);
2162                 conv_i32_to_d16(apowers[i], prod, nlen);
2163         }
2164
2165 #if (BIG_CHUNK_SIZE == 32)
2166         for (i = 0; i < tmp->len; i++) {
2167                 prod[i] = tmp->value[i];
2168         }
2169         for (; i < nlen + 1; i++) {
2170                 prod[i] = 0;
2171         }
2172 #else
2173         for (i = 0; i < tmp->len; i++) {
2174                 prod[2 * i] = (uint32_t)(tmp->value[i] & 0xffffffffULL);
2175                 prod[2 * i + 1] = (uint32_t)(tmp->value[i] >> 32);
2176         }
2177         for (i = tmp->len * 2; i < nlen + 1; i++) {
2178                 prod[i] = 0;
2179         }
2180 #endif
2181
2182         bitind = nbits % BIG_CHUNK_SIZE;
2183         k = 0;
2184         l = 0;
2185         p = 0;
2186         bitcount = 0;
2187         for (i = nbits / BIG_CHUNK_SIZE; i >= 0; i--) {
2188                 for (j = bitind - 1; j >= 0; j--) {
2189                         bit = (e->value[i] >> j) & 1;
2190                         if ((bitcount == 0) && (bit == 0)) {
2191                                 conv_i32_to_d32_and_d16(d32r, d16r,
2192                                     prod, nlen);
2193                                 mont_mulf_noconv(prod, d32r, d16r,
2194                                     dt, dn, nint, nlen, dn0);
2195                         } else {
2196                                 bitcount++;
2197                                 p = p * 2 + bit;
2198                                 if (bit == 1) {
2199                                         k = k + l + 1;
2200                                         l = 0;
2201                                 } else {
2202                                         l++;
2203                                 }
2204                                 if (bitcount == groupbits) {
2205                                         for (m = 0; m < k; m++) {
2206                                                 conv_i32_to_d32_and_d16(d32r,
2207                                                     d16r, prod, nlen);
2208                                                 mont_mulf_noconv(prod, d32r,
2209                                                     d16r, dt, dn, nint,
2210                                                     nlen, dn0);
2211                                         }
2212                                         conv_i32_to_d32(d32r, prod, nlen);
2213                                         mont_mulf_noconv(prod, d32r,
2214                                             apowers[p >> (l + 1)],
2215                                             dt, dn, nint, nlen, dn0);
2216                                         for (m = 0; m < l; m++) {
2217                                                 conv_i32_to_d32_and_d16(d32r,
2218                                                     d16r, prod, nlen);
2219                                                 mont_mulf_noconv(prod, d32r,
2220                                                     d16r, dt, dn, nint,
2221                                                     nlen, dn0);
2222                                         }
2223                                         k = 0;
2224                                         l = 0;
2225                                         p = 0;
2226                                         bitcount = 0;
2227                                 }
2228                         }
2229                 }
2230                 bitind = BIG_CHUNK_SIZE;
2231         }
2232
2233         for (m = 0; m < k; m++) {
2234                 conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2235                 mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2236         }
2237         if (p != 0) {
2238                 conv_i32_to_d32(d32r, prod, nlen);
2239                 mont_mulf_noconv(prod, d32r, apowers[p >> (l + 1)],
2240                     dt, dn, nint, nlen, dn0);
2241         }
2242         for (m = 0; m < l; m++) {
2243                 conv_i32_to_d32_and_d16(d32r, d16r, prod, nlen);
2244                 mont_mulf_noconv(prod, d32r, d16r, dt, dn, nint, nlen, dn0);
2245         }
2246
2247 #if (BIG_CHUNK_SIZE == 32)
2248         for (i = 0; i < nlen; i++) {
2249                 result->value[i] = prod[i];
2250         }
2251         for (i = nlen - 1; (i > 0) && (prod[i] == 0); i--)
2252                 ;
2253 #else
2254         for (i = 0; i < nlen / 2; i++) {
2255                 result->value[i] = (uint64_t)(prod[2 * i]) +
2256                     (((uint64_t)(prod[2 * i + 1])) << 32);
2257         }
2258         for (i = nlen / 2 - 1; (i > 0) && (result->value[i] == 0); i--)
2259                 ;
2260 #endif
2261         result->len = i + 1;
2262
2263 ret:
2264         for (i = apowerssize - 1; i >= 0; i--) {
2265                 if (apowers[i] != NULL)
2266                         big_free(apowers[i], (2 * nlen + 1) * sizeof (double));
2267         }
2268         if (d32r != NULL) {
2269                 big_free(d32r, nlen * sizeof (double));
2270         }
2271         if (d16r != NULL) {
2272                 big_free(d16r, (2 * nlen + 1) * sizeof (double));
2273         }
2274         if (prod != NULL) {
2275                 big_free(prod, (nlen + 1) * sizeof (uint32_t));
2276         }
2277         if (nint != NULL) {
2278                 big_free(nint, nlen * sizeof (uint32_t));
2279         }
2280         if (dt != NULL) {
2281                 big_free(dt, (4 * nlen + 2) * sizeof (double));
2282         }
2283         if (dn != NULL) {
2284                 big_free(dn, nlen * sizeof (double));
2285         }
2286
2287 #ifdef _KERNEL
2288         big_restorefp(fpu);
2289 #endif
2290
2291         return (err);
2292 }
2293
2294 #endif /* USE_FLOATING_POINT */
2295
2296
2297 BIG_ERR_CODE
2298 big_modexp_ext(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr,
2299     big_modexp_ncp_info_t *info)
2300 {
2301         BIGNUM          ma, tmp, rr;
2302         BIG_CHUNK_TYPE  mavalue[BIGTMPSIZE];
2303         BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
2304         BIG_CHUNK_TYPE  rrvalue[BIGTMPSIZE];
2305         BIG_ERR_CODE    err;
2306         BIG_CHUNK_TYPE  n0;
2307
2308         if ((err = big_init1(&ma, n->len, mavalue, arraysize(mavalue))) !=
2309             BIG_OK) {
2310                 return (err);
2311         }
2312         ma.len = 1;
2313         ma.value[0] = 0;
2314
2315         if ((err = big_init1(&tmp, 2 * n->len + 1,
2316             tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2317                 goto ret1;
2318         }
2319
2320         /* clear the malloced bit to help cleanup */
2321         rr.malloced = 0;
2322
2323         if (n_rr == NULL) {
2324                 if ((err = big_init1(&rr, 2 * n->len + 1,
2325                     rrvalue, arraysize(rrvalue))) != BIG_OK) {
2326                         goto ret2;
2327                 }
2328                 if (big_mont_rr(&rr, n) != BIG_OK) {
2329                         goto ret;
2330                 }
2331                 n_rr = &rr;
2332         }
2333
2334         n0 = big_n0(n->value[0]);
2335
2336         if (big_cmp_abs(a, n) > 0) {
2337                 if ((err = big_div_pos(NULL, &ma, a, n)) != BIG_OK) {
2338                         goto ret;
2339                 }
2340                 err = big_mont_conv(&ma, &ma, n, n0, n_rr);
2341         } else {
2342                 err = big_mont_conv(&ma, a, n, n0, n_rr);
2343         }
2344         if (err != BIG_OK) {
2345                 goto ret;
2346         }
2347
2348         tmp.len = 1;
2349         tmp.value[0] = 1;
2350         if ((err = big_mont_conv(&tmp, &tmp, n, n0, n_rr)) != BIG_OK) {
2351                 goto ret;
2352         }
2353
2354         if ((info != NULL) && (info->func != NULL)) {
2355                 err = (*(info->func))(&tmp, &ma, e, n, &tmp, n0,
2356                     info->ncp, info->reqp);
2357         } else {
2358                 err = big_modexp_ncp_sw(&tmp, &ma, e, n, &tmp, n0);
2359         }
2360         if (err != BIG_OK) {
2361                 goto ret;
2362         }
2363
2364         ma.value[0] = 1;
2365         ma.len = 1;
2366         if ((err = big_mont_mul(&tmp, &tmp, &ma, n, n0)) != BIG_OK) {
2367                 goto ret;
2368         }
2369         err = big_copy(result, &tmp);
2370
2371 ret:
2372         if (rr.malloced) {
2373                 big_finish(&rr);
2374         }
2375 ret2:
2376         big_finish(&tmp);
2377 ret1:
2378         big_finish(&ma);
2379
2380         return (err);
2381 }
2382
2383 BIG_ERR_CODE
2384 big_modexp(BIGNUM *result, BIGNUM *a, BIGNUM *e, BIGNUM *n, BIGNUM *n_rr)
2385 {
2386         return (big_modexp_ext(result, a, e, n, n_rr, NULL));
2387 }
2388
2389
2390 BIG_ERR_CODE
2391 big_modexp_crt_ext(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2392     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2393     BIGNUM *p_rr, BIGNUM *q_rr, big_modexp_ncp_info_t *info)
2394 {
2395         BIGNUM          ap, aq, tmp;
2396         int             alen, biglen, sign;
2397         BIG_ERR_CODE    err;
2398
2399         if (p->len > q->len) {
2400                 biglen = p->len;
2401         } else {
2402                 biglen = q->len;
2403         }
2404
2405         if ((err = big_init1(&ap, p->len, NULL, 0)) != BIG_OK) {
2406                 return (err);
2407         }
2408         if ((err = big_init1(&aq, q->len, NULL, 0)) != BIG_OK) {
2409                 goto ret1;
2410         }
2411         if ((err = big_init1(&tmp, biglen + q->len + 1, NULL, 0)) != BIG_OK) {
2412                 goto ret2;
2413         }
2414
2415         /*
2416          * check whether a is too short - to avoid timing attacks
2417          */
2418         alen = a->len;
2419         while ((alen > p->len) && (a->value[alen - 1] == 0)) {
2420                 alen--;
2421         }
2422         if (alen < p->len + q->len) {
2423                 /*
2424                  * a is too short, add p*q to it before
2425                  * taking it modulo p and q
2426                  * this will also affect timing, but this difference
2427                  * does not depend on p or q, only on a
2428                  * (in "normal" operation, this path will never be
2429                  * taken, so it is not a performance penalty
2430                  */
2431                 if ((err = big_mul(&tmp, p, q)) != BIG_OK) {
2432                         goto ret;
2433                 }
2434                 if ((err = big_add(&tmp, &tmp, a)) != BIG_OK) {
2435                         goto ret;
2436                 }
2437                 if ((err = big_div_pos(NULL, &ap, &tmp, p)) != BIG_OK) {
2438                         goto ret;
2439                 }
2440                 if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2441                         goto ret;
2442                 }
2443         } else {
2444                 if ((err = big_div_pos(NULL, &ap, a, p)) != BIG_OK) {
2445                         goto ret;
2446                 }
2447                 if ((err = big_div_pos(NULL, &aq, a, q)) != BIG_OK) {
2448                         goto ret;
2449                 }
2450         }
2451
2452         if ((err = big_modexp_ext(&ap, &ap, dmodpminus1, p, p_rr, info)) !=
2453             BIG_OK) {
2454                 goto ret;
2455         }
2456         if ((err = big_modexp_ext(&aq, &aq, dmodqminus1, q, q_rr, info)) !=
2457             BIG_OK) {
2458                 goto ret;
2459         }
2460         if ((err = big_sub(&tmp, &aq, &ap)) != BIG_OK) {
2461                 goto ret;
2462         }
2463         if ((err = big_mul(&tmp, &tmp, pinvmodq)) != BIG_OK) {
2464                 goto ret;
2465         }
2466         sign = tmp.sign;
2467         tmp.sign = 1;
2468         if ((err = big_div_pos(NULL, &aq, &tmp, q)) != BIG_OK) {
2469                 goto ret;
2470         }
2471         if ((sign == -1) && (!big_is_zero(&aq))) {
2472                 (void) big_sub_pos(&aq, q, &aq);
2473         }
2474         if ((err = big_mul(&tmp, &aq, p)) != BIG_OK) {
2475                 goto ret;
2476         }
2477         err = big_add_abs(result, &ap, &tmp);
2478
2479 ret:
2480         big_finish(&tmp);
2481 ret2:
2482         big_finish(&aq);
2483 ret1:
2484         big_finish(&ap);
2485
2486         return (err);
2487 }
2488
2489
2490 BIG_ERR_CODE
2491 big_modexp_crt(BIGNUM *result, BIGNUM *a, BIGNUM *dmodpminus1,
2492     BIGNUM *dmodqminus1, BIGNUM *p, BIGNUM *q, BIGNUM *pinvmodq,
2493     BIGNUM *p_rr, BIGNUM *q_rr)
2494 {
2495         return (big_modexp_crt_ext(result, a, dmodpminus1, dmodqminus1,
2496             p, q, pinvmodq, p_rr, q_rr, NULL));
2497 }
2498
2499
2500 static BIG_CHUNK_TYPE onearr[1] = {(BIG_CHUNK_TYPE)1};
2501 #if     !defined(NO_BIG_ONE)
2502 BIGNUM big_One = {1, 1, 1, 0, onearr};
2503 #endif
2504
2505 static BIG_CHUNK_TYPE twoarr[1] = {(BIG_CHUNK_TYPE)2};
2506 #if     !defined(NO_BIG_TWO)
2507 BIGNUM big_Two = {1, 1, 1, 0, twoarr};
2508 #endif
2509
2510 static BIG_CHUNK_TYPE fourarr[1] = {(BIG_CHUNK_TYPE)4};
2511 static BIGNUM big_Four = {1, 1, 1, 0, fourarr};
2512
2513
2514 BIG_ERR_CODE
2515 big_sqrt_pos(BIGNUM *result, BIGNUM *n)
2516 {
2517         BIGNUM          *high, *low, *mid, *t;
2518         BIGNUM          t1, t2, t3, prod;
2519         BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
2520         BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
2521         BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
2522         BIG_CHUNK_TYPE  prodvalue[BIGTMPSIZE];
2523         int             i, diff;
2524         uint32_t        nbits, nrootbits, highbits;
2525         BIG_ERR_CODE    err;
2526
2527         nbits = big_numbits(n);
2528
2529         if ((err = big_init1(&t1, n->len + 1,
2530             t1value, arraysize(t1value))) != BIG_OK)
2531                 return (err);
2532         if ((err = big_init1(&t2, n->len + 1,
2533             t2value, arraysize(t2value))) != BIG_OK)
2534                 goto ret1;
2535         if ((err = big_init1(&t3, n->len + 1,
2536             t3value, arraysize(t3value))) != BIG_OK)
2537                 goto ret2;
2538         if ((err = big_init1(&prod, n->len + 1,
2539             prodvalue, arraysize(prodvalue))) != BIG_OK)
2540                 goto ret3;
2541
2542         nrootbits = (nbits + 1) / 2;
2543         t1.len = t2.len = t3.len = (nrootbits - 1) / BIG_CHUNK_SIZE + 1;
2544         for (i = 0; i < t1.len; i++) {
2545                 t1.value[i] = 0;
2546                 t2.value[i] = BIG_CHUNK_ALLBITS;
2547         }
2548         highbits = nrootbits - BIG_CHUNK_SIZE * (t1.len - 1);
2549         if (highbits == BIG_CHUNK_SIZE) {
2550                 t1.value[t1.len - 1] = BIG_CHUNK_HIGHBIT;
2551                 t2.value[t2.len - 1] = BIG_CHUNK_ALLBITS;
2552         } else {
2553                 t1.value[t1.len - 1] = (BIG_CHUNK_TYPE)1 << (highbits - 1);
2554                 t2.value[t2.len - 1] = 2 * t1.value[t1.len - 1] - 1;
2555         }
2556
2557         high = &t2;
2558         low = &t1;
2559         mid = &t3;
2560
2561         if ((err = big_mul(&prod, high, high)) != BIG_OK) {
2562                 goto ret;
2563         }
2564         diff = big_cmp_abs(&prod, n);
2565         if (diff <= 0) {
2566                 err = big_copy(result, high);
2567                 goto ret;
2568         }
2569
2570         (void) big_sub_pos(mid, high, low);
2571         while (big_cmp_abs(&big_One, mid) != 0) {
2572                 (void) big_add_abs(mid, high, low);
2573                 (void) big_half_pos(mid, mid);
2574                 if ((err = big_mul(&prod, mid, mid)) != BIG_OK)
2575                         goto ret;
2576                 diff = big_cmp_abs(&prod, n);
2577                 if (diff > 0) {
2578                         t = high;
2579                         high = mid;
2580                         mid = t;
2581                 } else if (diff < 0) {
2582                         t = low;
2583                         low = mid;
2584                         mid = t;
2585                 } else {
2586                         err = big_copy(result, low);
2587                         goto ret;
2588                 }
2589                 (void) big_sub_pos(mid, high, low);
2590         }
2591
2592         err = big_copy(result, low);
2593 ret:
2594         if (prod.malloced) big_finish(&prod);
2595 ret3:
2596         if (t3.malloced) big_finish(&t3);
2597 ret2:
2598         if (t2.malloced) big_finish(&t2);
2599 ret1:
2600         if (t1.malloced) big_finish(&t1);
2601
2602         return (err);
2603 }
2604
2605
2606 BIG_ERR_CODE
2607 big_Jacobi_pos(int *jac, BIGNUM *nn, BIGNUM *mm)
2608 {
2609         BIGNUM          *t, *tmp2, *m, *n;
2610         BIGNUM          t1, t2, t3;
2611         BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
2612         BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
2613         BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
2614         int             len, err;
2615
2616         if (big_is_zero(nn) ||
2617             (((nn->value[0] & 1) | (mm->value[0] & 1)) == 0)) {
2618                 *jac = 0;
2619                 return (BIG_OK);
2620         }
2621
2622         if (nn->len > mm->len) {
2623                 len = nn->len;
2624         } else {
2625                 len = mm->len;
2626         }
2627
2628         if ((err = big_init1(&t1, len,
2629             t1value, arraysize(t1value))) != BIG_OK) {
2630                 return (err);
2631         }
2632         if ((err = big_init1(&t2, len,
2633             t2value, arraysize(t2value))) != BIG_OK) {
2634                 goto ret1;
2635         }
2636         if ((err = big_init1(&t3, len,
2637             t3value, arraysize(t3value))) != BIG_OK) {
2638                 goto ret2;
2639         }
2640
2641         n = &t1;
2642         m = &t2;
2643         tmp2 = &t3;
2644
2645         (void) big_copy(n, nn);
2646         (void) big_copy(m, mm);
2647
2648         *jac = 1;
2649         while (big_cmp_abs(&big_One, m) != 0) {
2650                 if (big_is_zero(n)) {
2651                         *jac = 0;
2652                         goto ret;
2653                 }
2654                 if ((m->value[0] & 1) == 0) {
2655                         if (((n->value[0] & 7) == 3) ||
2656                             ((n->value[0] & 7) == 5))
2657                                 *jac = -*jac;
2658                         (void) big_half_pos(m, m);
2659                 } else if ((n->value[0] & 1) == 0) {
2660                         if (((m->value[0] & 7) == 3) ||
2661                             ((m->value[0] & 7) == 5))
2662                                 *jac = -*jac;
2663                         (void) big_half_pos(n, n);
2664                 } else {
2665                         if (((m->value[0] & 3) == 3) &&
2666                             ((n->value[0] & 3) == 3)) {
2667                                 *jac = -*jac;
2668                         }
2669                         if ((err = big_div_pos(NULL, tmp2, m, n)) != BIG_OK) {
2670                                 goto ret;
2671                         }
2672                         t = tmp2;
2673                         tmp2 = m;
2674                         m = n;
2675                         n = t;
2676                 }
2677         }
2678         err = BIG_OK;
2679
2680 ret:
2681         if (t3.malloced) big_finish(&t3);
2682 ret2:
2683         if (t2.malloced) big_finish(&t2);
2684 ret1:
2685         if (t1.malloced) big_finish(&t1);
2686
2687         return (err);
2688 }
2689
2690
2691 BIG_ERR_CODE
2692 big_Lucas(BIGNUM *Lkminus1, BIGNUM *Lk, BIGNUM *p, BIGNUM *k, BIGNUM *n)
2693 {
2694         int             i;
2695         uint32_t        m, w;
2696         BIG_CHUNK_TYPE  bit;
2697         BIGNUM          ki, tmp, tmp2;
2698         BIG_CHUNK_TYPE  kivalue[BIGTMPSIZE];
2699         BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
2700         BIG_CHUNK_TYPE  tmp2value[BIGTMPSIZE];
2701         BIG_ERR_CODE    err;
2702
2703         if (big_cmp_abs(k, &big_One) == 0) {
2704                 (void) big_copy(Lk, p);
2705                 (void) big_copy(Lkminus1, &big_Two);
2706                 return (BIG_OK);
2707         }
2708
2709         if ((err = big_init1(&ki, k->len + 1,
2710             kivalue, arraysize(kivalue))) != BIG_OK)
2711                 return (err);
2712
2713         if ((err = big_init1(&tmp, 2 * n->len + 1,
2714             tmpvalue, arraysize(tmpvalue))) != BIG_OK)
2715                 goto ret1;
2716
2717         if ((err = big_init1(&tmp2, n->len,
2718             tmp2value, arraysize(tmp2value))) != BIG_OK)
2719                 goto ret2;
2720
2721         m = big_numbits(k);
2722         ki.len = (m - 1) / BIG_CHUNK_SIZE + 1;
2723         w = (m - 1) / BIG_CHUNK_SIZE;
2724         bit = (BIG_CHUNK_TYPE)1 << ((m - 1) % BIG_CHUNK_SIZE);
2725         for (i = 0; i < ki.len; i++) {
2726                 ki.value[i] = 0;
2727         }
2728         ki.value[ki.len - 1] = bit;
2729         if (big_cmp_abs(k, &ki) != 0) {
2730                 (void) big_double(&ki, &ki);
2731         }
2732         (void) big_sub_pos(&ki, &ki, k);
2733
2734         (void) big_copy(Lk, p);
2735         (void) big_copy(Lkminus1, &big_Two);
2736
2737         for (i = 0; i < m; i++) {
2738                 if ((err = big_mul(&tmp, Lk, Lkminus1)) != BIG_OK) {
2739                         goto ret;
2740                 }
2741                 (void) big_add_abs(&tmp, &tmp, n);
2742                 (void) big_sub_pos(&tmp, &tmp, p);
2743                 if ((err = big_div_pos(NULL, &tmp2, &tmp, n)) != BIG_OK) {
2744                         goto ret;
2745                 }
2746                 if ((ki.value[w] & bit) != 0) {
2747                         if ((err = big_mul(&tmp, Lkminus1, Lkminus1)) !=
2748                             BIG_OK) {
2749                                 goto ret;
2750                         }
2751                         (void) big_add_abs(&tmp, &tmp, n);
2752                         (void) big_sub_pos(&tmp, &tmp, &big_Two);
2753                         if ((err = big_div_pos(NULL, Lkminus1, &tmp, n)) !=
2754                             BIG_OK) {
2755                                 goto ret;
2756                         }
2757                         (void) big_copy(Lk, &tmp2);
2758                 } else {
2759                         if ((err = big_mul(&tmp, Lk, Lk)) != BIG_OK) {
2760                                 goto ret;
2761                         }
2762                         (void) big_add_abs(&tmp, &tmp, n);
2763                         (void) big_sub_pos(&tmp, &tmp, &big_Two);
2764                         if ((err = big_div_pos(NULL, Lk, &tmp, n)) != BIG_OK) {
2765                                 goto ret;
2766                         }
2767                         (void) big_copy(Lkminus1, &tmp2);
2768                 }
2769                 bit = bit >> 1;
2770                 if (bit == 0) {
2771                         bit = BIG_CHUNK_HIGHBIT;
2772                         w--;
2773                 }
2774         }
2775
2776         err = BIG_OK;
2777
2778 ret:
2779         if (tmp2.malloced) big_finish(&tmp2);
2780 ret2:
2781         if (tmp.malloced) big_finish(&tmp);
2782 ret1:
2783         if (ki.malloced) big_finish(&ki);
2784
2785         return (err);
2786 }
2787
2788
2789 BIG_ERR_CODE
2790 big_isprime_pos_ext(BIGNUM *n, big_modexp_ncp_info_t *info)
2791 {
2792         BIGNUM          o, nminus1, tmp, Lkminus1, Lk;
2793         BIG_CHUNK_TYPE  ovalue[BIGTMPSIZE];
2794         BIG_CHUNK_TYPE  nminus1value[BIGTMPSIZE];
2795         BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
2796         BIG_CHUNK_TYPE  Lkminus1value[BIGTMPSIZE];
2797         BIG_CHUNK_TYPE  Lkvalue[BIGTMPSIZE];
2798         BIG_ERR_CODE    err;
2799         int             e, i, jac;
2800
2801         if (big_cmp_abs(n, &big_One) == 0) {
2802                 return (BIG_FALSE);
2803         }
2804         if (big_cmp_abs(n, &big_Two) == 0) {
2805                 return (BIG_TRUE);
2806         }
2807         if ((n->value[0] & 1) == 0) {
2808                 return (BIG_FALSE);
2809         }
2810
2811         if ((err = big_init1(&o, n->len, ovalue, arraysize(ovalue))) !=
2812             BIG_OK) {
2813                 return (err);
2814         }
2815
2816         if ((err = big_init1(&nminus1, n->len,
2817             nminus1value, arraysize(nminus1value))) != BIG_OK) {
2818                 goto ret1;
2819         }
2820
2821         if ((err = big_init1(&tmp, 2 * n->len,
2822             tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
2823                 goto ret2;
2824         }
2825
2826         if ((err = big_init1(&Lkminus1, n->len,
2827             Lkminus1value, arraysize(Lkminus1value))) != BIG_OK) {
2828                 goto ret3;
2829         }
2830
2831         if ((err = big_init1(&Lk, n->len,
2832             Lkvalue, arraysize(Lkvalue))) != BIG_OK) {
2833                 goto ret4;
2834         }
2835
2836         (void) big_sub_pos(&o, n, &big_One);    /* cannot fail */
2837         (void) big_copy(&nminus1, &o);          /* cannot fail */
2838         e = 0;
2839         while ((o.value[0] & 1) == 0) {
2840                 e++;
2841                 (void) big_half_pos(&o, &o);  /* cannot fail */
2842         }
2843         if ((err = big_modexp_ext(&tmp, &big_Two, &o, n, NULL, info)) !=
2844             BIG_OK) {
2845                 goto ret;
2846         }
2847
2848         i = 0;
2849         while ((i < e) &&
2850             (big_cmp_abs(&tmp, &big_One) != 0) &&
2851             (big_cmp_abs(&tmp, &nminus1) != 0)) {
2852                 if ((err =
2853                     big_modexp_ext(&tmp, &tmp, &big_Two, n, NULL, info)) !=
2854                     BIG_OK)
2855                         goto ret;
2856                 i++;
2857         }
2858
2859         if (!((big_cmp_abs(&tmp, &nminus1) == 0) ||
2860             ((i == 0) && (big_cmp_abs(&tmp, &big_One) == 0)))) {
2861                 err = BIG_FALSE;
2862                 goto ret;
2863         }
2864
2865         if ((err = big_sqrt_pos(&tmp, n)) != BIG_OK) {
2866                 goto ret;
2867         }
2868
2869         if ((err = big_mul(&tmp, &tmp, &tmp)) != BIG_OK) {
2870                 goto ret;
2871         }
2872         if (big_cmp_abs(&tmp, n) == 0) {
2873                 err = BIG_FALSE;
2874                 goto ret;
2875         }
2876
2877         (void) big_copy(&o, &big_Two);
2878         do {
2879                 (void) big_add_abs(&o, &o, &big_One);
2880                 if ((err = big_mul(&tmp, &o, &o)) != BIG_OK) {
2881                         goto ret;
2882                 }
2883                 (void) big_sub_pos(&tmp, &tmp, &big_Four);
2884                 if ((err = big_Jacobi_pos(&jac, &tmp, n)) != BIG_OK) {
2885                         goto ret;
2886                 }
2887         } while (jac != -1);
2888
2889         (void) big_add_abs(&tmp, n, &big_One);
2890         if ((err = big_Lucas(&Lkminus1, &Lk, &o, &tmp, n)) != BIG_OK) {
2891                 goto ret;
2892         }
2893
2894         if ((big_cmp_abs(&Lkminus1, &o) == 0) &&
2895             (big_cmp_abs(&Lk, &big_Two) == 0)) {
2896                 err = BIG_TRUE;
2897         } else {
2898                 err = BIG_FALSE;
2899         }
2900
2901 ret:
2902         if (Lk.malloced) big_finish(&Lk);
2903 ret4:
2904         if (Lkminus1.malloced) big_finish(&Lkminus1);
2905 ret3:
2906         if (tmp.malloced) big_finish(&tmp);
2907 ret2:
2908         if (nminus1.malloced) big_finish(&nminus1);
2909 ret1:
2910         if (o.malloced) big_finish(&o);
2911
2912         return (err);
2913 }
2914
2915
2916 BIG_ERR_CODE
2917 big_isprime_pos(BIGNUM *n)
2918 {
2919         return (big_isprime_pos_ext(n, NULL));
2920 }
2921
2922
2923 #define SIEVESIZE 1000
2924
2925
2926 BIG_ERR_CODE
2927 big_nextprime_pos_ext(BIGNUM *result, BIGNUM *n, big_modexp_ncp_info_t *info)
2928 {
2929         static const uint32_t smallprimes[] = {
2930             3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
2931             51, 53, 59, 61, 67, 71, 73, 79, 83, 89, 91, 97 };
2932         BIG_ERR_CODE    err;
2933         int             sieve[SIEVESIZE];
2934         int             i;
2935         uint32_t        off, p;
2936
2937         if ((err = big_copy(result, n)) != BIG_OK) {
2938                 return (err);
2939         }
2940         result->value[0] |= 1;
2941         /* CONSTCOND */
2942         while (1) {
2943                 for (i = 0; i < SIEVESIZE; i++) sieve[i] = 0;
2944                 for (i = 0;
2945                     i < sizeof (smallprimes) / sizeof (smallprimes[0]); i++) {
2946                         p = smallprimes[i];
2947                         off = big_modhalf_pos(result, p);
2948                         off = p - off;
2949                         if ((off % 2) == 1) {
2950                                 off = off + p;
2951                         }
2952                         off = off / 2;
2953                         while (off < SIEVESIZE) {
2954                                 sieve[off] = 1;
2955                                 off = off + p;
2956                         }
2957                 }
2958
2959                 for (i = 0; i < SIEVESIZE; i++) {
2960                         if (sieve[i] == 0) {
2961                                 err = big_isprime_pos_ext(result, info);
2962                                 if (err != BIG_FALSE) {
2963                                         if (err != BIG_TRUE) {
2964                                                 return (err);
2965                                         } else {
2966                                                 goto out;
2967                                         }
2968                                 }
2969
2970                         }
2971                         if ((err = big_add_abs(result, result, &big_Two)) !=
2972                             BIG_OK) {
2973                                 return (err);
2974                         }
2975                 }
2976         }
2977
2978 out:
2979         return (BIG_OK);
2980 }
2981
2982
2983 BIG_ERR_CODE
2984 big_nextprime_pos(BIGNUM *result, BIGNUM *n)
2985 {
2986         return (big_nextprime_pos_ext(result, n, NULL));
2987 }
2988
2989
2990 BIG_ERR_CODE
2991 big_nextprime_pos_slow(BIGNUM *result, BIGNUM *n)
2992 {
2993         BIG_ERR_CODE err;
2994
2995
2996         if ((err = big_copy(result, n)) != BIG_OK)
2997                 return (err);
2998         result->value[0] |= 1;
2999         while ((err = big_isprime_pos_ext(result, NULL)) != BIG_TRUE) {
3000                 if (err != BIG_FALSE)
3001                         return (err);
3002                 if ((err = big_add_abs(result, result, &big_Two)) != BIG_OK)
3003                         return (err);
3004         }
3005         return (BIG_OK);
3006 }
3007
3008
3009 /*
3010  * given m and e, computes the rest in the equation
3011  * gcd(m, e) = cm * m + ce * e
3012  */
3013 BIG_ERR_CODE
3014 big_ext_gcd_pos(BIGNUM *gcd, BIGNUM *cm, BIGNUM *ce, BIGNUM *m, BIGNUM *e)
3015 {
3016         BIGNUM          *xi, *ri, *riminus1, *riminus2, *t;
3017         BIGNUM          *vmi, *vei, *vmiminus1, *veiminus1;
3018         BIGNUM          t1, t2, t3, t4, t5, t6, t7, t8, tmp;
3019         BIG_CHUNK_TYPE  t1value[BIGTMPSIZE];
3020         BIG_CHUNK_TYPE  t2value[BIGTMPSIZE];
3021         BIG_CHUNK_TYPE  t3value[BIGTMPSIZE];
3022         BIG_CHUNK_TYPE  t4value[BIGTMPSIZE];
3023         BIG_CHUNK_TYPE  t5value[BIGTMPSIZE];
3024         BIG_CHUNK_TYPE  t6value[BIGTMPSIZE];
3025         BIG_CHUNK_TYPE  t7value[BIGTMPSIZE];
3026         BIG_CHUNK_TYPE  t8value[BIGTMPSIZE];
3027         BIG_CHUNK_TYPE  tmpvalue[BIGTMPSIZE];
3028         BIG_ERR_CODE    err;
3029         int             len;
3030
3031         if (big_cmp_abs(m, e) >= 0) {
3032                 len = m->len;
3033         } else {
3034                 len = e->len;
3035         }
3036
3037         if ((err = big_init1(&t1, len,
3038             t1value, arraysize(t1value))) != BIG_OK) {
3039                 return (err);
3040         }
3041         if ((err = big_init1(&t2, len,
3042             t2value, arraysize(t2value))) != BIG_OK) {
3043                 goto ret1;
3044         }
3045         if ((err = big_init1(&t3, len,
3046             t3value, arraysize(t3value))) != BIG_OK) {
3047                 goto ret2;
3048         }
3049         if ((err = big_init1(&t4, len,
3050             t4value, arraysize(t3value))) != BIG_OK) {
3051                 goto ret3;
3052         }
3053         if ((err = big_init1(&t5, len,
3054             t5value, arraysize(t5value))) != BIG_OK) {
3055                 goto ret4;
3056         }
3057         if ((err = big_init1(&t6, len,
3058             t6value, arraysize(t6value))) != BIG_OK) {
3059                 goto ret5;
3060         }
3061         if ((err = big_init1(&t7, len,
3062             t7value, arraysize(t7value))) != BIG_OK) {
3063                 goto ret6;
3064         }
3065         if ((err = big_init1(&t8, len,
3066             t8value, arraysize(t8value))) != BIG_OK) {
3067                 goto ret7;
3068         }
3069
3070         if ((err = big_init1(&tmp, 2 * len,
3071             tmpvalue, arraysize(tmpvalue))) != BIG_OK) {
3072                 goto ret8;
3073         }
3074
3075         ri = &t1;
3076         ri->value[0] = 1;
3077         ri->len = 1;
3078         xi = &t2;
3079         riminus1 = &t3;
3080         riminus2 = &t4;
3081         vmi = &t5;
3082         vei = &t6;
3083         vmiminus1 = &t7;
3084         veiminus1 = &t8;
3085
3086         (void) big_copy(vmiminus1, &big_One);
3087         (void) big_copy(vmi, &big_One);
3088         (void) big_copy(veiminus1, &big_One);
3089         (void) big_copy(xi, &big_One);
3090         vei->len = 1;
3091         vei->value[0] = 0;
3092
3093         (void) big_copy(riminus1, m);
3094         (void) big_copy(ri, e);
3095
3096         while (!big_is_zero(ri)) {
3097                 t = riminus2;
3098                 riminus2 = riminus1;
3099                 riminus1 = ri;
3100                 ri = t;
3101                 if ((err = big_mul(&tmp, vmi, xi)) != BIG_OK) {
3102                         goto ret;
3103                 }
3104                 if ((err = big_sub(vmiminus1, vmiminus1, &tmp)) != BIG_OK) {
3105                         goto ret;
3106                 }
3107                 t = vmiminus1;
3108                 vmiminus1 = vmi;
3109                 vmi = t;
3110                 if ((err = big_mul(&tmp, vei, xi)) != BIG_OK) {
3111                         goto ret;
3112                 }
3113                 if ((err = big_sub(veiminus1, veiminus1, &tmp)) != BIG_OK) {
3114                         goto ret;
3115                 }
3116                 t = veiminus1;
3117                 veiminus1 = vei;
3118                 vei = t;
3119                 if ((err = big_div_pos(xi, ri, riminus2, riminus1)) !=
3120                     BIG_OK) {
3121                         goto ret;
3122                 }
3123         }
3124         if ((gcd != NULL) && ((err = big_copy(gcd, riminus1)) != BIG_OK)) {
3125                 goto ret;
3126         }
3127         if ((cm != NULL) && ((err = big_copy(cm, vmi)) != BIG_OK)) {
3128                 goto ret;
3129         }
3130         if (ce != NULL) {
3131                 err = big_copy(ce, vei);
3132         }
3133 ret:
3134         if (tmp.malloced) big_finish(&tmp);
3135 ret8:
3136         if (t8.malloced) big_finish(&t8);
3137 ret7:
3138         if (t7.malloced) big_finish(&t7);
3139 ret6:
3140         if (t6.malloced) big_finish(&t6);
3141 ret5:
3142         if (t5.malloced) big_finish(&t5);
3143 ret4:
3144         if (t4.malloced) big_finish(&t4);
3145 ret3:
3146         if (t3.malloced) big_finish(&t3);
3147 ret2:
3148         if (t2.malloced) big_finish(&t2);
3149 ret1:
3150         if (t1.malloced) big_finish(&t1);
3151
3152         return (err);
3153 }
3154
3155 /*
3156  * Get a rlen-bit random number in BIGNUM format.  Caller-supplied
3157  * (*rfunc)(void *dbuf, size_t dlen) must return 0 for success and
3158  * -1 for failure.  Note:  (*rfunc)() takes length in bytes, not bits.
3159  */
3160 BIG_ERR_CODE
3161 big_random(BIGNUM *r, size_t rlen, int (*rfunc)(void *, size_t))
3162 {
3163         size_t  rwords, rbytes;
3164         int     shift;
3165
3166         if (r == NULL || rlen == 0 || rfunc == NULL)
3167                 return (BIG_INVALID_ARGS);
3168
3169         /*
3170          * Convert rlen bits to r->len words (32- or 64-bit), rbytes bytes
3171          * and extend r if it's not big enough to hold the random number.
3172          */
3173         rwords = BITLEN2BIGNUMLEN(rlen);
3174         rbytes = rwords * sizeof (BIG_CHUNK_TYPE);
3175         if (big_extend(r, rwords) != BIG_OK)
3176                 return (BIG_NO_MEM);
3177 #ifdef BIGNUM_CHUNK_32
3178         r->len = rwords;
3179 #else
3180         r->len = (uint32_t)rwords;
3181 #endif
3182
3183         if ((*rfunc)(r->value, rbytes) < 0)
3184                 return (BIG_NO_RANDOM);
3185
3186         r->value[rwords - 1] |= BIG_CHUNK_HIGHBIT;
3187
3188         /*
3189          * If the bit length is not a word boundary, shift the most
3190          * significant word so that we have an exactly rlen-long number.
3191          */
3192         if ((shift = rlen % BIG_CHUNK_SIZE) != 0)
3193                 r->value[rwords - 1] >>= (BIG_CHUNK_SIZE - shift);
3194
3195         r->sign = 1;    /* non-negative */
3196
3197         return (BIG_OK);
3198 }