c_cpp/etc/calc/zmul.c

   1 /*
   2  * zmul - faster than usual multiplying and squaring routines
   3  *
   4  * Copyright (C) 1999-2007  David I. Bell
   5  *
   6  * Calc is open software; you can redistribute it and/or modify it under
   7  * the terms of the version 2.1 of the GNU Lesser General Public License
   8  * as published by the Free Software Foundation.
   9  *
  10  * Calc is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  12  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General
  13  * Public License for more details.
  14  *
  15  * A copy of version 2.1 of the GNU Lesser General Public License is
  16  * distributed with calc under the filename COPYING-LGPL.  You should have
  17  * received a copy with calc; if not, write to Free Software Foundation, Inc.
  18  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  19  *
  20  * @(#) $Revision: 30.1 $
  21  * @(#) $Id: zmul.c,v 30.1 2007/03/16 11:09:46 chongo Exp $
  22  * @(#) $Source: /usr/local/src/bin/calc/RCS/zmul.c,v $
  23  *
  24  * Under source code control:   1991/01/09 20:01:31
  25  * File existed as early as:    1991
  26  *
  27  * Share and enjoy!  :-)        http://www.isthe.com/chongo/tech/comp/calc/
  28  */
  29
  30 /*
  31  * Faster than usual multiplying and squaring routines.
  32  * The algorithm used is the reasonably simple one from Knuth, volume 2,
  33  * section 4.3.3.  These recursive routines are of speed O(N^1.585)
  34  * instead of O(N^2).  The usual multiplication and (almost usual) squaring
  35  * algorithms are used for small numbers.  On a 386 with its compiler, the
  36  * two algorithms are equal in speed at about 100 decimal digits.
  37  */
  38
  39
  40 #include "config.h"
  41 #include "zmath.h"
  42
  43
  44 STATIC HALF *tempbuf;           /* temporary buffer for multiply and square */
  45
  46 S_FUNC LEN domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans);
  47 S_FUNC LEN dosquare(HALF *vp, LEN size, HALF *ans);
  48
  49
  50 /*
  51  * Multiply two numbers using the following formula recursively:
  52  *      (A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
  53  * where S is a power of 2^16, and so multiplies by it are shifts, and
  54  * A,B,C,D are the left and right halfs of the numbers to be multiplied.
  55  *
  56  * given:
  57  *      z1              numbers to multiply
  58  *      z2              numbers to multiply
  59  *      res             result of multiplication
  60  */
  61 void
  62 zmul(ZVALUE z1, ZVALUE z2, ZVALUE *res)
  63 {
  64         LEN len;                /* size of array */
  65
  66         if (ziszero(z1) || ziszero(z2)) {
  67                 *res = _zero_;
  68                 return;
  69         }
  70         if (zisunit(z1)) {
  71                 zcopy(z2, res);
  72                 res->sign = (z1.sign != z2.sign);
  73                 return;
  74         }
  75         if (zisunit(z2)) {
  76                 zcopy(z1, res);
  77                 res->sign = (z1.sign != z2.sign);
  78                 return;
  79         }
  80
  81         /*
  82          * Allocate a temporary buffer for the recursion levels to use.
  83          * An array needs to be allocated large enough for all of the
  84          * temporary results to fit in.  This size is about twice the size
  85          * of the largest original number, since each recursion level uses
  86          * the size of its given number, and whose size is 1/2 the size of
  87          * the previous level.  The sum of the infinite series is 2.
  88          * Add some extra words because of rounding when dividing by 2
  89          * and also because of the extra word that each multiply needs.
  90          */
  91         len = z1.len;
  92         if (len < z2.len)
  93                 len = z2.len;
  94         len = 2 * len + 64;
  95         tempbuf = zalloctemp(len);
  96
  97         res->sign = (z1.sign != z2.sign);
  98         res->v = alloc(z1.len + z2.len + 2);
  99         res->len = domul(z1.v, z1.len, z2.v, z2.len, res->v);
 100 }
 101
 102
 103 /*
 104  * Recursive routine to multiply two numbers by splitting them up into
 105  * two numbers of half the size, and using the results of multiplying the
 106  * subpieces.  The result is placed in the indicated array, which must be
 107  * large enough for the result plus one extra word (size1 + size2 + 1).
 108  * Returns the actual size of the result with leading zeroes stripped.
 109  * This also uses a temporary array which must be twice as large as
 110  * one more than the size of the number at the top level recursive call.
 111  *
 112  * given:
 113  *      v1              first number
 114  *      size1           size of first number
 115  *      v2              second number
 116  *      size2           size of second number
 117  *      ans             location for result
 118  */
 119 S_FUNC LEN
 120 domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans)
 121 {
 122         LEN shift;              /* amount numbers are shifted by */
 123         LEN sizeA;              /* size of left half of first number */
 124         LEN sizeB;              /* size of right half of first number */
 125         LEN sizeC;              /* size of left half of second number */
 126         LEN sizeD;              /* size of right half of second number */
 127         LEN sizeAB;             /* size of subtraction of A and B */
 128         LEN sizeDC;             /* size of subtraction of D and C */
 129         LEN sizeABDC;           /* size of product of above two results */
 130         LEN subsize;            /* size of difference of halfs */
 131         LEN copysize;           /* size of number left to copy */
 132         LEN sizetotal;          /* total size of product */
 133         LEN len;                /* temporary length */
 134         HALF *baseA;            /* base of left half of first number */
 135         HALF *baseB;            /* base of right half of first number */
 136         HALF *baseC;            /* base of left half of second number */
 137         HALF *baseD;            /* base of right half of second number */
 138         HALF *baseAB;           /* base of result of subtraction of A and B */
 139         HALF *baseDC;           /* base of result of subtraction of D and C */
 140         HALF *baseABDC;         /* base of product of above two results */
 141         HALF *baseAC;           /* base of product of A and C */
 142         HALF *baseBD;           /* base of product of B and D */
 143         FULL carry;             /* carry digit for small multiplications */
 144         FULL carryACBD;         /* carry from addition of A*C and B*D */
 145         FULL digit;             /* single digit multiplying by */
 146         HALF *temp;             /* base for temporary calculations */
 147         BOOL neg;               /* whether imtermediate term is negative */
 148         register HALF *hd, *h1=NULL, *h2=NULL;  /* for inner loops */
 149         SIUNION sival;          /* for addition of digits */
 150
 151         /*
 152          * Trim the numbers of leading zeroes and initialize the
 153          * estimated size of the result.
 154          */
 155         hd = &v1[size1 - 1];
 156         while ((*hd == 0) && (size1 > 1)) {
 157                 hd--;
 158                 size1--;
 159         }
 160         hd = &v2[size2 - 1];
 161         while ((*hd == 0) && (size2 > 1)) {
 162                 hd--;
 163                 size2--;
 164         }
 165         sizetotal = size1 + size2;
 166
 167         /*
 168          * First check for zero answer.
 169          */
 170         if (((size1 == 1) && (*v1 == 0)) || ((size2 == 1) && (*v2 == 0))) {
 171                 *ans = 0;
 172                 return 1;
 173         }
 174
 175         /*
 176          * Exchange the two numbers if necessary to make the number of
 177          * digits of the first number be greater than or equal to the
 178          * second number.
 179          */
 180         if (size1 < size2) {
 181                 len = size1; size1 = size2; size2 = len;
 182                 hd = v1; v1 = v2; v2 = hd;
 183         }
 184
 185         /*
 186          * If the smaller number has only a few digits, then calculate
 187          * the result in the normal manner in order to avoid the overhead
 188          * of the recursion for small numbers.  The number of digits where
 189          * the algorithm changes is settable from 2 to maxint.
 190          */
 191         if (size2 < conf->mul2) {
 192                 /*
 193                  * First clear the top part of the result, and then multiply
 194                  * by the lowest digit to get the first partial sum.  Later
 195                  * products will then add into this result.
 196                  */
 197                 hd = &ans[size1];
 198                 len = size2;
 199                 while (len--)
 200                         *hd++ = 0;
 201
 202                 digit = *v2++;
 203                 h1 = v1;
 204                 hd = ans;
 205                 carry = 0;
 206                 len = size1;
 207                 while (len >= 4) {      /* expand the loop some */
 208                         len -= 4;
 209                         sival.ivalue = ((FULL) *h1++) * digit + carry;
 210                         /* ignore Saber-C warning #112 - get ushort from uint */
 211                         /*        ok to ignore on name domul`sival */
 212                         *hd++ = sival.silow;
 213                         carry = sival.sihigh;
 214                         sival.ivalue = ((FULL) *h1++) * digit + carry;
 215                         *hd++ = sival.silow;
 216                         carry = sival.sihigh;
 217                         sival.ivalue = ((FULL) *h1++) * digit + carry;
 218                         *hd++ = sival.silow;
 219                         carry = sival.sihigh;
 220                         sival.ivalue = ((FULL) *h1++) * digit + carry;
 221                         *hd++ = sival.silow;
 222                         carry = sival.sihigh;
 223                 }
 224                 while (len--) {
 225                         sival.ivalue = ((FULL) *h1++) * digit + carry;
 226                         *hd++ = sival.silow;
 227                         carry = sival.sihigh;
 228                 }
 229                 *hd = (HALF)carry;
 230
 231                 /*
 232                  * Now multiply by the remaining digits of the second number,
 233                  * adding each product into the final result.
 234                  */
 235                 h2 = ans;
 236                 while (--size2 > 0) {
 237                         digit = *v2++;
 238                         h1 = v1;
 239                         hd = ++h2;
 240                         if (digit == 0)
 241                                 continue;
 242                         carry = 0;
 243                         len = size1;
 244                         while (len >= 4) {      /* expand the loop some */
 245                                 len -= 4;
 246                                 sival.ivalue = ((FULL) *h1++) * digit
 247                                         + ((FULL) *hd) + carry;
 248                                 *hd++ = sival.silow;
 249                                 carry = sival.sihigh;
 250                                 sival.ivalue = ((FULL) *h1++) * digit
 251                                         + ((FULL) *hd) + carry;
 252                                 *hd++ = sival.silow;
 253                                 carry = sival.sihigh;
 254                                 sival.ivalue = ((FULL) *h1++) * digit
 255                                         + ((FULL) *hd) + carry;
 256                                 *hd++ = sival.silow;
 257                                 carry = sival.sihigh;
 258                                 sival.ivalue = ((FULL) *h1++) * digit
 259                                         + ((FULL) *hd) + carry;
 260                                 *hd++ = sival.silow;
 261                                 carry = sival.sihigh;
 262                         }
 263                         while (len--) {
 264                                 sival.ivalue = ((FULL) *h1++) * digit
 265                                         + ((FULL) *hd) + carry;
 266                                 *hd++ = sival.silow;
 267                                 carry = sival.sihigh;
 268                         }
 269                         while (carry) {
 270                                 sival.ivalue = ((FULL) *hd) + carry;
 271                                 *hd++ = sival.silow;
 272                                 carry = sival.sihigh;
 273                         }
 274                 }
 275
 276                 /*
 277                  * Now return the true size of the number.
 278                  */
 279                 len = sizetotal;
 280                 hd = &ans[len - 1];
 281                 while ((*hd == 0) && (len > 1)) {
 282                         hd--;
 283                         len--;
 284                 }
 285                 return len;
 286         }
 287
 288         /*
 289          * Need to multiply by a large number.
 290          * Allocate temporary space for calculations, and calculate the
 291          * value for the shift.  The shift value is 1/2 the size of the
 292          * larger (first) number (rounded up).  The amount of temporary
 293          * space needed is twice the size of the shift, plus one more word
 294          * for the multiply to use.
 295          */
 296         shift = (size1 + 1) / 2;
 297         temp = tempbuf;
 298         tempbuf += (2 * shift) + 1;
 299
 300         /*
 301          * Determine the sizes and locations of all the numbers.
 302          * The value of sizeC can be negative, and this is checked later.
 303          * The value of sizeD is limited by the full size of the number.
 304          */
 305         baseA = v1 + shift;
 306         baseB = v1;
 307         baseC = v2 + ((shift <= size2)  ? shift : size2);
 308         baseD = v2;
 309         baseAB = ans;
 310         baseDC = ans + shift;
 311         baseAC = ans + shift * 2;
 312         baseBD = ans;
 313
 314         sizeA = size1 - shift;
 315         sizeC = size2 - shift;
 316
 317         sizeB = shift;
 318         hd = &baseB[shift - 1];
 319         while ((*hd == 0) && (sizeB > 1)) {
 320                 hd--;
 321                 sizeB--;
 322         }
 323
 324         sizeD = shift;
 325         if (sizeD > size2)
 326                 sizeD = size2;
 327         hd = &baseD[sizeD - 1];
 328         while ((*hd == 0) && (sizeD > 1)) {
 329                 hd--;
 330                 sizeD--;
 331         }
 332
 333         /*
 334          * If the smaller number has a high half of zero, then calculate
 335          * the result by breaking up the first number into two numbers
 336          * and combining the results using the obvious formula:
 337          *      (A*S+B) * D = (A*D)*S + B*D
 338          */
 339         if (sizeC <= 0) {
 340                 len = domul(baseB, sizeB, baseD, sizeD, ans);
 341                 hd = &ans[len];
 342                 len = sizetotal - len;
 343                 while (len--)
 344                         *hd++ = 0;
 345
 346                 /*
 347                  * Add the second number into the first number, shifted
 348                  * over at the correct position.
 349                  */
 350                 len = domul(baseA, sizeA, baseD, sizeD, temp);
 351                 h1 = temp;
 352                 hd = ans + shift;
 353                 carry = 0;
 354                 while (len--) {
 355                         sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
 356                         *hd++ = sival.silow;
 357                         carry = sival.sihigh;
 358                 }
 359                 while (carry) {
 360                         sival.ivalue = ((FULL) *hd) + carry;
 361                         *hd++ = sival.silow;
 362                         carry = sival.sihigh;
 363                 }
 364
 365                 /*
 366                  * Determine the final size of the number and return it.
 367                  */
 368                 len = sizetotal;
 369                 hd = &ans[len - 1];
 370                 while ((*hd == 0) && (len > 1)) {
 371                         hd--;
 372                         len--;
 373                 }
 374                 tempbuf = temp;
 375                 return len;
 376         }
 377
 378         /*
 379          * Now we know that the high halfs of the numbers are nonzero,
 380          * so we can use the complete formula.
 381          *      (A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D.
 382          * The steps are done in the following order:
 383          *      A-B
 384          *      D-C
 385          *      (A-B)*(D-C)
 386          *      S^2*A*C + B*D
 387          *      (S^2+S)*A*C + (S+1)*B*D                         (*)
 388          *      (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
 389          *
 390          * Note: step (*) above can produce a result which is larger than
 391          * the final product will be, and this is where the extra word
 392          * needed in the product comes from.  After the final subtraction is
 393          * done, the result fits in the expected size.  Using the extra word
 394          * is easier than suppressing the carries and borrows everywhere.
 395          *
 396          * Begin by forming the product (A-B)*(D-C) into a temporary
 397          * location that we save until the final step.  Do each subtraction
 398          * at positions 0 and S.  Be very careful about the relative sizes
 399          * of the numbers since this result can be negative.  For the first
 400          * step calculate the absolute difference of A and B into a temporary
 401          * location at position 0 of the result.  Negate the sign if A is
 402          * smaller than B.
 403          */
 404         neg = FALSE;
 405         if (sizeA == sizeB) {
 406                 len = sizeA;
 407                 h1 = &baseA[len - 1];
 408                 h2 = &baseB[len - 1];
 409                 while ((len > 1) && (*h1 == *h2)) {
 410                         len--;
 411                         h1--;
 412                         h2--;
 413                 }
 414         }
 415         if ((sizeA > sizeB) || ((sizeA == sizeB) && h1 && h2 && (*h1 > *h2))) {
 416                 h1 = baseA;
 417                 h2 = baseB;
 418                 sizeAB = sizeA;
 419                 subsize = sizeB;
 420         } else {
 421                 neg = !neg;
 422                 h1 = baseB;
 423                 h2 = baseA;
 424                 sizeAB = sizeB;
 425                 subsize = sizeA;
 426         }
 427         copysize = sizeAB - subsize;
 428
 429         hd = baseAB;
 430         carry = 0;
 431         while (subsize--) {
 432                 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
 433                 *hd++ = (HALF)(BASE1 - sival.silow);
 434                 carry = sival.sihigh;
 435         }
 436         while (copysize--) {
 437                 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
 438                 *hd++ = (HALF)(BASE1 - sival.silow);
 439                 carry = sival.sihigh;
 440         }
 441
 442         hd = &baseAB[sizeAB - 1];
 443         while ((*hd == 0) && (sizeAB > 1)) {
 444                 hd--;
 445                 sizeAB--;
 446         }
 447
 448         /*
 449          * This completes the calculation of abs(A-B).  For the next step
 450          * calculate the absolute difference of D and C into a temporary
 451          * location at position S of the result.  Negate the sign if C is
 452          * larger than D.
 453          */
 454         if (sizeC == sizeD) {
 455                 len = sizeC;
 456                 h1 = &baseC[len - 1];
 457                 h2 = &baseD[len - 1];
 458                 while ((len > 1) && (*h1 == *h2)) {
 459                         len--;
 460                         h1--;
 461                         h2--;
 462                 }
 463         }
 464         if ((sizeC > sizeD) || ((sizeC == sizeD) && (*h1 > *h2))) {
 465                 neg = !neg;
 466                 h1 = baseC;
 467                 h2 = baseD;
 468                 sizeDC = sizeC;
 469                 subsize = sizeD;
 470         } else {
 471                 h1 = baseD;
 472                 h2 = baseC;
 473                 sizeDC = sizeD;
 474                 subsize = sizeC;
 475         }
 476         copysize = sizeDC - subsize;
 477
 478         hd = baseDC;
 479         carry = 0;
 480         while (subsize--) {
 481                 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
 482                 *hd++ = (HALF)(BASE1 - sival.silow);
 483                 carry = sival.sihigh;
 484         }
 485         while (copysize--) {
 486                 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
 487                 *hd++ = (HALF)(BASE1 - sival.silow);
 488                 carry = sival.sihigh;
 489         }
 490         hd = &baseDC[sizeDC - 1];
 491         while ((*hd == 0) && (sizeDC > 1)) {
 492                 hd--;
 493                 sizeDC--;
 494         }
 495
 496         /*
 497          * This completes the calculation of abs(D-C).  Now multiply
 498          * together abs(A-B) and abs(D-C) into a temporary location,
 499          * which is preserved until the final steps.
 500          */
 501         baseABDC = temp;
 502         sizeABDC = domul(baseAB, sizeAB, baseDC, sizeDC, baseABDC);
 503
 504         /*
 505          * Now calculate B*D and A*C into one of their two final locations.
 506          * Make sure the high order digits of the products are zeroed since
 507          * this initializes the final result.  Be careful about this zeroing
 508          * since the size of the high order words might be smaller than
 509          * the shift size.  Do B*D first since the multiplies use one more
 510          * word than the size of the product.  Also zero the final extra
 511          * word in the result for possible carries to use.
 512          */
 513         len = domul(baseB, sizeB, baseD, sizeD, baseBD);
 514         hd = &baseBD[len];
 515         len = shift * 2 - len;
 516         while (len--)
 517                 *hd++ = 0;
 518
 519         len = domul(baseA, sizeA, baseC, sizeC, baseAC);
 520         hd = &baseAC[len];
 521         len = sizetotal - shift * 2 - len + 1;
 522         while (len--)
 523                 *hd++ = 0;
 524
 525         /*
 526          * Now add in A*C and B*D into themselves at the other shifted
 527          * position that they need.  This addition is tricky in order to
 528          * make sure that the two additions cannot interfere with each other.
 529          * Therefore we first add in the top half of B*D and the lower half
 530          * of A*C.  The sources and destinations of these two additions
 531          * overlap, and so the same answer results from the two additions,
 532          * thus only two pointers suffice for both additions.  Keep the
 533          * final carry from these additions for later use since we cannot
 534          * afford to change the top half of A*C yet.
 535          */
 536         h1 = baseBD + shift;
 537         h2 = baseAC;
 538         carryACBD = 0;
 539         len = shift;
 540         while (len--) {
 541                 sival.ivalue = ((FULL) *h1) + ((FULL) *h2) + carryACBD;
 542                 *h1++ = sival.silow;
 543                 *h2++ = sival.silow;
 544                 carryACBD = sival.sihigh;
 545         }
 546
 547         /*
 548          * Now add in the bottom half of B*D and the top half of A*C.
 549          * These additions are straightforward, except that A*C should
 550          * be done first because of possible carries from B*D, and the
 551          * top half of A*C might not exist.  Add in one of the carries
 552          * from the previous addition while we are at it.
 553          */
 554         h1 = baseAC + shift;
 555         hd = baseAC;
 556         carry = carryACBD;
 557         len = sizetotal - 3 * shift;
 558         while (len--) {
 559                 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
 560                 *hd++ = sival.silow;
 561                 carry = sival.sihigh;
 562         }
 563         while (carry) {
 564                 sival.ivalue = ((FULL) *hd) + carry;
 565                 *hd++ = sival.silow;
 566                 carry = sival.sihigh;
 567         }
 568
 569         h1 = baseBD;
 570         hd = baseBD + shift;
 571         carry = 0;
 572         len = shift;
 573         while (len--) {
 574                 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
 575                 *hd++ = sival.silow;
 576                 carry = sival.sihigh;
 577         }
 578         while (carry) {
 579                 sival.ivalue = ((FULL) *hd) + carry;
 580                 *hd++ = sival.silow;
 581                 carry = sival.sihigh;
 582         }
 583
 584         /*
 585          * Now finally add in the other delayed carry from the
 586          * above addition.
 587          */
 588         hd = baseAC + shift;
 589         while (carryACBD) {
 590                 sival.ivalue = ((FULL) *hd) + carryACBD;
 591                 *hd++ = sival.silow;
 592                 carryACBD = sival.sihigh;
 593         }
 594
 595         /*
 596          * Now finally add or subtract (A-B)*(D-C) into the final result at
 597          * the correct position (S), according to whether it is positive or
 598          * negative.  When subtracting, the answer cannot go negative.
 599          */
 600         h1 = baseABDC;
 601         hd = ans + shift;
 602         carry = 0;
 603         len = sizeABDC;
 604         if (neg) {
 605                 while (len--) {
 606                         sival.ivalue = BASE1 - ((FULL) *hd) +
 607                                 ((FULL) *h1++) + carry;
 608                         *hd++ = (HALF)(BASE1 - sival.silow);
 609                         carry = sival.sihigh;
 610                 }
 611                 while (carry) {
 612                         sival.ivalue = BASE1 - ((FULL) *hd) + carry;
 613                         *hd++ = (HALF)(BASE1 - sival.silow);
 614                         carry = sival.sihigh;
 615                 }
 616         } else {
 617                 while (len--) {
 618                         sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
 619                         *hd++ = sival.silow;
 620                         carry = sival.sihigh;
 621                 }
 622                 while (carry) {
 623                         sival.ivalue = ((FULL) *hd) + carry;
 624                         *hd++ = sival.silow;
 625                         carry = sival.sihigh;
 626                 }
 627         }
 628
 629         /*
 630          * Finally determine the size of the final result and return that.
 631          */
 632         len = sizetotal;
 633         hd = &ans[len - 1];
 634         while ((*hd == 0) && (len > 1)) {
 635                 hd--;
 636                 len--;
 637         }
 638         tempbuf = temp;
 639         return len;
 640 }
 641
 642
 643 /*
 644  * Square a number by using the following formula recursively:
 645  *      (A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2
 646  * where S is a power of 2^16, and so multiplies by it are shifts,
 647  * and A and B are the left and right halfs of the number to square.
 648  */
 649 void
 650 zsquare(ZVALUE z, ZVALUE *res)
 651 {
 652         LEN len;
 653
 654         if (ziszero(z)) {
 655                 *res = _zero_;
 656                 return;
 657         }
 658         if (zisunit(z)) {
 659                 *res = _one_;
 660                 return;
 661         }
 662
 663         /*
 664          * Allocate a temporary array if necessary for the recursion to use.
 665          * The array needs to be allocated large enough for all of the
 666          * temporary results to fit in.  This size is about 3 times the
 667          * size of the original number, since each recursion level uses 3/2
 668          * of the size of its given number, and whose size is 1/2 the size
 669          * of the previous level.  The sum of the infinite series is 3.
 670          * Allocate some extra words for rounding up the sizes.
 671          */
 672         len = 3 * z.len + 32;
 673         tempbuf = zalloctemp(len);
 674
 675         res->sign = 0;
 676         res->v = alloc((z.len+2) * 2);
 677         /*
 678          * Without the memset below, Purify reports that dosquare()
 679          *       will read uninitialized memory at the dosquare() line below
 680          *       the comment:
 681          *
 682          *              uninitialized memory read (see zsquare)
 683          *
 684          * This problem occurs during regression test #622 and may
 685          * be duplicated by executing:
 686          *
 687          *      config("sq2", 2);
 688          *      0xffff0000ffffffff00000000ffff0000000000000000ffff^2;
 689          */
 690         memset((char *)res->v, 0, ((z.len+2) * 2)*sizeof(HALF));
 691         res->len = dosquare(z.v, z.len, res->v);
 692 }
 693
 694
 695 /*
 696  * Recursive routine to square a number by splitting it up into two numbers
 697  * of half the size, and using the results of squaring the subpieces.
 698  * The result is placed in the indicated array, which must be large
 699  * enough for the result (size * 2).  Returns the size of the result.
 700  * This uses a temporary array which must be 3 times as large as the
 701  * size of the number at the top level recursive call.
 702  *
 703  * given:
 704  *      vp              value to be squared
 705  *      size            length of value to square
 706  *      ans             location for result
 707  */
 708 S_FUNC LEN
 709 dosquare(HALF *vp, LEN size, HALF *ans)
 710 {
 711         LEN shift;              /* amount numbers are shifted by */
 712         LEN sizeA;              /* size of left half of number to square */
 713         LEN sizeB;              /* size of right half of number to square */
 714         LEN sizeAA;             /* size of square of left half */
 715         LEN sizeBB;             /* size of square of right half */
 716         LEN sizeAABB;           /* size of sum of squares of A and B */
 717         LEN sizeAB;             /* size of difference of A and B */
 718         LEN sizeABAB;           /* size of square of difference of A and B */
 719         LEN subsize;            /* size of difference of halfs */
 720         LEN copysize;           /* size of number left to copy */
 721         LEN sumsize;            /* size of sum */
 722         LEN sizetotal;          /* total size of square */
 723         LEN len;                /* temporary length */
 724         LEN len1;               /* another temporary length */
 725         FULL carry;             /* carry digit for small multiplications */
 726         FULL digit;             /* single digit multiplying by */
 727         HALF *temp;             /* base for temporary calculations */
 728         HALF *baseA;            /* base of left half of number */
 729         HALF *baseB;            /* base of right half of number */
 730         HALF *baseAA;           /* base of square of left half of number */
 731         HALF *baseBB;           /* base of square of right half of number */
 732         HALF *baseAABB;         /* base of sum of squares of A and B */
 733         HALF *baseAB;           /* base of difference of A and B */
 734         HALF *baseABAB;         /* base of square of difference of A and B */
 735         register HALF *hd, *h1, *h2, *h3;       /* for inner loops */
 736         SIUNION sival;          /* for addition of digits */
 737
 738         /*
 739          * First trim the number of leading zeroes.
 740          */
 741         hd = &vp[size - 1];
 742         while ((*hd == 0) && (size > 1)) {
 743                 size--;
 744                 hd--;
 745         }
 746         sizetotal = size + size;
 747
 748         /*
 749          * If the number has only a small number of digits, then use the
 750          * (almost) normal multiplication method.  Multiply each halfword
 751          * only by those halfwards further on in the number.  Missed terms
 752          * will then be the same pairs of products repeated, and the squares
 753          * of each halfword.  The first case is handled by doubling the
 754          * result.  The second case is handled explicitly.  The number of
 755          * digits where the algorithm changes is settable from 2 to maxint.
 756          */
 757         if (size < conf->sq2) {
 758                 hd = ans;
 759                 len = sizetotal;
 760                 while (len--)
 761                         *hd++ = 0;
 762
 763                 h2 = vp;
 764                 hd = ans + 1;
 765                 for (len = size; len--; hd += 2) {
 766                         digit = (FULL) *h2++;
 767                         if (digit == 0)
 768                                 continue;
 769                         h3 = h2;
 770                         h1 = hd;
 771                         carry = 0;
 772                         len1 = len;
 773                         while (len1 >= 4) {     /* expand the loop some */
 774                                 len1 -= 4;
 775                                 sival.ivalue = (digit * ((FULL) *h3++))
 776                                         + ((FULL) *h1) + carry;
 777                                 *h1++ = sival.silow;
 778                                 sival.ivalue = (digit * ((FULL) *h3++))
 779                                         + ((FULL) *h1) + ((FULL) sival.sihigh);
 780                                 *h1++ = sival.silow;
 781                                 sival.ivalue = (digit * ((FULL) *h3++))
 782                                         + ((FULL) *h1) + ((FULL) sival.sihigh);
 783                                 *h1++ = sival.silow;
 784                                 sival.ivalue = (digit * ((FULL) *h3++))
 785                                         + ((FULL) *h1) + ((FULL) sival.sihigh);
 786                                 *h1++ = sival.silow;
 787                                 carry = sival.sihigh;
 788                         }
 789                         while (len1--) {
 790                                 sival.ivalue = (digit * ((FULL) *h3++))
 791                                         + ((FULL) *h1) + carry;
 792                                 *h1++ = sival.silow;
 793                                 carry = sival.sihigh;
 794                         }
 795                         while (carry) {
 796                                 sival.ivalue = ((FULL) *h1) + carry;
 797                                 *h1++ = sival.silow;
 798                                 carry = sival.sihigh;
 799                         }
 800                 }
 801
 802                 /*
 803                  * Now double the result.
 804                  * There is no final carry to worry about because we
 805                  * handle all digits of the result which must fit.
 806                  */
 807                 carry = 0;
 808                 hd = ans;
 809                 len = sizetotal;
 810                 while (len--) {
 811                         digit = ((FULL) *hd);
 812                         sival.ivalue = digit + digit + carry;
 813                         /* ignore Saber-C warning #112 - get ushort from uint */
 814                         /*        ok to ignore on name dosquare`sival */
 815                         *hd++ = sival.silow;
 816                         carry = sival.sihigh;
 817                 }
 818
 819                 /*
 820                  * Now add in the squares of each halfword.
 821                  */
 822                 carry = 0;
 823                 hd = ans;
 824                 h3 = vp;
 825                 len = size;
 826                 while (len--) {
 827                         digit = ((FULL) *h3++);
 828                         sival.ivalue = digit * digit + ((FULL) *hd) + carry;
 829                         *hd++ = sival.silow;
 830                         carry = sival.sihigh;
 831                         sival.ivalue = ((FULL) *hd) + carry;
 832                         *hd++ = sival.silow;
 833                         carry = sival.sihigh;
 834                 }
 835                 while (carry) {
 836                         sival.ivalue = ((FULL) *hd) + carry;
 837                         *hd++ = sival.silow;
 838                         carry = sival.sihigh;
 839                 }
 840
 841                 /*
 842                  * Finally return the size of the result.
 843                  */
 844                 len = sizetotal;
 845                 hd = &ans[len - 1];
 846                 while ((*hd == 0) && (len > 1)) {
 847                         len--;
 848                         hd--;
 849                 }
 850                 return len;
 851         }
 852
 853         /*
 854          * The number to be squared is large.
 855          * Allocate temporary space and determine the sizes and
 856          * positions of the values to be calculated.
 857          */
 858         temp = tempbuf;
 859         tempbuf += (3 * (size + 1) / 2);
 860
 861         sizeA = size / 2;
 862         sizeB = size - sizeA;
 863         shift = sizeB;
 864         baseA = vp + sizeB;
 865         baseB = vp;
 866         baseAA = &ans[shift * 2];
 867         baseBB = ans;
 868         baseAABB = temp;
 869         baseAB = temp;
 870         baseABAB = &temp[shift];
 871
 872         /*
 873          * Trim the second number of leading zeroes.
 874          */
 875         hd = &baseB[sizeB - 1];
 876         while ((*hd == 0) && (sizeB > 1)) {
 877                 sizeB--;
 878                 hd--;
 879         }
 880
 881         /*
 882          * Now to proceed to calculate the result using the formula.
 883          *      (A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
 884          * The steps are done in the following order:
 885          *      S^2*A^2 + B^2
 886          *      A^2 + B^2
 887          *      (S^2+S)*A^2 + (S+1)*B^2
 888          *      (A-B)^2
 889          *      (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
 890          *
 891          * Begin by forming the squares of two the halfs concatenated
 892          * together in the final result location.  Make sure that the
 893          * highest words of the results are zero.
 894          */
 895         sizeBB = dosquare(baseB, sizeB, baseBB);
 896         hd = &baseBB[sizeBB];
 897         len = shift * 2 - sizeBB;
 898         while (len--)
 899                 *hd++ = 0;
 900
 901         sizeAA = dosquare(baseA, sizeA, baseAA);
 902         hd = &baseAA[sizeAA];
 903         len = sizetotal - shift * 2 - sizeAA;
 904         while (len--)
 905                 *hd++ = 0;
 906
 907         /*
 908          * Sum the two squares into a temporary location.
 909          */
 910         if (sizeAA >= sizeBB) {
 911                 h1 = baseAA;
 912                 h2 = baseBB;
 913                 sizeAABB = sizeAA;
 914                 sumsize = sizeBB;
 915         } else {
 916                 h1 = baseBB;
 917                 h2 = baseAA;
 918                 sizeAABB = sizeBB;
 919                 sumsize = sizeAA;
 920         }
 921         copysize = sizeAABB - sumsize;
 922
 923         hd = baseAABB;
 924         carry = 0;
 925         while (sumsize--) {
 926                 sival.ivalue = ((FULL) *h1++) + ((FULL) *h2++) + carry;
 927                 *hd++ = sival.silow;
 928                 carry = sival.sihigh;
 929         }
 930         while (copysize--) {
 931                 sival.ivalue = ((FULL) *h1++) + carry;
 932                 *hd++ = sival.silow;
 933                 carry = sival.sihigh;
 934         }
 935         if (carry) {
 936                 *hd = (HALF)carry;
 937                 sizeAABB++;
 938         }
 939
 940         /*
 941          * Add the sum back into the previously calculated squares
 942          * shifted over to the proper location.
 943          */
 944         h1 = baseAABB;
 945         hd = ans + shift;
 946         carry = 0;
 947         len = sizeAABB;
 948         while (len--) {
 949                 sival.ivalue = ((FULL) *hd) + ((FULL) *h1++) + carry;
 950                 *hd++ = sival.silow;
 951                 carry = sival.sihigh;
 952         }
 953         while (carry) {
 954                 /* uninitialized memory read (see zsquare) */
 955                 sival.ivalue = ((FULL) *hd) + carry;
 956                 *hd++ = sival.silow;
 957                 carry = sival.sihigh;
 958         }
 959
 960         /*
 961          * Calculate the absolute value of the difference of the two halfs
 962          * into a temporary location.
 963          */
 964         if (sizeA == sizeB) {
 965                 len = sizeA;
 966                 h1 = &baseA[len - 1];
 967                 h2 = &baseB[len - 1];
 968                 while ((len > 1) && (*h1 == *h2)) {
 969                         len--;
 970                         h1--;
 971                         h2--;
 972                 }
 973         }
 974         if ((sizeA > sizeB) || ((sizeA == sizeB) && (*h1 > *h2))) {
 975                 h1 = baseA;
 976                 h2 = baseB;
 977                 sizeAB = sizeA;
 978                 subsize = sizeB;
 979         } else {
 980                 h1 = baseB;
 981                 h2 = baseA;
 982                 sizeAB = sizeB;
 983                 subsize = sizeA;
 984         }
 985         copysize = sizeAB - subsize;
 986
 987         hd = baseAB;
 988         carry = 0;
 989         while (subsize--) {
 990                 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
 991                 *hd++ = (HALF)(BASE1 - sival.silow);
 992                 carry = sival.sihigh;
 993         }
 994         while (copysize--) {
 995                 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
 996                 *hd++ = (HALF)(BASE1 - sival.silow);
 997                 carry = sival.sihigh;
 998         }
 999
1000         hd = &baseAB[sizeAB - 1];
1001         while ((*hd == 0) && (sizeAB > 1)) {
1002                 sizeAB--;
1003                 hd--;
1004         }
1005
1006         /*
1007          * Now square the number into another temporary location,
1008          * and subtract that from the final result.
1009          */
1010         sizeABAB = dosquare(baseAB, sizeAB, baseABAB);
1011
1012         h1 = baseABAB;
1013         hd = ans + shift;
1014         carry = 0;
1015         while (sizeABAB--) {
1016                 sival.ivalue = BASE1 - ((FULL) *hd) + ((FULL) *h1++) + carry;
1017                 *hd++ = (HALF)(BASE1 - sival.silow);
1018                 carry = sival.sihigh;
1019         }
1020         while (carry) {
1021                 sival.ivalue = BASE1 - ((FULL) *hd) + carry;
1022                 *hd++ = (HALF)(BASE1 - sival.silow);
1023                 carry = sival.sihigh;
1024         }
1025
1026         /*
1027          * Return the size of the result.
1028          */
1029         len = sizetotal;
1030         hd = &ans[len - 1];
1031         while ((*hd == 0) && (len > 1)) {
1032                 len--;
1033                 hd--;
1034         }
1035         tempbuf = temp;
1036         return len;
1037 }
1038
1039
1040 /*
1041  * Return a pointer to a buffer to be used for holding a temporary number.
1042  * The buffer will be at least as large as the specified number of HALFs,
1043  * and remains valid until the next call to this routine.  The buffer cannot
1044  * be freed by the caller.  There is only one temporary buffer, and so as to
1045  * avoid possible conflicts this is only used by the lowest level routines
1046  * such as divide, multiply, and square.
1047  *
1048  * given:
1049  *      len             required number of HALFs in buffer
1050  */
1051 HALF *
1052 zalloctemp(LEN len)
1053 {
1054         HALF *hp;
1055         STATIC LEN buflen;      /* current length of temp buffer */
1056         STATIC HALF *bufptr;    /* pointer to current temp buffer */
1057
1058         if (len <= buflen)
1059                 return bufptr;
1060
1061         /*
1062          * We need to grow the temporary buffer.
1063          * First free any existing buffer, and then allocate the new one.
1064          * While we are at it, make the new buffer bigger than necessary
1065          * in order to reduce the number of reallocations.
1066          */
1067         len += 100;
1068         if (buflen) {
1069                 buflen = 0;
1070                 free(bufptr);
1071         }
1072         /* don't call alloc() because _math_abort_ may not be set right */
1073         hp = (HALF *) malloc((len+1) * sizeof(HALF));
1074         if (hp == NULL) {
1075                 math_error("No memory for temp buffer");
1076                 /*NOTREACHED*/
1077         }
1078         bufptr = hp;
1079         buflen = len;
1080         return hp;
1081 }