modified: SpatialOmicsCoord.py
[GalaxyCodeBases.git] / c_cpp / etc / calc / zmul.c
blob837f4570d1b0a781d30a40d8f30c2e85f57f2191
1 /*
2 * zmul - faster than usual multiplying and squaring routines
4 * Copyright (C) 1999-2007 David I. Bell
6 * Calc is open software; you can redistribute it and/or modify it under
7 * the terms of the version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
10 * Calc is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General
13 * Public License for more details.
15 * A copy of version 2.1 of the GNU Lesser General Public License is
16 * distributed with calc under the filename COPYING-LGPL. You should have
17 * received a copy with calc; if not, write to Free Software Foundation, Inc.
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * @(#) $Revision: 30.1 $
21 * @(#) $Id: zmul.c,v 30.1 2007/03/16 11:09:46 chongo Exp $
22 * @(#) $Source: /usr/local/src/bin/calc/RCS/zmul.c,v $
24 * Under source code control: 1991/01/09 20:01:31
25 * File existed as early as: 1991
27 * Share and enjoy! :-) http://www.isthe.com/chongo/tech/comp/calc/
31 * Faster than usual multiplying and squaring routines.
32 * The algorithm used is the reasonably simple one from Knuth, volume 2,
33 * section 4.3.3. These recursive routines are of speed O(N^1.585)
34 * instead of O(N^2). The usual multiplication and (almost usual) squaring
35 * algorithms are used for small numbers. On a 386 with its compiler, the
36 * two algorithms are equal in speed at about 100 decimal digits.
40 #include "config.h"
41 #include "zmath.h"
44 STATIC HALF *tempbuf; /* temporary buffer for multiply and square */
46 S_FUNC LEN domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans);
47 S_FUNC LEN dosquare(HALF *vp, LEN size, HALF *ans);
51 * Multiply two numbers using the following formula recursively:
52 * (A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
53 * where S is a power of 2^16, and so multiplies by it are shifts, and
54 * A,B,C,D are the left and right halfs of the numbers to be multiplied.
56 * given:
57 * z1 numbers to multiply
58 * z2 numbers to multiply
59 * res result of multiplication
61 void
62 zmul(ZVALUE z1, ZVALUE z2, ZVALUE *res)
64 LEN len; /* size of array */
66 if (ziszero(z1) || ziszero(z2)) {
67 *res = _zero_;
68 return;
70 if (zisunit(z1)) {
71 zcopy(z2, res);
72 res->sign = (z1.sign != z2.sign);
73 return;
75 if (zisunit(z2)) {
76 zcopy(z1, res);
77 res->sign = (z1.sign != z2.sign);
78 return;
82 * Allocate a temporary buffer for the recursion levels to use.
83 * An array needs to be allocated large enough for all of the
84 * temporary results to fit in. This size is about twice the size
85 * of the largest original number, since each recursion level uses
86 * the size of its given number, and whose size is 1/2 the size of
87 * the previous level. The sum of the infinite series is 2.
88 * Add some extra words because of rounding when dividing by 2
89 * and also because of the extra word that each multiply needs.
91 len = z1.len;
92 if (len < z2.len)
93 len = z2.len;
94 len = 2 * len + 64;
95 tempbuf = zalloctemp(len);
97 res->sign = (z1.sign != z2.sign);
98 res->v = alloc(z1.len + z2.len + 2);
99 res->len = domul(z1.v, z1.len, z2.v, z2.len, res->v);
104 * Recursive routine to multiply two numbers by splitting them up into
105 * two numbers of half the size, and using the results of multiplying the
106 * subpieces. The result is placed in the indicated array, which must be
107 * large enough for the result plus one extra word (size1 + size2 + 1).
108 * Returns the actual size of the result with leading zeroes stripped.
109 * This also uses a temporary array which must be twice as large as
110 * one more than the size of the number at the top level recursive call.
112 * given:
113 * v1 first number
114 * size1 size of first number
115 * v2 second number
116 * size2 size of second number
117 * ans location for result
119 S_FUNC LEN
120 domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans)
122 LEN shift; /* amount numbers are shifted by */
123 LEN sizeA; /* size of left half of first number */
124 LEN sizeB; /* size of right half of first number */
125 LEN sizeC; /* size of left half of second number */
126 LEN sizeD; /* size of right half of second number */
127 LEN sizeAB; /* size of subtraction of A and B */
128 LEN sizeDC; /* size of subtraction of D and C */
129 LEN sizeABDC; /* size of product of above two results */
130 LEN subsize; /* size of difference of halfs */
131 LEN copysize; /* size of number left to copy */
132 LEN sizetotal; /* total size of product */
133 LEN len; /* temporary length */
134 HALF *baseA; /* base of left half of first number */
135 HALF *baseB; /* base of right half of first number */
136 HALF *baseC; /* base of left half of second number */
137 HALF *baseD; /* base of right half of second number */
138 HALF *baseAB; /* base of result of subtraction of A and B */
139 HALF *baseDC; /* base of result of subtraction of D and C */
140 HALF *baseABDC; /* base of product of above two results */
141 HALF *baseAC; /* base of product of A and C */
142 HALF *baseBD; /* base of product of B and D */
143 FULL carry; /* carry digit for small multiplications */
144 FULL carryACBD; /* carry from addition of A*C and B*D */
145 FULL digit; /* single digit multiplying by */
146 HALF *temp; /* base for temporary calculations */
147 BOOL neg; /* whether imtermediate term is negative */
148 register HALF *hd, *h1=NULL, *h2=NULL; /* for inner loops */
149 SIUNION sival; /* for addition of digits */
152 * Trim the numbers of leading zeroes and initialize the
153 * estimated size of the result.
155 hd = &v1[size1 - 1];
156 while ((*hd == 0) && (size1 > 1)) {
157 hd--;
158 size1--;
160 hd = &v2[size2 - 1];
161 while ((*hd == 0) && (size2 > 1)) {
162 hd--;
163 size2--;
165 sizetotal = size1 + size2;
168 * First check for zero answer.
170 if (((size1 == 1) && (*v1 == 0)) || ((size2 == 1) && (*v2 == 0))) {
171 *ans = 0;
172 return 1;
176 * Exchange the two numbers if necessary to make the number of
177 * digits of the first number be greater than or equal to the
178 * second number.
180 if (size1 < size2) {
181 len = size1; size1 = size2; size2 = len;
182 hd = v1; v1 = v2; v2 = hd;
186 * If the smaller number has only a few digits, then calculate
187 * the result in the normal manner in order to avoid the overhead
188 * of the recursion for small numbers. The number of digits where
189 * the algorithm changes is settable from 2 to maxint.
191 if (size2 < conf->mul2) {
193 * First clear the top part of the result, and then multiply
194 * by the lowest digit to get the first partial sum. Later
195 * products will then add into this result.
197 hd = &ans[size1];
198 len = size2;
199 while (len--)
200 *hd++ = 0;
202 digit = *v2++;
203 h1 = v1;
204 hd = ans;
205 carry = 0;
206 len = size1;
207 while (len >= 4) { /* expand the loop some */
208 len -= 4;
209 sival.ivalue = ((FULL) *h1++) * digit + carry;
210 /* ignore Saber-C warning #112 - get ushort from uint */
211 /* ok to ignore on name domul`sival */
212 *hd++ = sival.silow;
213 carry = sival.sihigh;
214 sival.ivalue = ((FULL) *h1++) * digit + carry;
215 *hd++ = sival.silow;
216 carry = sival.sihigh;
217 sival.ivalue = ((FULL) *h1++) * digit + carry;
218 *hd++ = sival.silow;
219 carry = sival.sihigh;
220 sival.ivalue = ((FULL) *h1++) * digit + carry;
221 *hd++ = sival.silow;
222 carry = sival.sihigh;
224 while (len--) {
225 sival.ivalue = ((FULL) *h1++) * digit + carry;
226 *hd++ = sival.silow;
227 carry = sival.sihigh;
229 *hd = (HALF)carry;
232 * Now multiply by the remaining digits of the second number,
233 * adding each product into the final result.
235 h2 = ans;
236 while (--size2 > 0) {
237 digit = *v2++;
238 h1 = v1;
239 hd = ++h2;
240 if (digit == 0)
241 continue;
242 carry = 0;
243 len = size1;
244 while (len >= 4) { /* expand the loop some */
245 len -= 4;
246 sival.ivalue = ((FULL) *h1++) * digit
247 + ((FULL) *hd) + carry;
248 *hd++ = sival.silow;
249 carry = sival.sihigh;
250 sival.ivalue = ((FULL) *h1++) * digit
251 + ((FULL) *hd) + carry;
252 *hd++ = sival.silow;
253 carry = sival.sihigh;
254 sival.ivalue = ((FULL) *h1++) * digit
255 + ((FULL) *hd) + carry;
256 *hd++ = sival.silow;
257 carry = sival.sihigh;
258 sival.ivalue = ((FULL) *h1++) * digit
259 + ((FULL) *hd) + carry;
260 *hd++ = sival.silow;
261 carry = sival.sihigh;
263 while (len--) {
264 sival.ivalue = ((FULL) *h1++) * digit
265 + ((FULL) *hd) + carry;
266 *hd++ = sival.silow;
267 carry = sival.sihigh;
269 while (carry) {
270 sival.ivalue = ((FULL) *hd) + carry;
271 *hd++ = sival.silow;
272 carry = sival.sihigh;
277 * Now return the true size of the number.
279 len = sizetotal;
280 hd = &ans[len - 1];
281 while ((*hd == 0) && (len > 1)) {
282 hd--;
283 len--;
285 return len;
289 * Need to multiply by a large number.
290 * Allocate temporary space for calculations, and calculate the
291 * value for the shift. The shift value is 1/2 the size of the
292 * larger (first) number (rounded up). The amount of temporary
293 * space needed is twice the size of the shift, plus one more word
294 * for the multiply to use.
296 shift = (size1 + 1) / 2;
297 temp = tempbuf;
298 tempbuf += (2 * shift) + 1;
301 * Determine the sizes and locations of all the numbers.
302 * The value of sizeC can be negative, and this is checked later.
303 * The value of sizeD is limited by the full size of the number.
305 baseA = v1 + shift;
306 baseB = v1;
307 baseC = v2 + ((shift <= size2) ? shift : size2);
308 baseD = v2;
309 baseAB = ans;
310 baseDC = ans + shift;
311 baseAC = ans + shift * 2;
312 baseBD = ans;
314 sizeA = size1 - shift;
315 sizeC = size2 - shift;
317 sizeB = shift;
318 hd = &baseB[shift - 1];
319 while ((*hd == 0) && (sizeB > 1)) {
320 hd--;
321 sizeB--;
324 sizeD = shift;
325 if (sizeD > size2)
326 sizeD = size2;
327 hd = &baseD[sizeD - 1];
328 while ((*hd == 0) && (sizeD > 1)) {
329 hd--;
330 sizeD--;
334 * If the smaller number has a high half of zero, then calculate
335 * the result by breaking up the first number into two numbers
336 * and combining the results using the obvious formula:
337 * (A*S+B) * D = (A*D)*S + B*D
339 if (sizeC <= 0) {
340 len = domul(baseB, sizeB, baseD, sizeD, ans);
341 hd = &ans[len];
342 len = sizetotal - len;
343 while (len--)
344 *hd++ = 0;
347 * Add the second number into the first number, shifted
348 * over at the correct position.
350 len = domul(baseA, sizeA, baseD, sizeD, temp);
351 h1 = temp;
352 hd = ans + shift;
353 carry = 0;
354 while (len--) {
355 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
356 *hd++ = sival.silow;
357 carry = sival.sihigh;
359 while (carry) {
360 sival.ivalue = ((FULL) *hd) + carry;
361 *hd++ = sival.silow;
362 carry = sival.sihigh;
366 * Determine the final size of the number and return it.
368 len = sizetotal;
369 hd = &ans[len - 1];
370 while ((*hd == 0) && (len > 1)) {
371 hd--;
372 len--;
374 tempbuf = temp;
375 return len;
379 * Now we know that the high halfs of the numbers are nonzero,
380 * so we can use the complete formula.
381 * (A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D.
382 * The steps are done in the following order:
383 * A-B
384 * D-C
385 * (A-B)*(D-C)
386 * S^2*A*C + B*D
387 * (S^2+S)*A*C + (S+1)*B*D (*)
388 * (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
390 * Note: step (*) above can produce a result which is larger than
391 * the final product will be, and this is where the extra word
392 * needed in the product comes from. After the final subtraction is
393 * done, the result fits in the expected size. Using the extra word
394 * is easier than suppressing the carries and borrows everywhere.
396 * Begin by forming the product (A-B)*(D-C) into a temporary
397 * location that we save until the final step. Do each subtraction
398 * at positions 0 and S. Be very careful about the relative sizes
399 * of the numbers since this result can be negative. For the first
400 * step calculate the absolute difference of A and B into a temporary
401 * location at position 0 of the result. Negate the sign if A is
402 * smaller than B.
404 neg = FALSE;
405 if (sizeA == sizeB) {
406 len = sizeA;
407 h1 = &baseA[len - 1];
408 h2 = &baseB[len - 1];
409 while ((len > 1) && (*h1 == *h2)) {
410 len--;
411 h1--;
412 h2--;
415 if ((sizeA > sizeB) || ((sizeA == sizeB) && h1 && h2 && (*h1 > *h2))) {
416 h1 = baseA;
417 h2 = baseB;
418 sizeAB = sizeA;
419 subsize = sizeB;
420 } else {
421 neg = !neg;
422 h1 = baseB;
423 h2 = baseA;
424 sizeAB = sizeB;
425 subsize = sizeA;
427 copysize = sizeAB - subsize;
429 hd = baseAB;
430 carry = 0;
431 while (subsize--) {
432 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
433 *hd++ = (HALF)(BASE1 - sival.silow);
434 carry = sival.sihigh;
436 while (copysize--) {
437 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
438 *hd++ = (HALF)(BASE1 - sival.silow);
439 carry = sival.sihigh;
442 hd = &baseAB[sizeAB - 1];
443 while ((*hd == 0) && (sizeAB > 1)) {
444 hd--;
445 sizeAB--;
449 * This completes the calculation of abs(A-B). For the next step
450 * calculate the absolute difference of D and C into a temporary
451 * location at position S of the result. Negate the sign if C is
452 * larger than D.
454 if (sizeC == sizeD) {
455 len = sizeC;
456 h1 = &baseC[len - 1];
457 h2 = &baseD[len - 1];
458 while ((len > 1) && (*h1 == *h2)) {
459 len--;
460 h1--;
461 h2--;
464 if ((sizeC > sizeD) || ((sizeC == sizeD) && (*h1 > *h2))) {
465 neg = !neg;
466 h1 = baseC;
467 h2 = baseD;
468 sizeDC = sizeC;
469 subsize = sizeD;
470 } else {
471 h1 = baseD;
472 h2 = baseC;
473 sizeDC = sizeD;
474 subsize = sizeC;
476 copysize = sizeDC - subsize;
478 hd = baseDC;
479 carry = 0;
480 while (subsize--) {
481 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
482 *hd++ = (HALF)(BASE1 - sival.silow);
483 carry = sival.sihigh;
485 while (copysize--) {
486 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
487 *hd++ = (HALF)(BASE1 - sival.silow);
488 carry = sival.sihigh;
490 hd = &baseDC[sizeDC - 1];
491 while ((*hd == 0) && (sizeDC > 1)) {
492 hd--;
493 sizeDC--;
497 * This completes the calculation of abs(D-C). Now multiply
498 * together abs(A-B) and abs(D-C) into a temporary location,
499 * which is preserved until the final steps.
501 baseABDC = temp;
502 sizeABDC = domul(baseAB, sizeAB, baseDC, sizeDC, baseABDC);
505 * Now calculate B*D and A*C into one of their two final locations.
506 * Make sure the high order digits of the products are zeroed since
507 * this initializes the final result. Be careful about this zeroing
508 * since the size of the high order words might be smaller than
509 * the shift size. Do B*D first since the multiplies use one more
510 * word than the size of the product. Also zero the final extra
511 * word in the result for possible carries to use.
513 len = domul(baseB, sizeB, baseD, sizeD, baseBD);
514 hd = &baseBD[len];
515 len = shift * 2 - len;
516 while (len--)
517 *hd++ = 0;
519 len = domul(baseA, sizeA, baseC, sizeC, baseAC);
520 hd = &baseAC[len];
521 len = sizetotal - shift * 2 - len + 1;
522 while (len--)
523 *hd++ = 0;
526 * Now add in A*C and B*D into themselves at the other shifted
527 * position that they need. This addition is tricky in order to
528 * make sure that the two additions cannot interfere with each other.
529 * Therefore we first add in the top half of B*D and the lower half
530 * of A*C. The sources and destinations of these two additions
531 * overlap, and so the same answer results from the two additions,
532 * thus only two pointers suffice for both additions. Keep the
533 * final carry from these additions for later use since we cannot
534 * afford to change the top half of A*C yet.
536 h1 = baseBD + shift;
537 h2 = baseAC;
538 carryACBD = 0;
539 len = shift;
540 while (len--) {
541 sival.ivalue = ((FULL) *h1) + ((FULL) *h2) + carryACBD;
542 *h1++ = sival.silow;
543 *h2++ = sival.silow;
544 carryACBD = sival.sihigh;
548 * Now add in the bottom half of B*D and the top half of A*C.
549 * These additions are straightforward, except that A*C should
550 * be done first because of possible carries from B*D, and the
551 * top half of A*C might not exist. Add in one of the carries
552 * from the previous addition while we are at it.
554 h1 = baseAC + shift;
555 hd = baseAC;
556 carry = carryACBD;
557 len = sizetotal - 3 * shift;
558 while (len--) {
559 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
560 *hd++ = sival.silow;
561 carry = sival.sihigh;
563 while (carry) {
564 sival.ivalue = ((FULL) *hd) + carry;
565 *hd++ = sival.silow;
566 carry = sival.sihigh;
569 h1 = baseBD;
570 hd = baseBD + shift;
571 carry = 0;
572 len = shift;
573 while (len--) {
574 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
575 *hd++ = sival.silow;
576 carry = sival.sihigh;
578 while (carry) {
579 sival.ivalue = ((FULL) *hd) + carry;
580 *hd++ = sival.silow;
581 carry = sival.sihigh;
585 * Now finally add in the other delayed carry from the
586 * above addition.
588 hd = baseAC + shift;
589 while (carryACBD) {
590 sival.ivalue = ((FULL) *hd) + carryACBD;
591 *hd++ = sival.silow;
592 carryACBD = sival.sihigh;
596 * Now finally add or subtract (A-B)*(D-C) into the final result at
597 * the correct position (S), according to whether it is positive or
598 * negative. When subtracting, the answer cannot go negative.
600 h1 = baseABDC;
601 hd = ans + shift;
602 carry = 0;
603 len = sizeABDC;
604 if (neg) {
605 while (len--) {
606 sival.ivalue = BASE1 - ((FULL) *hd) +
607 ((FULL) *h1++) + carry;
608 *hd++ = (HALF)(BASE1 - sival.silow);
609 carry = sival.sihigh;
611 while (carry) {
612 sival.ivalue = BASE1 - ((FULL) *hd) + carry;
613 *hd++ = (HALF)(BASE1 - sival.silow);
614 carry = sival.sihigh;
616 } else {
617 while (len--) {
618 sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
619 *hd++ = sival.silow;
620 carry = sival.sihigh;
622 while (carry) {
623 sival.ivalue = ((FULL) *hd) + carry;
624 *hd++ = sival.silow;
625 carry = sival.sihigh;
630 * Finally determine the size of the final result and return that.
632 len = sizetotal;
633 hd = &ans[len - 1];
634 while ((*hd == 0) && (len > 1)) {
635 hd--;
636 len--;
638 tempbuf = temp;
639 return len;
644 * Square a number by using the following formula recursively:
645 * (A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2
646 * where S is a power of 2^16, and so multiplies by it are shifts,
647 * and A and B are the left and right halfs of the number to square.
649 void
650 zsquare(ZVALUE z, ZVALUE *res)
652 LEN len;
654 if (ziszero(z)) {
655 *res = _zero_;
656 return;
658 if (zisunit(z)) {
659 *res = _one_;
660 return;
664 * Allocate a temporary array if necessary for the recursion to use.
665 * The array needs to be allocated large enough for all of the
666 * temporary results to fit in. This size is about 3 times the
667 * size of the original number, since each recursion level uses 3/2
668 * of the size of its given number, and whose size is 1/2 the size
669 * of the previous level. The sum of the infinite series is 3.
670 * Allocate some extra words for rounding up the sizes.
672 len = 3 * z.len + 32;
673 tempbuf = zalloctemp(len);
675 res->sign = 0;
676 res->v = alloc((z.len+2) * 2);
678 * Without the memset below, Purify reports that dosquare()
679 * will read uninitialized memory at the dosquare() line below
680 * the comment:
682 * uninitialized memory read (see zsquare)
684 * This problem occurs during regression test #622 and may
685 * be duplicated by executing:
687 * config("sq2", 2);
688 * 0xffff0000ffffffff00000000ffff0000000000000000ffff^2;
690 memset((char *)res->v, 0, ((z.len+2) * 2)*sizeof(HALF));
691 res->len = dosquare(z.v, z.len, res->v);
696 * Recursive routine to square a number by splitting it up into two numbers
697 * of half the size, and using the results of squaring the subpieces.
698 * The result is placed in the indicated array, which must be large
699 * enough for the result (size * 2). Returns the size of the result.
700 * This uses a temporary array which must be 3 times as large as the
701 * size of the number at the top level recursive call.
703 * given:
704 * vp value to be squared
705 * size length of value to square
706 * ans location for result
708 S_FUNC LEN
709 dosquare(HALF *vp, LEN size, HALF *ans)
711 LEN shift; /* amount numbers are shifted by */
712 LEN sizeA; /* size of left half of number to square */
713 LEN sizeB; /* size of right half of number to square */
714 LEN sizeAA; /* size of square of left half */
715 LEN sizeBB; /* size of square of right half */
716 LEN sizeAABB; /* size of sum of squares of A and B */
717 LEN sizeAB; /* size of difference of A and B */
718 LEN sizeABAB; /* size of square of difference of A and B */
719 LEN subsize; /* size of difference of halfs */
720 LEN copysize; /* size of number left to copy */
721 LEN sumsize; /* size of sum */
722 LEN sizetotal; /* total size of square */
723 LEN len; /* temporary length */
724 LEN len1; /* another temporary length */
725 FULL carry; /* carry digit for small multiplications */
726 FULL digit; /* single digit multiplying by */
727 HALF *temp; /* base for temporary calculations */
728 HALF *baseA; /* base of left half of number */
729 HALF *baseB; /* base of right half of number */
730 HALF *baseAA; /* base of square of left half of number */
731 HALF *baseBB; /* base of square of right half of number */
732 HALF *baseAABB; /* base of sum of squares of A and B */
733 HALF *baseAB; /* base of difference of A and B */
734 HALF *baseABAB; /* base of square of difference of A and B */
735 register HALF *hd, *h1, *h2, *h3; /* for inner loops */
736 SIUNION sival; /* for addition of digits */
739 * First trim the number of leading zeroes.
741 hd = &vp[size - 1];
742 while ((*hd == 0) && (size > 1)) {
743 size--;
744 hd--;
746 sizetotal = size + size;
749 * If the number has only a small number of digits, then use the
750 * (almost) normal multiplication method. Multiply each halfword
751 * only by those halfwards further on in the number. Missed terms
752 * will then be the same pairs of products repeated, and the squares
753 * of each halfword. The first case is handled by doubling the
754 * result. The second case is handled explicitly. The number of
755 * digits where the algorithm changes is settable from 2 to maxint.
757 if (size < conf->sq2) {
758 hd = ans;
759 len = sizetotal;
760 while (len--)
761 *hd++ = 0;
763 h2 = vp;
764 hd = ans + 1;
765 for (len = size; len--; hd += 2) {
766 digit = (FULL) *h2++;
767 if (digit == 0)
768 continue;
769 h3 = h2;
770 h1 = hd;
771 carry = 0;
772 len1 = len;
773 while (len1 >= 4) { /* expand the loop some */
774 len1 -= 4;
775 sival.ivalue = (digit * ((FULL) *h3++))
776 + ((FULL) *h1) + carry;
777 *h1++ = sival.silow;
778 sival.ivalue = (digit * ((FULL) *h3++))
779 + ((FULL) *h1) + ((FULL) sival.sihigh);
780 *h1++ = sival.silow;
781 sival.ivalue = (digit * ((FULL) *h3++))
782 + ((FULL) *h1) + ((FULL) sival.sihigh);
783 *h1++ = sival.silow;
784 sival.ivalue = (digit * ((FULL) *h3++))
785 + ((FULL) *h1) + ((FULL) sival.sihigh);
786 *h1++ = sival.silow;
787 carry = sival.sihigh;
789 while (len1--) {
790 sival.ivalue = (digit * ((FULL) *h3++))
791 + ((FULL) *h1) + carry;
792 *h1++ = sival.silow;
793 carry = sival.sihigh;
795 while (carry) {
796 sival.ivalue = ((FULL) *h1) + carry;
797 *h1++ = sival.silow;
798 carry = sival.sihigh;
803 * Now double the result.
804 * There is no final carry to worry about because we
805 * handle all digits of the result which must fit.
807 carry = 0;
808 hd = ans;
809 len = sizetotal;
810 while (len--) {
811 digit = ((FULL) *hd);
812 sival.ivalue = digit + digit + carry;
813 /* ignore Saber-C warning #112 - get ushort from uint */
814 /* ok to ignore on name dosquare`sival */
815 *hd++ = sival.silow;
816 carry = sival.sihigh;
820 * Now add in the squares of each halfword.
822 carry = 0;
823 hd = ans;
824 h3 = vp;
825 len = size;
826 while (len--) {
827 digit = ((FULL) *h3++);
828 sival.ivalue = digit * digit + ((FULL) *hd) + carry;
829 *hd++ = sival.silow;
830 carry = sival.sihigh;
831 sival.ivalue = ((FULL) *hd) + carry;
832 *hd++ = sival.silow;
833 carry = sival.sihigh;
835 while (carry) {
836 sival.ivalue = ((FULL) *hd) + carry;
837 *hd++ = sival.silow;
838 carry = sival.sihigh;
842 * Finally return the size of the result.
844 len = sizetotal;
845 hd = &ans[len - 1];
846 while ((*hd == 0) && (len > 1)) {
847 len--;
848 hd--;
850 return len;
854 * The number to be squared is large.
855 * Allocate temporary space and determine the sizes and
856 * positions of the values to be calculated.
858 temp = tempbuf;
859 tempbuf += (3 * (size + 1) / 2);
861 sizeA = size / 2;
862 sizeB = size - sizeA;
863 shift = sizeB;
864 baseA = vp + sizeB;
865 baseB = vp;
866 baseAA = &ans[shift * 2];
867 baseBB = ans;
868 baseAABB = temp;
869 baseAB = temp;
870 baseABAB = &temp[shift];
873 * Trim the second number of leading zeroes.
875 hd = &baseB[sizeB - 1];
876 while ((*hd == 0) && (sizeB > 1)) {
877 sizeB--;
878 hd--;
882 * Now to proceed to calculate the result using the formula.
883 * (A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
884 * The steps are done in the following order:
885 * S^2*A^2 + B^2
886 * A^2 + B^2
887 * (S^2+S)*A^2 + (S+1)*B^2
888 * (A-B)^2
889 * (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
891 * Begin by forming the squares of two the halfs concatenated
892 * together in the final result location. Make sure that the
893 * highest words of the results are zero.
895 sizeBB = dosquare(baseB, sizeB, baseBB);
896 hd = &baseBB[sizeBB];
897 len = shift * 2 - sizeBB;
898 while (len--)
899 *hd++ = 0;
901 sizeAA = dosquare(baseA, sizeA, baseAA);
902 hd = &baseAA[sizeAA];
903 len = sizetotal - shift * 2 - sizeAA;
904 while (len--)
905 *hd++ = 0;
908 * Sum the two squares into a temporary location.
910 if (sizeAA >= sizeBB) {
911 h1 = baseAA;
912 h2 = baseBB;
913 sizeAABB = sizeAA;
914 sumsize = sizeBB;
915 } else {
916 h1 = baseBB;
917 h2 = baseAA;
918 sizeAABB = sizeBB;
919 sumsize = sizeAA;
921 copysize = sizeAABB - sumsize;
923 hd = baseAABB;
924 carry = 0;
925 while (sumsize--) {
926 sival.ivalue = ((FULL) *h1++) + ((FULL) *h2++) + carry;
927 *hd++ = sival.silow;
928 carry = sival.sihigh;
930 while (copysize--) {
931 sival.ivalue = ((FULL) *h1++) + carry;
932 *hd++ = sival.silow;
933 carry = sival.sihigh;
935 if (carry) {
936 *hd = (HALF)carry;
937 sizeAABB++;
941 * Add the sum back into the previously calculated squares
942 * shifted over to the proper location.
944 h1 = baseAABB;
945 hd = ans + shift;
946 carry = 0;
947 len = sizeAABB;
948 while (len--) {
949 sival.ivalue = ((FULL) *hd) + ((FULL) *h1++) + carry;
950 *hd++ = sival.silow;
951 carry = sival.sihigh;
953 while (carry) {
954 /* uninitialized memory read (see zsquare) */
955 sival.ivalue = ((FULL) *hd) + carry;
956 *hd++ = sival.silow;
957 carry = sival.sihigh;
961 * Calculate the absolute value of the difference of the two halfs
962 * into a temporary location.
964 if (sizeA == sizeB) {
965 len = sizeA;
966 h1 = &baseA[len - 1];
967 h2 = &baseB[len - 1];
968 while ((len > 1) && (*h1 == *h2)) {
969 len--;
970 h1--;
971 h2--;
974 if ((sizeA > sizeB) || ((sizeA == sizeB) && (*h1 > *h2))) {
975 h1 = baseA;
976 h2 = baseB;
977 sizeAB = sizeA;
978 subsize = sizeB;
979 } else {
980 h1 = baseB;
981 h2 = baseA;
982 sizeAB = sizeB;
983 subsize = sizeA;
985 copysize = sizeAB - subsize;
987 hd = baseAB;
988 carry = 0;
989 while (subsize--) {
990 sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
991 *hd++ = (HALF)(BASE1 - sival.silow);
992 carry = sival.sihigh;
994 while (copysize--) {
995 sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
996 *hd++ = (HALF)(BASE1 - sival.silow);
997 carry = sival.sihigh;
1000 hd = &baseAB[sizeAB - 1];
1001 while ((*hd == 0) && (sizeAB > 1)) {
1002 sizeAB--;
1003 hd--;
1007 * Now square the number into another temporary location,
1008 * and subtract that from the final result.
1010 sizeABAB = dosquare(baseAB, sizeAB, baseABAB);
1012 h1 = baseABAB;
1013 hd = ans + shift;
1014 carry = 0;
1015 while (sizeABAB--) {
1016 sival.ivalue = BASE1 - ((FULL) *hd) + ((FULL) *h1++) + carry;
1017 *hd++ = (HALF)(BASE1 - sival.silow);
1018 carry = sival.sihigh;
1020 while (carry) {
1021 sival.ivalue = BASE1 - ((FULL) *hd) + carry;
1022 *hd++ = (HALF)(BASE1 - sival.silow);
1023 carry = sival.sihigh;
1027 * Return the size of the result.
1029 len = sizetotal;
1030 hd = &ans[len - 1];
1031 while ((*hd == 0) && (len > 1)) {
1032 len--;
1033 hd--;
1035 tempbuf = temp;
1036 return len;
1041 * Return a pointer to a buffer to be used for holding a temporary number.
1042 * The buffer will be at least as large as the specified number of HALFs,
1043 * and remains valid until the next call to this routine. The buffer cannot
1044 * be freed by the caller. There is only one temporary buffer, and so as to
1045 * avoid possible conflicts this is only used by the lowest level routines
1046 * such as divide, multiply, and square.
1048 * given:
1049 * len required number of HALFs in buffer
1051 HALF *
1052 zalloctemp(LEN len)
1054 HALF *hp;
1055 STATIC LEN buflen; /* current length of temp buffer */
1056 STATIC HALF *bufptr; /* pointer to current temp buffer */
1058 if (len <= buflen)
1059 return bufptr;
1062 * We need to grow the temporary buffer.
1063 * First free any existing buffer, and then allocate the new one.
1064 * While we are at it, make the new buffer bigger than necessary
1065 * in order to reduce the number of reallocations.
1067 len += 100;
1068 if (buflen) {
1069 buflen = 0;
1070 free(bufptr);
1072 /* don't call alloc() because _math_abort_ may not be set right */
1073 hp = (HALF *) malloc((len+1) * sizeof(HALF));
1074 if (hp == NULL) {
1075 math_error("No memory for temp buffer");
1076 /*NOTREACHED*/
1078 bufptr = hp;
1079 buflen = len;
1080 return hp;