1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
38 Using GMP is optional. Define HAVE_GMP to make this code include GMP
39 factoring code. The GMP factoring code is based on GMP's demos/factorize.c
40 (last synched 2012-09-07). The GMP-based factoring code will stay in GMP
41 factoring code even if numbers get small enough for using the two-word
46 (1) Perform trial division using a small primes table, but without hardware
47 division since the primes table store inverses modulo the word base.
48 (The GMP variant of this code doesn't make use of the precomputed
49 inverses, but instead relies on GMP for fast divisibility testing.)
50 (2) Check the nature of any non-factored part using Miller-Rabin for
51 detecting composites, and Lucas for detecting primes.
52 (3) Factor any remaining composite part using the Pollard-Brent rho
53 algorithm or if USE_SQUFOF is defined to 1, try that first.
54 Status of found factors are checked again using Miller-Rabin and Lucas.
56 We prefer using Hensel norm in the divisions, not the more familiar
57 Euclidian norm, since the former leads to much faster code. In the
58 Pollard-Brent rho code and the prime testing code, we use Montgomery's
59 trick of multiplying all n-residues by the word base, allowing cheap Hensel
64 * Use modular inverses also for exact division in the Lucas code, and
65 elsewhere. A problem is to locate the inverses not from an index, but
66 from a prime. We might instead compute the inverse on-the-fly.
68 * Tune trial division table size (not forgetting that this is a standalone
69 program where the table will be read from disk for each invocation).
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
109 # if !HAVE_DECL_MPZ_INITS
118 #include "full-write.h"
120 #include "readtokens.h"
123 /* The official name of this program (e.g., no 'g' prefix). */
124 #define PROGRAM_NAME "factor"
127 proper_name ("Paul Rubin"), \
128 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
129 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
131 /* Token delimiters when reading from a file. */
132 #define DELIM "\n\t "
134 #ifndef USE_LONGLONG_H
135 /* With the way we use longlong.h, it's only safe to use
136 when UWtype = UHWtype, as there were various cases
137 (as can be seen in the history for longlong.h) where
138 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
139 to avoid compile time or run time issues. */
140 # if LONG_MAX == INTMAX_MAX
141 # define USE_LONGLONG_H 1
147 /* Make definitions for longlong.h to make it do what it can do for us */
149 /* bitcount for uintmax_t */
150 # if UINTMAX_MAX == UINT32_MAX
151 # define W_TYPE_SIZE 32
152 # elif UINTMAX_MAX == UINT64_MAX
153 # define W_TYPE_SIZE 64
154 # elif UINTMAX_MAX == UINT128_MAX
155 # define W_TYPE_SIZE 128
158 # define UWtype uintmax_t
159 # define UHWtype unsigned long int
161 # if HAVE_ATTRIBUTE_MODE
162 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
163 typedef int SItype
__attribute__ ((mode (SI
)));
164 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
165 typedef int DItype
__attribute__ ((mode (DI
)));
166 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
168 typedef unsigned char UQItype
;
170 typedef unsigned long int USItype
;
171 # if HAVE_LONG_LONG_INT
172 typedef long long int DItype
;
173 typedef unsigned long long int UDItype
;
174 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
175 typedef long int DItype
;
176 typedef unsigned long int UDItype
;
179 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
180 # define ASSERT(x) /* FIXME make longlong.h really standalone */
181 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
182 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
183 # ifndef __GMP_GNUC_PREREQ
184 # define __GMP_GNUC_PREREQ(a,b) 1
187 /* These stub macros are only used in longlong.h in certain system compiler
188 combinations, so ensure usage to avoid -Wunused-macros warnings. */
189 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
195 # define HAVE_HOST_CPU_FAMILY_powerpc 1
197 # include "longlong.h"
198 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
199 const unsigned char factor_clz_tab
[129] =
201 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
202 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
203 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
204 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
209 #else /* not USE_LONGLONG_H */
211 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
212 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
213 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
214 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
218 #if !defined __clz_tab && !defined UHWtype
219 /* Without this seemingly useless conditional, gcc -Wunused-macros
220 warns that each of the two tested macros is unused on Fedora 18.
221 FIXME: this is just an ugly band-aid. Fix it properly. */
224 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
225 #define MAX_NFACTS 26
229 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
232 static struct option
const long_options
[] =
234 {"-debug", no_argument
, NULL
, DEV_DEBUG_OPTION
},
235 {GETOPT_HELP_OPTION_DECL
},
236 {GETOPT_VERSION_OPTION_DECL
},
242 uintmax_t plarge
[2]; /* Can have a single large factor */
243 uintmax_t p
[MAX_NFACTS
];
244 unsigned char e
[MAX_NFACTS
];
245 unsigned char nfactors
;
252 unsigned long int *e
;
253 unsigned long int nfactors
;
257 static void factor (uintmax_t, uintmax_t, struct factors
*);
260 # define umul_ppmm(w1, w0, u, v) \
262 uintmax_t __x0, __x1, __x2, __x3; \
263 unsigned long int __ul, __vl, __uh, __vh; \
264 uintmax_t __u = (u), __v = (v); \
266 __ul = __ll_lowpart (__u); \
267 __uh = __ll_highpart (__u); \
268 __vl = __ll_lowpart (__v); \
269 __vh = __ll_highpart (__v); \
271 __x0 = (uintmax_t) __ul * __vl; \
272 __x1 = (uintmax_t) __ul * __vh; \
273 __x2 = (uintmax_t) __uh * __vl; \
274 __x3 = (uintmax_t) __uh * __vh; \
276 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
277 __x1 += __x2; /* but this indeed can */ \
278 if (__x1 < __x2) /* did we get it? */ \
279 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
281 (w1) = __x3 + __ll_highpart (__x1); \
282 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
286 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
287 /* Define our own, not needing normalization. This function is
288 currently not performance critical, so keep it simple. Similar to
289 the mod macro below. */
291 # define udiv_qrnnd(q, r, n1, n0, d) \
293 uintmax_t __d1, __d0, __q, __r1, __r0; \
295 assert ((n1) < (d)); \
296 __d1 = (d); __d0 = 0; \
297 __r1 = (n1); __r0 = (n0); \
299 for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \
301 rsh2 (__d1, __d0, __d1, __d0, 1); \
303 if (ge2 (__r1, __r0, __d1, __d0)) \
306 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
314 #if !defined add_ssaaaa
315 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
318 _add_x = (al) + (bl); \
319 (sh) = (ah) + (bh) + (_add_x < (al)); \
324 #define rsh2(rh, rl, ah, al, cnt) \
326 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
327 (rh) = (ah) >> (cnt); \
330 #define lsh2(rh, rl, ah, al, cnt) \
332 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
333 (rl) = (al) << (cnt); \
336 #define ge2(ah, al, bh, bl) \
337 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
339 #define gt2(ah, al, bh, bl) \
340 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
343 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
347 (rl) = (al) - (bl); \
348 (rh) = (ah) - (bh) - _cy; \
352 #ifndef count_leading_zeros
353 # define count_leading_zeros(count, x) do { \
354 uintmax_t __clz_x = (x); \
355 unsigned int __clz_c; \
357 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
360 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
366 #ifndef count_trailing_zeros
367 # define count_trailing_zeros(count, x) do { \
368 uintmax_t __ctz_x = (x); \
369 unsigned int __ctz_c = 0; \
370 while ((__ctz_x & 1) == 0) \
379 /* Requires that a < n and b <= n */
380 #define submod(r,a,b,n) \
382 uintmax_t _t = - (uintmax_t) (a < b); \
383 (r) = ((n) & _t) + (a) - (b); \
386 #define addmod(r,a,b,n) \
387 submod ((r), (a), ((n) - (b)), (n))
389 /* Modular two-word addition and subtraction. For performance reasons, the
390 most significant bit of n1 must be clear. The destination variables must be
391 distinct from the mod operand. */
392 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
394 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
395 if (ge2 ((r1), (r0), (n1), (n0))) \
396 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
398 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
400 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
401 if ((intmax_t) (r1) < 0) \
402 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
405 #define HIGHBIT_TO_MASK(x) \
406 (((intmax_t)-1 >> 1) < 0 \
407 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
408 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
409 ? UINTMAX_MAX : (uintmax_t) 0))
411 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
412 Requires that d1 != 0. */
414 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
426 count_leading_zeros (cntd
, d1
);
427 count_leading_zeros (cnta
, a1
);
428 int cnt
= cntd
- cnta
;
429 lsh2 (d1
, d0
, d1
, d0
, cnt
);
430 for (int i
= 0; i
< cnt
; i
++)
432 if (ge2 (a1
, a0
, d1
, d0
))
433 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
434 rsh2 (d1
, d0
, d1
, d0
, 1);
441 static uintmax_t _GL_ATTRIBUTE_CONST
442 gcd_odd (uintmax_t a
, uintmax_t b
)
453 /* Take out least significant one bit, to make room for sign */
469 bgta
= HIGHBIT_TO_MASK (t
);
471 /* b <-- min (a, b) */
475 a
= (t
^ bgta
) - bgta
;
480 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
482 while ((a0
& 1) == 0)
483 rsh2 (a1
, a0
, a1
, a0
, 1);
484 while ((b0
& 1) == 0)
485 rsh2 (b1
, b0
, b1
, b0
, 1);
492 return gcd_odd (b0
, a0
);
495 if (gt2 (a1
, a0
, b1
, b0
))
497 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
499 rsh2 (a1
, a0
, a1
, a0
, 1);
500 while ((a0
& 1) == 0);
502 else if (gt2 (b1
, b0
, a1
, a0
))
504 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
506 rsh2 (b1
, b0
, b1
, b0
, 1);
507 while ((b0
& 1) == 0);
518 factor_insert_multiplicity (struct factors
*factors
,
519 uintmax_t prime
, unsigned int m
)
521 unsigned int nfactors
= factors
->nfactors
;
522 uintmax_t *p
= factors
->p
;
523 unsigned char *e
= factors
->e
;
525 /* Locate position for insert new or increment e. */
527 for (i
= nfactors
- 1; i
>= 0; i
--)
533 if (i
< 0 || p
[i
] != prime
)
535 for (int j
= nfactors
- 1; j
> i
; j
--)
542 factors
->nfactors
= nfactors
+ 1;
550 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
553 factor_insert_large (struct factors
*factors
,
554 uintmax_t p1
, uintmax_t p0
)
558 assert (factors
->plarge
[1] == 0);
559 factors
->plarge
[0] = p0
;
560 factors
->plarge
[1] = p1
;
563 factor_insert (factors
, p0
);
568 # if !HAVE_DECL_MPZ_INITS
570 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
571 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
574 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
578 va_start (ap
, mpz_single_init
);
581 while ((mpz
= va_arg (ap
, mpz_t
*)))
582 mpz_single_init (*mpz
);
588 static void mp_factor (mpz_t
, struct mp_factors
*);
591 mp_factor_init (struct mp_factors
*factors
)
595 factors
->nfactors
= 0;
599 mp_factor_clear (struct mp_factors
*factors
)
601 for (unsigned int i
= 0; i
< factors
->nfactors
; i
++)
602 mpz_clear (factors
->p
[i
]);
609 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
611 unsigned long int nfactors
= factors
->nfactors
;
612 mpz_t
*p
= factors
->p
;
613 unsigned long int *e
= factors
->e
;
616 /* Locate position for insert new or increment e. */
617 for (i
= nfactors
- 1; i
>= 0; i
--)
619 if (mpz_cmp (p
[i
], prime
) <= 0)
623 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
625 p
= xrealloc (p
, (nfactors
+ 1) * sizeof p
[0]);
626 e
= xrealloc (e
, (nfactors
+ 1) * sizeof e
[0]);
628 mpz_init (p
[nfactors
]);
629 for (long j
= nfactors
- 1; j
> i
; j
--)
631 mpz_set (p
[j
+ 1], p
[j
]);
634 mpz_set (p
[i
+ 1], prime
);
639 factors
->nfactors
= nfactors
+ 1;
648 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
652 mpz_init_set_ui (pz
, prime
);
653 mp_factor_insert (factors
, pz
);
656 #endif /* HAVE_GMP */
659 /* Number of bits in an uintmax_t. */
660 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
662 /* Verify that uintmax_t does not have holes in its representation. */
663 verify (UINTMAX_MAX
>> (W
- 1) == 1);
665 #define P(a,b,c,d) a,
666 static const unsigned char primes_diff
[] = {
668 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
672 #define PRIMES_PTAB_ENTRIES \
673 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
675 #define P(a,b,c,d) b,
676 static const unsigned char primes_diff8
[] = {
678 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
687 #define P(a,b,c,d) {c,d},
688 static const struct primes_dtab primes_dtab
[] = {
690 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
694 /* Verify that uintmax_t is not wider than
695 the integers used to generate primes.h. */
696 verify (W
<= WIDE_UINT_BITS
);
698 /* debugging for developers. Enables devmsg().
699 This flag is used only in the GMP code. */
700 static bool dev_debug
= false;
702 /* Prove primality or run probabilistic tests. */
703 static bool flag_prove_primality
= PROVE_PRIMALITY
;
705 /* Number of Miller-Rabin tests to run when not proving primality. */
709 factor_insert_refind (struct factors
*factors
, uintmax_t p
, unsigned int i
,
712 for (unsigned int j
= 0; j
< off
; j
++)
713 p
+= primes_diff
[i
+ j
];
714 factor_insert (factors
, p
);
717 /* Trial division with odd primes uses the following trick.
719 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
720 consider the case t < B (this is the second loop below).
722 From our tables we get
724 binv = p^{-1} (mod B)
725 lim = floor ( (B-1) / p ).
727 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
728 (and all quotients in this range occur for some t).
730 Then t = q * p is true also (mod B), and p is invertible we get
732 q = t * binv (mod B).
734 Next, assume that t is *not* divisible by p. Since multiplication
735 by binv (mod B) is a one-to-one mapping,
737 t * binv (mod B) > lim,
739 because all the smaller values are already taken.
741 This can be summed up by saying that the function
743 q(t) = binv * t (mod B)
745 is a permutation of the range 0 <= t < B, with the curious property
746 that it maps the multiples of p onto the range 0 <= q <= lim, in
747 order, and the non-multiples of p onto the range lim < q < B.
751 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
752 struct factors
*factors
)
760 count_trailing_zeros (cnt
, t1
);
767 count_trailing_zeros (cnt
, t0
);
768 rsh2 (t1
, t0
, t1
, t0
, cnt
);
771 factor_insert_multiplicity (factors
, 2, cnt
);
776 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
780 uintmax_t q1
, q0
, hi
, lo _GL_UNUSED
;
782 q0
= t0
* primes_dtab
[i
].binv
;
783 umul_ppmm (hi
, lo
, q0
, p
);
787 q1
= hi
* primes_dtab
[i
].binv
;
788 if (LIKELY (q1
> primes_dtab
[i
].lim
))
791 factor_insert (factors
, p
);
793 p
+= primes_diff
[i
+ 1];
798 #define DIVBLOCK(I) \
802 q = t0 * pd[I].binv; \
803 if (LIKELY (q > pd[I].lim)) \
806 factor_insert_refind (factors, p, i + 1, I); \
810 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
813 const struct primes_dtab
*pd
= &primes_dtab
[i
];
823 p
+= primes_diff8
[i
];
833 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
838 devmsg ("[trial division] ");
842 p
= mpz_scan1 (t
, 0);
843 mpz_div_2exp (t
, t
, p
);
846 mp_factor_insert_ui (factors
, 2);
851 for (unsigned int i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
853 if (! mpz_divisible_ui_p (t
, p
))
855 p
+= primes_diff
[i
++];
856 if (mpz_cmp_ui (t
, p
* p
) < 0)
861 mpz_tdiv_q_ui (t
, t
, p
);
862 mp_factor_insert_ui (factors
, p
);
870 /* Entry i contains (2i+1)^(-1) mod 2^8. */
871 static const unsigned char binvert_table
[128] =
873 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
874 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
875 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
876 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
877 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
878 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
879 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
880 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
881 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
882 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
883 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
884 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
885 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
886 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
887 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
888 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
891 /* Compute n^(-1) mod B, using a Newton iteration. */
892 #define binv(inv,n) \
894 uintmax_t __n = (n); \
897 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
898 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
899 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
900 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
902 if (W_TYPE_SIZE > 64) \
904 int __invbits = 64; \
906 __inv = 2 * __inv - __inv * __inv * __n; \
908 } while (__invbits < W_TYPE_SIZE); \
914 /* q = u / d, assuming d|u. */
915 #define divexact_21(q1, q0, u1, u0, d) \
917 uintmax_t _di, _q0; \
922 uintmax_t _p1, _p0 _GL_UNUSED; \
923 umul_ppmm (_p1, _p0, _q0, d); \
924 (q1) = ((u1) - _p1) * _di; \
935 #define redcify(r_prim, r, n) \
937 uintmax_t _redcify_q _GL_UNUSED; \
938 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
941 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
942 #define redcify2(r1, r0, x, n1, n0) \
944 uintmax_t _r1, _r0, _i; \
947 _r1 = (x); _r0 = 0; \
952 _r1 = 0; _r0 = (x); \
953 _i = 2*W_TYPE_SIZE; \
957 lsh2 (_r1, _r0, _r1, _r0, 1); \
958 if (ge2 (_r1, _r0, (n1), (n0))) \
959 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
965 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
966 Both a and b must be in redc form, the result will be in redc form too. */
967 static inline uintmax_t
968 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
970 uintmax_t rh
, rl
, q
, th
, tl _GL_UNUSED
, xh
;
972 umul_ppmm (rh
, rl
, a
, b
);
974 umul_ppmm (th
, tl
, q
, m
);
982 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
983 Both a and b must be in redc form, the result will be in redc form too.
984 For performance reasons, the most significant bit of m must be clear. */
986 mulredc2 (uintmax_t *r1p
,
987 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
988 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
990 uintmax_t r1
, r0
, q
, p1
, p0 _GL_UNUSED
, t1
, t0
, s1
, s0
;
992 assert ( (a1
>> (W_TYPE_SIZE
- 1)) == 0);
993 assert ( (b1
>> (W_TYPE_SIZE
- 1)) == 0);
994 assert ( (m1
>> (W_TYPE_SIZE
- 1)) == 0);
996 /* First compute a0 * <b1, b0> B^{-1}
1009 umul_ppmm (t1
, t0
, a0
, b0
);
1010 umul_ppmm (r1
, r0
, a0
, b1
);
1012 umul_ppmm (p1
, p0
, q
, m0
);
1013 umul_ppmm (s1
, s0
, q
, m1
);
1014 r0
+= (t0
!= 0); /* Carry */
1015 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1016 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1017 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1019 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1034 umul_ppmm (t1
, t0
, a1
, b0
);
1035 umul_ppmm (s1
, s0
, a1
, b1
);
1036 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1038 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1039 umul_ppmm (p1
, p0
, q
, m0
);
1040 umul_ppmm (s1
, s0
, q
, m1
);
1041 r0
+= (t0
!= 0); /* Carry */
1042 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1043 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1044 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1046 if (ge2 (r1
, r0
, m1
, m0
))
1047 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1053 static uintmax_t _GL_ATTRIBUTE_CONST
1054 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1063 b
= mulredc (b
, b
, n
, ni
);
1067 y
= mulredc (y
, b
, n
, ni
);
1074 powm2 (uintmax_t *r1m
,
1075 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1076 uintmax_t ni
, const uintmax_t *one
)
1078 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1090 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1094 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1097 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1100 for (e
= ep
[1]; e
> 0; e
>>= 1)
1104 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1107 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1114 static bool _GL_ATTRIBUTE_CONST
1115 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1116 unsigned int k
, uintmax_t one
)
1118 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1120 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1122 if (y
== one
|| y
== nm1
)
1125 for (unsigned int i
= 1; i
< k
; i
++)
1127 y
= mulredc (y
, y
, n
, ni
);
1138 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1139 const uintmax_t *qp
, unsigned int k
, const uintmax_t *one
)
1141 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1143 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1146 if (y0
== one
[0] && y1
== one
[1])
1149 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1151 if (y0
== nm1_0
&& y1
== nm1_1
)
1154 for (unsigned int i
= 1; i
< k
; i
++)
1156 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1159 if (y0
== nm1_0
&& y1
== nm1_1
)
1161 if (y0
== one
[0] && y1
== one
[1])
1169 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1170 mpz_srcptr q
, unsigned long int k
)
1172 mpz_powm (y
, x
, q
, n
);
1174 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1177 for (unsigned long int i
= 1; i
< k
; i
++)
1179 mpz_powm_ui (y
, y
, 2, n
);
1180 if (mpz_cmp (y
, nm1
) == 0)
1182 if (mpz_cmp_ui (y
, 1) == 0)
1189 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1190 have been observed. The average seem to be about 2. */
1192 prime_p (uintmax_t n
)
1196 uintmax_t a_prim
, one
, ni
;
1197 struct factors factors
;
1202 /* We have already casted out small primes. */
1203 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1206 /* Precomputation for Miller-Rabin. */
1207 uintmax_t q
= n
- 1;
1208 for (k
= 0; (q
& 1) == 0; k
++)
1212 binv (ni
, n
); /* ni <- 1/n mod B */
1213 redcify (one
, 1, n
);
1214 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1216 /* Perform a Miller-Rabin test, finds most composites quickly. */
1217 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1220 if (flag_prove_primality
)
1222 /* Factor n-1 for Lucas. */
1223 factor (0, n
- 1, &factors
);
1226 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1227 number composite. */
1228 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1230 if (flag_prove_primality
)
1233 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1236 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1241 /* After enough Miller-Rabin runs, be content. */
1242 is_prime
= (r
== MR_REPS
- 1);
1248 a
+= primes_diff
[r
]; /* Establish new base. */
1250 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1251 on most processors, since it avoids udiv_qrnnd. If we go down the
1252 udiv_qrnnd_preinv path, this code should be replaced. */
1255 umul_ppmm (s1
, s0
, one
, a
);
1256 if (LIKELY (s1
== 0))
1260 uintmax_t dummy _GL_UNUSED
;
1261 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1265 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1269 error (0, 0, _("Lucas prime test failure. This should not happen"));
1274 prime2_p (uintmax_t n1
, uintmax_t n0
)
1276 uintmax_t q
[2], nm1
[2];
1277 uintmax_t a_prim
[2];
1282 struct factors factors
;
1285 return prime_p (n0
);
1287 nm1
[1] = n1
- (n0
== 0);
1291 count_trailing_zeros (k
, nm1
[1]);
1299 count_trailing_zeros (k
, nm1
[0]);
1300 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1305 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1306 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1308 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1312 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1315 if (flag_prove_primality
)
1317 /* Factor n-1 for Lucas. */
1318 factor (nm1
[1], nm1
[0], &factors
);
1321 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1322 number composite. */
1323 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1326 uintmax_t e
[2], y
[2];
1328 if (flag_prove_primality
)
1331 if (factors
.plarge
[1])
1334 binv (pi
, factors
.plarge
[0]);
1337 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1338 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1340 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1342 /* FIXME: We always have the factor 2. Do we really need to
1343 handle it here? We have done the same powering as part
1345 if (factors
.p
[i
] == 2)
1346 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1348 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1349 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1350 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1355 /* After enough Miller-Rabin runs, be content. */
1356 is_prime
= (r
== MR_REPS
- 1);
1362 a
+= primes_diff
[r
]; /* Establish new base. */
1363 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1365 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1369 error (0, 0, _("Lucas prime test failure. This should not happen"));
1375 mp_prime_p (mpz_t n
)
1378 mpz_t q
, a
, nm1
, tmp
;
1379 struct mp_factors factors
;
1381 if (mpz_cmp_ui (n
, 1) <= 0)
1384 /* We have already casted out small primes. */
1385 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1388 mpz_inits (q
, a
, nm1
, tmp
, NULL
);
1390 /* Precomputation for Miller-Rabin. */
1391 mpz_sub_ui (nm1
, n
, 1);
1393 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1394 unsigned long int k
= mpz_scan1 (nm1
, 0);
1395 mpz_tdiv_q_2exp (q
, nm1
, k
);
1399 /* Perform a Miller-Rabin test, finds most composites quickly. */
1400 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1406 if (flag_prove_primality
)
1408 /* Factor n-1 for Lucas. */
1410 mp_factor (tmp
, &factors
);
1413 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1414 number composite. */
1415 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1417 if (flag_prove_primality
)
1420 for (unsigned long int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1422 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1423 mpz_powm (tmp
, a
, tmp
, n
);
1424 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1429 /* After enough Miller-Rabin runs, be content. */
1430 is_prime
= (r
== MR_REPS
- 1);
1436 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1438 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1445 error (0, 0, _("Lucas prime test failure. This should not happen"));
1449 if (flag_prove_primality
)
1450 mp_factor_clear (&factors
);
1452 mpz_clears (q
, a
, nm1
, tmp
, NULL
);
1459 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1460 struct factors
*factors
)
1462 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1464 unsigned long int k
= 1;
1465 unsigned long int l
= 1;
1468 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1475 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1481 x
= mulredc (x
, x
, n
, ni
);
1482 addmod (x
, x
, a
, n
);
1484 submod (t
, z
, x
, n
);
1485 P
= mulredc (P
, t
, n
, ni
);
1489 if (gcd_odd (P
, n
) != 1)
1499 for (unsigned long int i
= 0; i
< k
; i
++)
1501 x
= mulredc (x
, x
, n
, ni
);
1502 addmod (x
, x
, a
, n
);
1510 y
= mulredc (y
, y
, n
, ni
);
1511 addmod (y
, y
, a
, n
);
1513 submod (t
, z
, y
, n
);
1521 factor_using_pollard_rho (g
, a
+ 1, factors
);
1523 factor_insert (factors
, g
);
1527 factor_insert (factors
, n
);
1538 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1539 struct factors
*factors
)
1541 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1543 unsigned long int k
= 1;
1544 unsigned long int l
= 1;
1546 redcify2 (P1
, P0
, 1, n1
, n0
);
1547 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1551 while (n1
!= 0 || n0
!= 1)
1559 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1561 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1563 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1564 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1569 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1570 if (g1
!= 0 || g0
!= 1)
1580 for (unsigned long int i
= 0; i
< k
; i
++)
1582 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1584 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1592 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1594 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1596 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1597 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1599 while (g1
== 0 && g0
== 1);
1603 /* The found factor is one word. */
1604 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1607 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1609 factor_insert (factors
, g0
);
1613 /* The found factor is two words. This is highly unlikely, thus hard
1614 to trigger. Please be careful before you change this code! */
1617 binv (ginv
, g0
); /* Compute n = n / g. Since the result will */
1618 n0
= ginv
* n0
; /* fit one word, we can compute the quotient */
1619 n1
= 0; /* modulo B, ignoring the high divisor word. */
1621 if (!prime2_p (g1
, g0
))
1622 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1624 factor_insert_large (factors
, g1
, g0
);
1631 factor_insert (factors
, n0
);
1635 factor_using_pollard_rho (n0
, a
, factors
);
1639 if (prime2_p (n1
, n0
))
1641 factor_insert_large (factors
, n1
, n0
);
1645 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1646 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1647 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1653 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1654 struct mp_factors
*factors
)
1659 devmsg ("[pollard-rho (%lu)] ", a
);
1661 mpz_inits (t
, t2
, NULL
);
1662 mpz_init_set_si (y
, 2);
1663 mpz_init_set_si (x
, 2);
1664 mpz_init_set_si (z
, 2);
1665 mpz_init_set_ui (P
, 1);
1667 unsigned long long int k
= 1;
1668 unsigned long long int l
= 1;
1670 while (mpz_cmp_ui (n
, 1) != 0)
1678 mpz_add_ui (x
, x
, a
);
1687 if (mpz_cmp_ui (t
, 1) != 0)
1697 for (unsigned long long int i
= 0; i
< k
; i
++)
1701 mpz_add_ui (x
, x
, a
);
1711 mpz_add_ui (y
, y
, a
);
1716 while (mpz_cmp_ui (t
, 1) == 0);
1718 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1720 if (!mp_prime_p (t
))
1722 devmsg ("[composite factor--restarting pollard-rho] ");
1723 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1727 mp_factor_insert (factors
, t
);
1732 mp_factor_insert (factors
, n
);
1741 mpz_clears (P
, t2
, t
, z
, x
, y
, NULL
);
1746 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1747 algorithm is replaced, consider also returning the remainder. */
1748 static uintmax_t _GL_ATTRIBUTE_CONST
1756 count_leading_zeros (c
, n
);
1758 /* Make x > sqrt(n). This will be invariant through the loop. */
1759 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) / 2);
1763 uintmax_t y
= (x
+ n
/x
) / 2;
1771 static uintmax_t _GL_ATTRIBUTE_CONST
1772 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1777 /* Ensures the remainder fits in an uintmax_t. */
1778 assert (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1783 count_leading_zeros (shift
, nh
);
1786 /* Make x > sqrt(n) */
1787 x
= isqrt ( (nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1788 x
<<= (W_TYPE_SIZE
- shift
) / 2;
1790 /* Do we need more than one iteration? */
1793 uintmax_t r _GL_UNUSED
;
1795 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1801 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1802 assert (gt2 (hi
, lo
, nh
, nl
));
1804 umul_ppmm (hi
, lo
, x
, x
);
1805 assert (ge2 (nh
, nl
, hi
, lo
));
1806 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1816 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1817 # define MAGIC64 0x0202021202030213ULL
1818 # define MAGIC63 0x0402483012450293ULL
1819 # define MAGIC65 0x218a019866014613ULL
1820 # define MAGIC11 0x23b
1822 /* Return the square root if the input is a square, otherwise 0. */
1823 static uintmax_t _GL_ATTRIBUTE_CONST
1824 is_square (uintmax_t x
)
1826 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1827 computing the square root. */
1828 if (((MAGIC64
>> (x
& 63)) & 1)
1829 && ((MAGIC63
>> (x
% 63)) & 1)
1830 /* Both 0 and 64 are squares mod (65) */
1831 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1832 && ((MAGIC11
>> (x
% 11) & 1)))
1834 uintmax_t r
= isqrt (x
);
1841 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1842 static const unsigned short invtab
[0x81] =
1845 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1846 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1847 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1848 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1849 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1850 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1851 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1852 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1853 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1854 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1855 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1856 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1857 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1858 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1859 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1860 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1863 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1864 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1865 # define div_smallq(q, r, u, d) \
1867 if ((u) / 0x40 < (d)) \
1870 uintmax_t _dinv, _mask, _q, _r; \
1871 count_leading_zeros (_cnt, (d)); \
1873 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1875 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1876 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1880 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1881 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1885 _mask = -(uintmax_t) (_r >= (d)); \
1886 (r) = _r - (_mask & (d)); \
1888 assert ( (q) * (d) + (r) == u); \
1892 uintmax_t _q = (u) / (d); \
1893 (r) = (u) - _q * (d); \
1898 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1899 = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1902 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1903 representing G_0 = (-55, 2*4652, 8653).
1905 In the notation of the paper:
1907 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1911 t_0 = floor([q_0 + R_0] / S0) = 1
1912 R_1 = t_0 * S_0 - R_0 = 4001
1913 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1916 /* Multipliers, in order of efficiency:
1917 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1918 0.7317 3*5*7 = 105 = 1
1919 0.7820 3*5*11 = 165 = 1
1921 0.8101 3*7*11 = 231 = 3
1923 0.8284 5*7*11 = 385 = 1
1925 0.8716 3*11 = 33 = 1
1927 0.8913 5*11 = 55 = 3
1929 0.9233 7*11 = 77 = 1
1933 # define QUEUE_SIZE 50
1937 # define Q_FREQ_SIZE 50
1938 /* Element 0 keeps the total */
1939 static unsigned int q_freq
[Q_FREQ_SIZE
+ 1];
1940 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1944 /* Return true on success. Expected to fail only for numbers
1945 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1947 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1949 /* Uses algorithm and notation from
1951 SQUARE FORM FACTORIZATION
1952 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1954 http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1957 static const unsigned int multipliers_1
[] =
1959 105, 165, 21, 385, 33, 5, 77, 1, 0
1961 static const unsigned int multipliers_3
[] =
1963 1155, 15, 231, 35, 3, 55, 7, 11, 0
1966 const unsigned int *m
;
1968 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1970 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1973 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1975 if (n0
== sqrt_n
* sqrt_n
)
1979 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
1984 if (prime_p (sqrt_n
))
1985 factor_insert_multiplicity (factors
, sqrt_n
, 2);
1991 if (!factor_using_squfof (0, sqrt_n
, &f
))
1993 /* Try pollard rho instead */
1994 factor_using_pollard_rho (sqrt_n
, 1, &f
);
1996 /* Duplicate the new factors */
1997 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
1998 factor_insert_multiplicity (factors
, f
.p
[i
], 2*f
.e
[i
]);
2004 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2005 for (m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2008 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2010 unsigned int mu
= *m
;
2011 unsigned int qpos
= 0;
2013 assert (mu
* n0
% 4 == 3);
2015 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2016 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2017 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2020 However, this seems insufficient: With n = 37243139 and mu =
2021 105, we get a trivial factor, from the square 38809 = 197^2,
2022 without any corresponding Q earlier in the iteration.
2024 Requiring 64 mu^3 < n seems sufficient. */
2027 if ((uintmax_t) mu
*mu
*mu
>= n0
/ 64)
2032 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2035 umul_ppmm (Dh
, Dl
, n0
, mu
);
2038 assert (Dl
% 4 != 1);
2039 assert (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2041 S
= isqrt2 (Dh
, Dl
);
2046 /* Square root remainder fits in one word, so ignore high part. */
2048 /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2053 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2056 for (i
= 0; i
<= B
; i
++)
2058 uintmax_t q
, P1
, t
, rem
;
2060 div_smallq (q
, rem
, S
+P
, Q
);
2061 P1
= S
- rem
; /* P1 = q*Q - P */
2063 IF_LINT (assert (q
> 0 && Q
> 0));
2067 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2077 g
/= gcd_odd (g
, mu
);
2081 if (qpos
>= QUEUE_SIZE
)
2082 error (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2084 queue
[qpos
].P
= P
% g
;
2089 /* I think the difference can be either sign, but mod
2090 2^W_TYPE_SIZE arithmetic should be fine. */
2091 t
= Q1
+ q
* (P
- P1
);
2098 uintmax_t r
= is_square (Q
);
2101 for (unsigned int j
= 0; j
< qpos
; j
++)
2103 if (queue
[j
].Q
== r
)
2106 /* Traversed entire cycle. */
2107 goto next_multiplier
;
2109 /* Need the absolute value for divisibility test. */
2110 if (P
>= queue
[j
].P
)
2116 /* Delete entries up to and including entry
2117 j, which matched. */
2118 memmove (queue
, queue
+ j
+ 1,
2119 (qpos
- j
- 1) * sizeof (queue
[0]));
2126 /* We have found a square form, which should give a
2129 assert (S
>= P
); /* What signs are possible? */
2130 P
+= r
* ((S
- P
) / r
);
2132 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2133 for the case D = 2N. */
2134 /* Compute Q = (D - P*P) / Q1, but we need double
2137 umul_ppmm (hi
, lo
, P
, P
);
2138 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2139 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2144 /* Note: There appears to by a typo in the paper,
2145 Step 4a in the algorithm description says q <--
2146 floor([S+P]/\hat Q), but looking at the equations
2147 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2148 (In this code, \hat Q is Q1). */
2149 div_smallq (q
, rem
, S
+P
, Q
);
2150 P1
= S
- rem
; /* P1 = q*Q - P */
2154 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2158 t
= Q1
+ q
* (P
- P1
);
2166 Q
/= gcd_odd (Q
, mu
);
2168 assert (Q
> 1 && (n1
|| Q
< n0
));
2171 factor_insert (factors
, Q
);
2172 else if (!factor_using_squfof (0, Q
, factors
))
2173 factor_using_pollard_rho (Q
, 2, factors
);
2175 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2177 if (prime2_p (n1
, n0
))
2178 factor_insert_large (factors
, n1
, n0
);
2181 if (!factor_using_squfof (n1
, n0
, factors
))
2184 factor_using_pollard_rho (n0
, 1, factors
);
2186 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2201 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2202 results in FACTORS. */
2204 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2206 factors
->nfactors
= 0;
2207 factors
->plarge
[1] = 0;
2209 if (t1
== 0 && t0
< 2)
2212 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2214 if (t1
== 0 && t0
< 2)
2217 if (prime2_p (t1
, t0
))
2218 factor_insert_large (factors
, t1
, t0
);
2222 if (factor_using_squfof (t1
, t0
, factors
))
2227 factor_using_pollard_rho (t0
, 1, factors
);
2229 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2234 /* Use Pollard-rho to compute the prime factors of
2235 arbitrary-precision T, and put the results in FACTORS. */
2237 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2239 mp_factor_init (factors
);
2241 if (mpz_sgn (t
) != 0)
2243 mp_factor_using_division (t
, factors
);
2245 if (mpz_cmp_ui (t
, 1) != 0)
2247 devmsg ("[is number prime?] ");
2249 mp_factor_insert (factors
, t
);
2251 mp_factor_using_pollard_rho (t
, 1, factors
);
2258 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, const char *s
)
2260 unsigned int lo_carry
;
2261 uintmax_t hi
= 0, lo
= 0;
2263 strtol_error err
= LONGINT_INVALID
;
2265 /* Skip initial spaces and '+'. */
2280 /* Initial scan for invalid digits. */
2284 unsigned int c
= *p
++;
2288 if (UNLIKELY (!ISDIGIT (c
)))
2290 err
= LONGINT_INVALID
;
2294 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2297 for (;err
== LONGINT_OK
;)
2299 unsigned int c
= *s
++;
2305 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2307 err
= LONGINT_OVERFLOW
;
2312 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2313 lo_carry
+= 10 * lo
< 2 * lo
;
2320 if (UNLIKELY (hi
< lo_carry
))
2322 err
= LONGINT_OVERFLOW
;
2333 /* Structure and routines for buffering and outputting full lines,
2334 to support parallel operation efficiently. */
2341 /* 512 is chosen to give good performance,
2342 and also is the max guaranteed size that
2343 consumers can read atomically through pipes.
2344 Also it's big enough to cater for max line length
2345 even with 128 bit uintmax_t. */
2346 #define FACTOR_PIPE_BUF 512
2354 /* Double to ensure enough space for
2355 previous numbers + next number. */
2356 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2357 lbuf
.end
= lbuf
.buf
;
2360 /* Write complete LBUF to standard output. */
2364 size_t size
= lbuf
.end
- lbuf
.buf
;
2365 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2366 error (EXIT_FAILURE
, errno
, "%s", _("write error"));
2367 lbuf
.end
= lbuf
.buf
;
2370 /* Add a character C to LBUF and if it's a newline
2371 and enough bytes are already buffered,
2372 then write atomically to standard output. */
2380 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2382 if (buffered
>= FACTOR_PIPE_BUF
)
2384 /* Write output in <= PIPE_BUF chunks
2385 so consumers can read atomically. */
2386 char const *tend
= lbuf
.end
;
2388 /* Since a umaxint_t's factors must fit in 512
2389 we're guaranteed to find a newline here. */
2390 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2391 while (*--tlend
!= '\n');
2397 /* Buffer the remainder. */
2398 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2399 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2404 /* Buffer an int to the internal LBUF. */
2406 lbuf_putint (uintmax_t i
, size_t min_width
)
2408 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2409 char const *umaxstr
= umaxtostr (i
, buf
);
2410 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2413 for (; z
< min_width
; z
++)
2416 memcpy (lbuf
.end
, umaxstr
, width
);
2421 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2426 lbuf_putint (t0
, 0);
2429 /* Use very plain code here since it seems hard to write fast code
2430 without assuming a specific word size. */
2431 q
= t1
/ 1000000000;
2432 r
= t1
% 1000000000;
2433 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2434 print_uintmaxes (q
, t0
);
2439 /* Single-precision factoring */
2441 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2443 struct factors factors
;
2445 print_uintmaxes (t1
, t0
);
2448 factor (t1
, t0
, &factors
);
2450 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2451 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2454 print_uintmaxes (0, factors
.p
[j
]);
2457 if (factors
.plarge
[1])
2460 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2466 /* Emit the factors of the indicated number. If we have the option of using
2467 either algorithm, we select on the basis of the length of the number.
2468 For longer numbers, we prefer the MP algorithm even if the native algorithm
2469 has enough digits, because the algorithm is better. The turnover point
2470 depends on the value. */
2472 print_factors (const char *input
)
2476 /* Try converting the number to one or two words. If it fails, use GMP or
2477 print an error message. The 2nd condition checks that the most
2478 significant bit of the two-word number is clear, in a typesize neutral
2480 strtol_error err
= strto2uintmax (&t1
, &t0
, input
);
2485 if (((t1
<< 1) >> 1) == t1
)
2487 devmsg ("[using single-precision arithmetic] ");
2488 print_factors_single (t1
, t0
);
2493 case LONGINT_OVERFLOW
:
2498 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2503 devmsg ("[using arbitrary-precision arithmetic] ");
2505 struct mp_factors factors
;
2507 mpz_init_set_str (t
, input
, 10);
2509 gmp_printf ("%Zd:", t
);
2510 mp_factor (t
, &factors
);
2512 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2513 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2514 gmp_printf (" %Zd", factors
.p
[j
]);
2516 mp_factor_clear (&factors
);
2522 error (0, 0, _("%s is too large"), quote (input
));
2530 if (status
!= EXIT_SUCCESS
)
2535 Usage: %s [NUMBER]...\n\
2538 program_name
, program_name
);
2540 Print the prime factors of each specified integer NUMBER. If none\n\
2541 are specified on the command line, read them from standard input.\n\
2544 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2545 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2546 emit_ancillary_info (PROGRAM_NAME
);
2555 token_buffer tokenbuffer
;
2557 init_tokenbuffer (&tokenbuffer
);
2561 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2563 if (token_length
== (size_t) -1)
2565 ok
&= print_factors (tokenbuffer
.buffer
);
2567 free (tokenbuffer
.buffer
);
2573 main (int argc
, char **argv
)
2575 initialize_main (&argc
, &argv
);
2576 set_program_name (argv
[0]);
2577 setlocale (LC_ALL
, "");
2578 bindtextdomain (PACKAGE
, LOCALEDIR
);
2579 textdomain (PACKAGE
);
2582 atexit (close_stdout
);
2583 atexit (lbuf_flush
);
2586 while ((c
= getopt_long (argc
, argv
, "", long_options
, NULL
)) != -1)
2590 case DEV_DEBUG_OPTION
:
2594 case_GETOPT_HELP_CHAR
;
2596 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2599 usage (EXIT_FAILURE
);
2604 memset (q_freq
, 0, sizeof (q_freq
));
2613 for (int i
= optind
; i
< argc
; i
++)
2614 if (! print_factors (argv
[i
]))
2622 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2623 for (unsigned int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2625 double f
= (double) q_freq
[i
] / q_freq
[0];
2627 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2628 100.0 * f
, 100.0 * acc_f
);
2633 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;