1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
38 Using GMP is optional. Define HAVE_GMP to make this code include GMP
39 factoring code. The GMP factoring code is based on GMP's demos/factorize.c
40 (last synced 2012-09-07). The GMP-based factoring code will stay in GMP
41 factoring code even if numbers get small enough for using the two-word
46 (1) Perform trial division using a small primes table, but without hardware
47 division since the primes table store inverses modulo the word base.
48 (The GMP variant of this code doesn't make use of the precomputed
49 inverses, but instead relies on GMP for fast divisibility testing.)
50 (2) Check the nature of any non-factored part using Miller-Rabin for
51 detecting composites, and Lucas for detecting primes.
52 (3) Factor any remaining composite part using the Pollard-Brent rho
53 algorithm or if USE_SQUFOF is defined to 1, try that first.
54 Status of found factors are checked again using Miller-Rabin and Lucas.
56 We prefer using Hensel norm in the divisions, not the more familiar
57 Euclidian norm, since the former leads to much faster code. In the
58 Pollard-Brent rho code and the prime testing code, we use Montgomery's
59 trick of multiplying all n-residues by the word base, allowing cheap Hensel
64 * Use modular inverses also for exact division in the Lucas code, and
65 elsewhere. A problem is to locate the inverses not from an index, but
66 from a prime. We might instead compute the inverse on-the-fly.
68 * Tune trial division table size (not forgetting that this is a standalone
69 program where the table will be read from disk for each invocation).
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
109 # if !HAVE_DECL_MPZ_INITS
119 #include "full-write.h"
121 #include "readtokens.h"
124 /* The official name of this program (e.g., no 'g' prefix). */
125 #define PROGRAM_NAME "factor"
128 proper_name ("Paul Rubin"), \
129 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
130 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
132 /* Token delimiters when reading from a file. */
133 #define DELIM "\n\t "
135 #ifndef USE_LONGLONG_H
136 /* With the way we use longlong.h, it's only safe to use
137 when UWtype = UHWtype, as there were various cases
138 (as can be seen in the history for longlong.h) where
139 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
140 to avoid compile time or run time issues. */
141 # if LONG_MAX == INTMAX_MAX
142 # define USE_LONGLONG_H 1
148 /* Make definitions for longlong.h to make it do what it can do for us */
150 /* bitcount for uintmax_t */
151 # if UINTMAX_MAX == UINT32_MAX
152 # define W_TYPE_SIZE 32
153 # elif UINTMAX_MAX == UINT64_MAX
154 # define W_TYPE_SIZE 64
155 # elif UINTMAX_MAX == UINT128_MAX
156 # define W_TYPE_SIZE 128
159 # define UWtype uintmax_t
160 # define UHWtype unsigned long int
162 # if HAVE_ATTRIBUTE_MODE
163 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
164 typedef int SItype
__attribute__ ((mode (SI
)));
165 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
166 typedef int DItype
__attribute__ ((mode (DI
)));
167 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
169 typedef unsigned char UQItype
;
171 typedef unsigned long int USItype
;
172 # if HAVE_LONG_LONG_INT
173 typedef long long int DItype
;
174 typedef unsigned long long int UDItype
;
175 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
176 typedef long int DItype
;
177 typedef unsigned long int UDItype
;
180 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
181 # define ASSERT(x) /* FIXME make longlong.h really standalone */
182 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
183 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
184 # ifndef __GMP_GNUC_PREREQ
185 # define __GMP_GNUC_PREREQ(a,b) 1
188 /* These stub macros are only used in longlong.h in certain system compiler
189 combinations, so ensure usage to avoid -Wunused-macros warnings. */
190 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
196 # define HAVE_HOST_CPU_FAMILY_powerpc 1
198 # include "longlong.h"
199 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
200 const unsigned char factor_clz_tab
[129] =
202 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
203 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
204 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
205 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
210 #else /* not USE_LONGLONG_H */
212 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
213 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
214 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
215 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
219 #if !defined __clz_tab && !defined UHWtype
220 /* Without this seemingly useless conditional, gcc -Wunused-macros
221 warns that each of the two tested macros is unused on Fedora 18.
222 FIXME: this is just an ugly band-aid. Fix it properly. */
225 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
226 #define MAX_NFACTS 26
230 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
233 static struct option
const long_options
[] =
235 {"-debug", no_argument
, NULL
, DEV_DEBUG_OPTION
},
236 {GETOPT_HELP_OPTION_DECL
},
237 {GETOPT_VERSION_OPTION_DECL
},
243 uintmax_t plarge
[2]; /* Can have a single large factor */
244 uintmax_t p
[MAX_NFACTS
];
245 unsigned char e
[MAX_NFACTS
];
246 unsigned char nfactors
;
253 unsigned long int *e
;
254 unsigned long int nfactors
;
258 static void factor (uintmax_t, uintmax_t, struct factors
*);
261 # define umul_ppmm(w1, w0, u, v) \
263 uintmax_t __x0, __x1, __x2, __x3; \
264 unsigned long int __ul, __vl, __uh, __vh; \
265 uintmax_t __u = (u), __v = (v); \
267 __ul = __ll_lowpart (__u); \
268 __uh = __ll_highpart (__u); \
269 __vl = __ll_lowpart (__v); \
270 __vh = __ll_highpart (__v); \
272 __x0 = (uintmax_t) __ul * __vl; \
273 __x1 = (uintmax_t) __ul * __vh; \
274 __x2 = (uintmax_t) __uh * __vl; \
275 __x3 = (uintmax_t) __uh * __vh; \
277 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
278 __x1 += __x2; /* but this indeed can */ \
279 if (__x1 < __x2) /* did we get it? */ \
280 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
282 (w1) = __x3 + __ll_highpart (__x1); \
283 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
287 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
288 /* Define our own, not needing normalization. This function is
289 currently not performance critical, so keep it simple. Similar to
290 the mod macro below. */
292 # define udiv_qrnnd(q, r, n1, n0, d) \
294 uintmax_t __d1, __d0, __q, __r1, __r0; \
296 assert ((n1) < (d)); \
297 __d1 = (d); __d0 = 0; \
298 __r1 = (n1); __r0 = (n0); \
300 for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \
302 rsh2 (__d1, __d0, __d1, __d0, 1); \
304 if (ge2 (__r1, __r0, __d1, __d0)) \
307 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
315 #if !defined add_ssaaaa
316 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
319 _add_x = (al) + (bl); \
320 (sh) = (ah) + (bh) + (_add_x < (al)); \
325 #define rsh2(rh, rl, ah, al, cnt) \
327 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
328 (rh) = (ah) >> (cnt); \
331 #define lsh2(rh, rl, ah, al, cnt) \
333 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
334 (rl) = (al) << (cnt); \
337 #define ge2(ah, al, bh, bl) \
338 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
340 #define gt2(ah, al, bh, bl) \
341 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
344 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
348 (rl) = (al) - (bl); \
349 (rh) = (ah) - (bh) - _cy; \
353 #ifndef count_leading_zeros
354 # define count_leading_zeros(count, x) do { \
355 uintmax_t __clz_x = (x); \
356 unsigned int __clz_c; \
358 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
361 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
367 #ifndef count_trailing_zeros
368 # define count_trailing_zeros(count, x) do { \
369 uintmax_t __ctz_x = (x); \
370 unsigned int __ctz_c = 0; \
371 while ((__ctz_x & 1) == 0) \
380 /* Requires that a < n and b <= n */
381 #define submod(r,a,b,n) \
383 uintmax_t _t = - (uintmax_t) (a < b); \
384 (r) = ((n) & _t) + (a) - (b); \
387 #define addmod(r,a,b,n) \
388 submod ((r), (a), ((n) - (b)), (n))
390 /* Modular two-word addition and subtraction. For performance reasons, the
391 most significant bit of n1 must be clear. The destination variables must be
392 distinct from the mod operand. */
393 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
395 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
396 if (ge2 ((r1), (r0), (n1), (n0))) \
397 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
399 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
401 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
402 if ((intmax_t) (r1) < 0) \
403 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
406 #define HIGHBIT_TO_MASK(x) \
407 (((intmax_t)-1 >> 1) < 0 \
408 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
409 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
410 ? UINTMAX_MAX : (uintmax_t) 0))
412 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
413 Requires that d1 != 0. */
415 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
427 count_leading_zeros (cntd
, d1
);
428 count_leading_zeros (cnta
, a1
);
429 int cnt
= cntd
- cnta
;
430 lsh2 (d1
, d0
, d1
, d0
, cnt
);
431 for (int i
= 0; i
< cnt
; i
++)
433 if (ge2 (a1
, a0
, d1
, d0
))
434 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
435 rsh2 (d1
, d0
, d1
, d0
, 1);
442 static uintmax_t _GL_ATTRIBUTE_CONST
443 gcd_odd (uintmax_t a
, uintmax_t b
)
454 /* Take out least significant one bit, to make room for sign */
470 bgta
= HIGHBIT_TO_MASK (t
);
472 /* b <-- min (a, b) */
476 a
= (t
^ bgta
) - bgta
;
481 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
491 while ((a0
& 1) == 0)
492 rsh2 (a1
, a0
, a1
, a0
, 1);
499 return gcd_odd (b0
, a0
);
502 if (gt2 (a1
, a0
, b1
, b0
))
504 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
506 rsh2 (a1
, a0
, a1
, a0
, 1);
507 while ((a0
& 1) == 0);
509 else if (gt2 (b1
, b0
, a1
, a0
))
511 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
513 rsh2 (b1
, b0
, b1
, b0
, 1);
514 while ((b0
& 1) == 0);
525 factor_insert_multiplicity (struct factors
*factors
,
526 uintmax_t prime
, unsigned int m
)
528 unsigned int nfactors
= factors
->nfactors
;
529 uintmax_t *p
= factors
->p
;
530 unsigned char *e
= factors
->e
;
532 /* Locate position for insert new or increment e. */
534 for (i
= nfactors
- 1; i
>= 0; i
--)
540 if (i
< 0 || p
[i
] != prime
)
542 for (int j
= nfactors
- 1; j
> i
; j
--)
549 factors
->nfactors
= nfactors
+ 1;
557 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
560 factor_insert_large (struct factors
*factors
,
561 uintmax_t p1
, uintmax_t p0
)
565 assert (factors
->plarge
[1] == 0);
566 factors
->plarge
[0] = p0
;
567 factors
->plarge
[1] = p1
;
570 factor_insert (factors
, p0
);
575 # if !HAVE_DECL_MPZ_INITS
577 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
578 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
581 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
585 va_start (ap
, mpz_single_init
);
588 while ((mpz
= va_arg (ap
, mpz_t
*)))
589 mpz_single_init (*mpz
);
595 static void mp_factor (mpz_t
, struct mp_factors
*);
598 mp_factor_init (struct mp_factors
*factors
)
602 factors
->nfactors
= 0;
606 mp_factor_clear (struct mp_factors
*factors
)
608 for (unsigned int i
= 0; i
< factors
->nfactors
; i
++)
609 mpz_clear (factors
->p
[i
]);
616 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
618 unsigned long int nfactors
= factors
->nfactors
;
619 mpz_t
*p
= factors
->p
;
620 unsigned long int *e
= factors
->e
;
623 /* Locate position for insert new or increment e. */
624 for (i
= nfactors
- 1; i
>= 0; i
--)
626 if (mpz_cmp (p
[i
], prime
) <= 0)
630 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
632 p
= xrealloc (p
, (nfactors
+ 1) * sizeof p
[0]);
633 e
= xrealloc (e
, (nfactors
+ 1) * sizeof e
[0]);
635 mpz_init (p
[nfactors
]);
636 for (long j
= nfactors
- 1; j
> i
; j
--)
638 mpz_set (p
[j
+ 1], p
[j
]);
641 mpz_set (p
[i
+ 1], prime
);
646 factors
->nfactors
= nfactors
+ 1;
655 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
659 mpz_init_set_ui (pz
, prime
);
660 mp_factor_insert (factors
, pz
);
663 #endif /* HAVE_GMP */
666 /* Number of bits in an uintmax_t. */
667 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
669 /* Verify that uintmax_t does not have holes in its representation. */
670 verify (UINTMAX_MAX
>> (W
- 1) == 1);
672 #define P(a,b,c,d) a,
673 static const unsigned char primes_diff
[] = {
675 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
679 #define PRIMES_PTAB_ENTRIES \
680 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
682 #define P(a,b,c,d) b,
683 static const unsigned char primes_diff8
[] = {
685 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
694 #define P(a,b,c,d) {c,d},
695 static const struct primes_dtab primes_dtab
[] = {
697 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
701 /* Verify that uintmax_t is not wider than
702 the integers used to generate primes.h. */
703 verify (W
<= WIDE_UINT_BITS
);
705 /* debugging for developers. Enables devmsg().
706 This flag is used only in the GMP code. */
707 static bool dev_debug
= false;
709 /* Prove primality or run probabilistic tests. */
710 static bool flag_prove_primality
= PROVE_PRIMALITY
;
712 /* Number of Miller-Rabin tests to run when not proving primality. */
716 factor_insert_refind (struct factors
*factors
, uintmax_t p
, unsigned int i
,
719 for (unsigned int j
= 0; j
< off
; j
++)
720 p
+= primes_diff
[i
+ j
];
721 factor_insert (factors
, p
);
724 /* Trial division with odd primes uses the following trick.
726 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
727 consider the case t < B (this is the second loop below).
729 From our tables we get
731 binv = p^{-1} (mod B)
732 lim = floor ( (B-1) / p ).
734 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
735 (and all quotients in this range occur for some t).
737 Then t = q * p is true also (mod B), and p is invertible we get
739 q = t * binv (mod B).
741 Next, assume that t is *not* divisible by p. Since multiplication
742 by binv (mod B) is a one-to-one mapping,
744 t * binv (mod B) > lim,
746 because all the smaller values are already taken.
748 This can be summed up by saying that the function
750 q(t) = binv * t (mod B)
752 is a permutation of the range 0 <= t < B, with the curious property
753 that it maps the multiples of p onto the range 0 <= q <= lim, in
754 order, and the non-multiples of p onto the range lim < q < B.
758 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
759 struct factors
*factors
)
767 count_trailing_zeros (cnt
, t1
);
774 count_trailing_zeros (cnt
, t0
);
775 rsh2 (t1
, t0
, t1
, t0
, cnt
);
778 factor_insert_multiplicity (factors
, 2, cnt
);
783 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
787 uintmax_t q1
, q0
, hi
, lo _GL_UNUSED
;
789 q0
= t0
* primes_dtab
[i
].binv
;
790 umul_ppmm (hi
, lo
, q0
, p
);
794 q1
= hi
* primes_dtab
[i
].binv
;
795 if (LIKELY (q1
> primes_dtab
[i
].lim
))
798 factor_insert (factors
, p
);
800 p
+= primes_diff
[i
+ 1];
805 #define DIVBLOCK(I) \
809 q = t0 * pd[I].binv; \
810 if (LIKELY (q > pd[I].lim)) \
813 factor_insert_refind (factors, p, i + 1, I); \
817 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
820 const struct primes_dtab
*pd
= &primes_dtab
[i
];
830 p
+= primes_diff8
[i
];
840 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
845 devmsg ("[trial division] ");
849 p
= mpz_scan1 (t
, 0);
850 mpz_div_2exp (t
, t
, p
);
853 mp_factor_insert_ui (factors
, 2);
858 for (unsigned int i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
860 if (! mpz_divisible_ui_p (t
, p
))
862 p
+= primes_diff
[i
++];
863 if (mpz_cmp_ui (t
, p
* p
) < 0)
868 mpz_tdiv_q_ui (t
, t
, p
);
869 mp_factor_insert_ui (factors
, p
);
877 /* Entry i contains (2i+1)^(-1) mod 2^8. */
878 static const unsigned char binvert_table
[128] =
880 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
881 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
882 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
883 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
884 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
885 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
886 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
887 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
888 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
889 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
890 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
891 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
892 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
893 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
894 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
895 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
898 /* Compute n^(-1) mod B, using a Newton iteration. */
899 #define binv(inv,n) \
901 uintmax_t __n = (n); \
904 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
905 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
906 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
907 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
909 if (W_TYPE_SIZE > 64) \
911 int __invbits = 64; \
913 __inv = 2 * __inv - __inv * __inv * __n; \
915 } while (__invbits < W_TYPE_SIZE); \
921 /* q = u / d, assuming d|u. */
922 #define divexact_21(q1, q0, u1, u0, d) \
924 uintmax_t _di, _q0; \
929 uintmax_t _p1, _p0 _GL_UNUSED; \
930 umul_ppmm (_p1, _p0, _q0, d); \
931 (q1) = ((u1) - _p1) * _di; \
942 #define redcify(r_prim, r, n) \
944 uintmax_t _redcify_q _GL_UNUSED; \
945 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
948 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
949 #define redcify2(r1, r0, x, n1, n0) \
951 uintmax_t _r1, _r0, _i; \
954 _r1 = (x); _r0 = 0; \
959 _r1 = 0; _r0 = (x); \
960 _i = 2*W_TYPE_SIZE; \
964 lsh2 (_r1, _r0, _r1, _r0, 1); \
965 if (ge2 (_r1, _r0, (n1), (n0))) \
966 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
972 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
973 Both a and b must be in redc form, the result will be in redc form too. */
974 static inline uintmax_t
975 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
977 uintmax_t rh
, rl
, q
, th
, tl _GL_UNUSED
, xh
;
979 umul_ppmm (rh
, rl
, a
, b
);
981 umul_ppmm (th
, tl
, q
, m
);
989 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
990 Both a and b must be in redc form, the result will be in redc form too.
991 For performance reasons, the most significant bit of m must be clear. */
993 mulredc2 (uintmax_t *r1p
,
994 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
995 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
997 uintmax_t r1
, r0
, q
, p1
, p0 _GL_UNUSED
, t1
, t0
, s1
, s0
;
999 assert ( (a1
>> (W_TYPE_SIZE
- 1)) == 0);
1000 assert ( (b1
>> (W_TYPE_SIZE
- 1)) == 0);
1001 assert ( (m1
>> (W_TYPE_SIZE
- 1)) == 0);
1003 /* First compute a0 * <b1, b0> B^{-1}
1016 umul_ppmm (t1
, t0
, a0
, b0
);
1017 umul_ppmm (r1
, r0
, a0
, b1
);
1019 umul_ppmm (p1
, p0
, q
, m0
);
1020 umul_ppmm (s1
, s0
, q
, m1
);
1021 r0
+= (t0
!= 0); /* Carry */
1022 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1023 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1024 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1026 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1041 umul_ppmm (t1
, t0
, a1
, b0
);
1042 umul_ppmm (s1
, s0
, a1
, b1
);
1043 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1045 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1046 umul_ppmm (p1
, p0
, q
, m0
);
1047 umul_ppmm (s1
, s0
, q
, m1
);
1048 r0
+= (t0
!= 0); /* Carry */
1049 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1050 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1051 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1053 if (ge2 (r1
, r0
, m1
, m0
))
1054 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1060 static uintmax_t _GL_ATTRIBUTE_CONST
1061 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1070 b
= mulredc (b
, b
, n
, ni
);
1074 y
= mulredc (y
, b
, n
, ni
);
1081 powm2 (uintmax_t *r1m
,
1082 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1083 uintmax_t ni
, const uintmax_t *one
)
1085 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1097 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1101 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1104 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1107 for (e
= ep
[1]; e
> 0; e
>>= 1)
1111 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1114 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1121 static bool _GL_ATTRIBUTE_CONST
1122 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1123 unsigned int k
, uintmax_t one
)
1125 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1127 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1129 if (y
== one
|| y
== nm1
)
1132 for (unsigned int i
= 1; i
< k
; i
++)
1134 y
= mulredc (y
, y
, n
, ni
);
1145 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1146 const uintmax_t *qp
, unsigned int k
, const uintmax_t *one
)
1148 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1150 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1153 if (y0
== one
[0] && y1
== one
[1])
1156 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1158 if (y0
== nm1_0
&& y1
== nm1_1
)
1161 for (unsigned int i
= 1; i
< k
; i
++)
1163 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1166 if (y0
== nm1_0
&& y1
== nm1_1
)
1168 if (y0
== one
[0] && y1
== one
[1])
1176 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1177 mpz_srcptr q
, unsigned long int k
)
1179 mpz_powm (y
, x
, q
, n
);
1181 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1184 for (unsigned long int i
= 1; i
< k
; i
++)
1186 mpz_powm_ui (y
, y
, 2, n
);
1187 if (mpz_cmp (y
, nm1
) == 0)
1189 if (mpz_cmp_ui (y
, 1) == 0)
1196 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1197 have been observed. The average seem to be about 2. */
1199 prime_p (uintmax_t n
)
1203 uintmax_t a_prim
, one
, ni
;
1204 struct factors factors
;
1209 /* We have already casted out small primes. */
1210 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1213 /* Precomputation for Miller-Rabin. */
1214 uintmax_t q
= n
- 1;
1215 for (k
= 0; (q
& 1) == 0; k
++)
1219 binv (ni
, n
); /* ni <- 1/n mod B */
1220 redcify (one
, 1, n
);
1221 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1223 /* Perform a Miller-Rabin test, finds most composites quickly. */
1224 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1227 if (flag_prove_primality
)
1229 /* Factor n-1 for Lucas. */
1230 factor (0, n
- 1, &factors
);
1233 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234 number composite. */
1235 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1237 if (flag_prove_primality
)
1240 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1243 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1248 /* After enough Miller-Rabin runs, be content. */
1249 is_prime
= (r
== MR_REPS
- 1);
1255 a
+= primes_diff
[r
]; /* Establish new base. */
1257 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1258 on most processors, since it avoids udiv_qrnnd. If we go down the
1259 udiv_qrnnd_preinv path, this code should be replaced. */
1262 umul_ppmm (s1
, s0
, one
, a
);
1263 if (LIKELY (s1
== 0))
1267 uintmax_t dummy _GL_UNUSED
;
1268 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1272 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1276 error (0, 0, _("Lucas prime test failure. This should not happen"));
1281 prime2_p (uintmax_t n1
, uintmax_t n0
)
1283 uintmax_t q
[2], nm1
[2];
1284 uintmax_t a_prim
[2];
1289 struct factors factors
;
1292 return prime_p (n0
);
1294 nm1
[1] = n1
- (n0
== 0);
1298 count_trailing_zeros (k
, nm1
[1]);
1306 count_trailing_zeros (k
, nm1
[0]);
1307 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1312 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1313 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1315 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1319 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1322 if (flag_prove_primality
)
1324 /* Factor n-1 for Lucas. */
1325 factor (nm1
[1], nm1
[0], &factors
);
1328 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1329 number composite. */
1330 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1333 uintmax_t e
[2], y
[2];
1335 if (flag_prove_primality
)
1338 if (factors
.plarge
[1])
1341 binv (pi
, factors
.plarge
[0]);
1344 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1345 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1347 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1349 /* FIXME: We always have the factor 2. Do we really need to
1350 handle it here? We have done the same powering as part
1352 if (factors
.p
[i
] == 2)
1353 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1355 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1356 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1357 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1362 /* After enough Miller-Rabin runs, be content. */
1363 is_prime
= (r
== MR_REPS
- 1);
1369 a
+= primes_diff
[r
]; /* Establish new base. */
1370 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1372 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1376 error (0, 0, _("Lucas prime test failure. This should not happen"));
1382 mp_prime_p (mpz_t n
)
1385 mpz_t q
, a
, nm1
, tmp
;
1386 struct mp_factors factors
;
1388 if (mpz_cmp_ui (n
, 1) <= 0)
1391 /* We have already casted out small primes. */
1392 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1395 mpz_inits (q
, a
, nm1
, tmp
, NULL
);
1397 /* Precomputation for Miller-Rabin. */
1398 mpz_sub_ui (nm1
, n
, 1);
1400 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1401 unsigned long int k
= mpz_scan1 (nm1
, 0);
1402 mpz_tdiv_q_2exp (q
, nm1
, k
);
1406 /* Perform a Miller-Rabin test, finds most composites quickly. */
1407 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1413 if (flag_prove_primality
)
1415 /* Factor n-1 for Lucas. */
1417 mp_factor (tmp
, &factors
);
1420 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1421 number composite. */
1422 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1424 if (flag_prove_primality
)
1427 for (unsigned long int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1429 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1430 mpz_powm (tmp
, a
, tmp
, n
);
1431 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1436 /* After enough Miller-Rabin runs, be content. */
1437 is_prime
= (r
== MR_REPS
- 1);
1443 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1445 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1452 error (0, 0, _("Lucas prime test failure. This should not happen"));
1456 if (flag_prove_primality
)
1457 mp_factor_clear (&factors
);
1459 mpz_clears (q
, a
, nm1
, tmp
, NULL
);
1466 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1467 struct factors
*factors
)
1469 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1471 unsigned long int k
= 1;
1472 unsigned long int l
= 1;
1475 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1482 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1488 x
= mulredc (x
, x
, n
, ni
);
1489 addmod (x
, x
, a
, n
);
1491 submod (t
, z
, x
, n
);
1492 P
= mulredc (P
, t
, n
, ni
);
1496 if (gcd_odd (P
, n
) != 1)
1506 for (unsigned long int i
= 0; i
< k
; i
++)
1508 x
= mulredc (x
, x
, n
, ni
);
1509 addmod (x
, x
, a
, n
);
1517 y
= mulredc (y
, y
, n
, ni
);
1518 addmod (y
, y
, a
, n
);
1520 submod (t
, z
, y
, n
);
1527 /* Found n itself as factor. Restart with different params. */
1528 factor_using_pollard_rho (n
, a
+ 1, factors
);
1535 factor_using_pollard_rho (g
, a
+ 1, factors
);
1537 factor_insert (factors
, g
);
1541 factor_insert (factors
, n
);
1552 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1553 struct factors
*factors
)
1555 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1557 unsigned long int k
= 1;
1558 unsigned long int l
= 1;
1560 redcify2 (P1
, P0
, 1, n1
, n0
);
1561 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1565 while (n1
!= 0 || n0
!= 1)
1573 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1575 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1577 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1578 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1583 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1584 if (g1
!= 0 || g0
!= 1)
1594 for (unsigned long int i
= 0; i
< k
; i
++)
1596 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1598 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1606 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1608 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1610 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1611 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1613 while (g1
== 0 && g0
== 1);
1617 /* The found factor is one word, and > 1. */
1618 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1621 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1623 factor_insert (factors
, g0
);
1627 /* The found factor is two words. This is highly unlikely, thus hard
1628 to trigger. Please be careful before you change this code! */
1631 if (n1
== g1
&& n0
== g0
)
1633 /* Found n itself as factor. Restart with different params. */
1634 factor_using_pollard_rho2 (n1
, n0
, a
+ 1, factors
);
1638 binv (ginv
, g0
); /* Compute n = n / g. Since the result will */
1639 n0
= ginv
* n0
; /* fit one word, we can compute the quotient */
1640 n1
= 0; /* modulo B, ignoring the high divisor word. */
1642 if (!prime2_p (g1
, g0
))
1643 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1645 factor_insert_large (factors
, g1
, g0
);
1652 factor_insert (factors
, n0
);
1656 factor_using_pollard_rho (n0
, a
, factors
);
1660 if (prime2_p (n1
, n0
))
1662 factor_insert_large (factors
, n1
, n0
);
1666 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1667 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1668 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1674 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1675 struct mp_factors
*factors
)
1680 devmsg ("[pollard-rho (%lu)] ", a
);
1682 mpz_inits (t
, t2
, NULL
);
1683 mpz_init_set_si (y
, 2);
1684 mpz_init_set_si (x
, 2);
1685 mpz_init_set_si (z
, 2);
1686 mpz_init_set_ui (P
, 1);
1688 unsigned long long int k
= 1;
1689 unsigned long long int l
= 1;
1691 while (mpz_cmp_ui (n
, 1) != 0)
1699 mpz_add_ui (x
, x
, a
);
1708 if (mpz_cmp_ui (t
, 1) != 0)
1718 for (unsigned long long int i
= 0; i
< k
; i
++)
1722 mpz_add_ui (x
, x
, a
);
1732 mpz_add_ui (y
, y
, a
);
1737 while (mpz_cmp_ui (t
, 1) == 0);
1739 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1741 if (!mp_prime_p (t
))
1743 devmsg ("[composite factor--restarting pollard-rho] ");
1744 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1748 mp_factor_insert (factors
, t
);
1753 mp_factor_insert (factors
, n
);
1762 mpz_clears (P
, t2
, t
, z
, x
, y
, NULL
);
1767 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1768 algorithm is replaced, consider also returning the remainder. */
1769 static uintmax_t _GL_ATTRIBUTE_CONST
1777 count_leading_zeros (c
, n
);
1779 /* Make x > sqrt(n). This will be invariant through the loop. */
1780 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) / 2);
1784 uintmax_t y
= (x
+ n
/x
) / 2;
1792 static uintmax_t _GL_ATTRIBUTE_CONST
1793 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1798 /* Ensures the remainder fits in an uintmax_t. */
1799 assert (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1804 count_leading_zeros (shift
, nh
);
1807 /* Make x > sqrt(n) */
1808 x
= isqrt ( (nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1809 x
<<= (W_TYPE_SIZE
- shift
) / 2;
1811 /* Do we need more than one iteration? */
1814 uintmax_t r _GL_UNUSED
;
1816 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1822 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1823 assert (gt2 (hi
, lo
, nh
, nl
));
1825 umul_ppmm (hi
, lo
, x
, x
);
1826 assert (ge2 (nh
, nl
, hi
, lo
));
1827 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1837 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1838 # define MAGIC64 0x0202021202030213ULL
1839 # define MAGIC63 0x0402483012450293ULL
1840 # define MAGIC65 0x218a019866014613ULL
1841 # define MAGIC11 0x23b
1843 /* Return the square root if the input is a square, otherwise 0. */
1844 static uintmax_t _GL_ATTRIBUTE_CONST
1845 is_square (uintmax_t x
)
1847 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1848 computing the square root. */
1849 if (((MAGIC64
>> (x
& 63)) & 1)
1850 && ((MAGIC63
>> (x
% 63)) & 1)
1851 /* Both 0 and 64 are squares mod (65) */
1852 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1853 && ((MAGIC11
>> (x
% 11) & 1)))
1855 uintmax_t r
= isqrt (x
);
1862 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1863 static const unsigned short invtab
[0x81] =
1866 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1867 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1868 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1869 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1870 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1871 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1872 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1873 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1874 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1875 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1876 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1877 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1878 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1879 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1880 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1881 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1884 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1885 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1886 # define div_smallq(q, r, u, d) \
1888 if ((u) / 0x40 < (d)) \
1891 uintmax_t _dinv, _mask, _q, _r; \
1892 count_leading_zeros (_cnt, (d)); \
1894 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1896 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1897 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1901 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1902 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1906 _mask = -(uintmax_t) (_r >= (d)); \
1907 (r) = _r - (_mask & (d)); \
1909 assert ( (q) * (d) + (r) == u); \
1913 uintmax_t _q = (u) / (d); \
1914 (r) = (u) - _q * (d); \
1919 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1920 = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1923 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1924 representing G_0 = (-55, 2*4652, 8653).
1926 In the notation of the paper:
1928 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1932 t_0 = floor([q_0 + R_0] / S0) = 1
1933 R_1 = t_0 * S_0 - R_0 = 4001
1934 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1937 /* Multipliers, in order of efficiency:
1938 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1939 0.7317 3*5*7 = 105 = 1
1940 0.7820 3*5*11 = 165 = 1
1942 0.8101 3*7*11 = 231 = 3
1944 0.8284 5*7*11 = 385 = 1
1946 0.8716 3*11 = 33 = 1
1948 0.8913 5*11 = 55 = 3
1950 0.9233 7*11 = 77 = 1
1954 # define QUEUE_SIZE 50
1958 # define Q_FREQ_SIZE 50
1959 /* Element 0 keeps the total */
1960 static unsigned int q_freq
[Q_FREQ_SIZE
+ 1];
1961 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1965 /* Return true on success. Expected to fail only for numbers
1966 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1968 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1970 /* Uses algorithm and notation from
1972 SQUARE FORM FACTORIZATION
1973 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1975 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1978 static const unsigned int multipliers_1
[] =
1980 105, 165, 21, 385, 33, 5, 77, 1, 0
1982 static const unsigned int multipliers_3
[] =
1984 1155, 15, 231, 35, 3, 55, 7, 11, 0
1987 const unsigned int *m
;
1989 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1991 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1994 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1996 if (n0
== sqrt_n
* sqrt_n
)
2000 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
2005 if (prime_p (sqrt_n
))
2006 factor_insert_multiplicity (factors
, sqrt_n
, 2);
2012 if (!factor_using_squfof (0, sqrt_n
, &f
))
2014 /* Try pollard rho instead */
2015 factor_using_pollard_rho (sqrt_n
, 1, &f
);
2017 /* Duplicate the new factors */
2018 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
2019 factor_insert_multiplicity (factors
, f
.p
[i
], 2*f
.e
[i
]);
2025 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2026 for (m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2029 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2031 unsigned int mu
= *m
;
2032 unsigned int qpos
= 0;
2034 assert (mu
* n0
% 4 == 3);
2036 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2037 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2038 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2041 However, this seems insufficient: With n = 37243139 and mu =
2042 105, we get a trivial factor, from the square 38809 = 197^2,
2043 without any corresponding Q earlier in the iteration.
2045 Requiring 64 mu^3 < n seems sufficient. */
2048 if ((uintmax_t) mu
*mu
*mu
>= n0
/ 64)
2053 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2056 umul_ppmm (Dh
, Dl
, n0
, mu
);
2059 assert (Dl
% 4 != 1);
2060 assert (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2062 S
= isqrt2 (Dh
, Dl
);
2067 /* Square root remainder fits in one word, so ignore high part. */
2069 /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2074 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2077 for (i
= 0; i
<= B
; i
++)
2079 uintmax_t q
, P1
, t
, rem
;
2081 div_smallq (q
, rem
, S
+P
, Q
);
2082 P1
= S
- rem
; /* P1 = q*Q - P */
2084 IF_LINT (assert (q
> 0 && Q
> 0));
2088 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2098 g
/= gcd_odd (g
, mu
);
2102 if (qpos
>= QUEUE_SIZE
)
2103 die (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2105 queue
[qpos
].P
= P
% g
;
2110 /* I think the difference can be either sign, but mod
2111 2^W_TYPE_SIZE arithmetic should be fine. */
2112 t
= Q1
+ q
* (P
- P1
);
2119 uintmax_t r
= is_square (Q
);
2122 for (unsigned int j
= 0; j
< qpos
; j
++)
2124 if (queue
[j
].Q
== r
)
2127 /* Traversed entire cycle. */
2128 goto next_multiplier
;
2130 /* Need the absolute value for divisibility test. */
2131 if (P
>= queue
[j
].P
)
2137 /* Delete entries up to and including entry
2138 j, which matched. */
2139 memmove (queue
, queue
+ j
+ 1,
2140 (qpos
- j
- 1) * sizeof (queue
[0]));
2147 /* We have found a square form, which should give a
2150 assert (S
>= P
); /* What signs are possible? */
2151 P
+= r
* ((S
- P
) / r
);
2153 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2154 for the case D = 2N. */
2155 /* Compute Q = (D - P*P) / Q1, but we need double
2158 umul_ppmm (hi
, lo
, P
, P
);
2159 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2160 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2165 /* Note: There appears to by a typo in the paper,
2166 Step 4a in the algorithm description says q <--
2167 floor([S+P]/\hat Q), but looking at the equations
2168 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2169 (In this code, \hat Q is Q1). */
2170 div_smallq (q
, rem
, S
+P
, Q
);
2171 P1
= S
- rem
; /* P1 = q*Q - P */
2175 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2179 t
= Q1
+ q
* (P
- P1
);
2187 Q
/= gcd_odd (Q
, mu
);
2189 assert (Q
> 1 && (n1
|| Q
< n0
));
2192 factor_insert (factors
, Q
);
2193 else if (!factor_using_squfof (0, Q
, factors
))
2194 factor_using_pollard_rho (Q
, 2, factors
);
2196 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2198 if (prime2_p (n1
, n0
))
2199 factor_insert_large (factors
, n1
, n0
);
2202 if (!factor_using_squfof (n1
, n0
, factors
))
2205 factor_using_pollard_rho (n0
, 1, factors
);
2207 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2222 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2223 results in FACTORS. */
2225 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2227 factors
->nfactors
= 0;
2228 factors
->plarge
[1] = 0;
2230 if (t1
== 0 && t0
< 2)
2233 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2235 if (t1
== 0 && t0
< 2)
2238 if (prime2_p (t1
, t0
))
2239 factor_insert_large (factors
, t1
, t0
);
2243 if (factor_using_squfof (t1
, t0
, factors
))
2248 factor_using_pollard_rho (t0
, 1, factors
);
2250 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2255 /* Use Pollard-rho to compute the prime factors of
2256 arbitrary-precision T, and put the results in FACTORS. */
2258 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2260 mp_factor_init (factors
);
2262 if (mpz_sgn (t
) != 0)
2264 mp_factor_using_division (t
, factors
);
2266 if (mpz_cmp_ui (t
, 1) != 0)
2268 devmsg ("[is number prime?] ");
2270 mp_factor_insert (factors
, t
);
2272 mp_factor_using_pollard_rho (t
, 1, factors
);
2279 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, const char *s
)
2281 unsigned int lo_carry
;
2282 uintmax_t hi
= 0, lo
= 0;
2284 strtol_error err
= LONGINT_INVALID
;
2286 /* Skip initial spaces and '+'. */
2301 /* Initial scan for invalid digits. */
2305 unsigned int c
= *p
++;
2309 if (UNLIKELY (!ISDIGIT (c
)))
2311 err
= LONGINT_INVALID
;
2315 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2318 for (;err
== LONGINT_OK
;)
2320 unsigned int c
= *s
++;
2326 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2328 err
= LONGINT_OVERFLOW
;
2333 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2334 lo_carry
+= 10 * lo
< 2 * lo
;
2341 if (UNLIKELY (hi
< lo_carry
))
2343 err
= LONGINT_OVERFLOW
;
2354 /* Structure and routines for buffering and outputting full lines,
2355 to support parallel operation efficiently. */
2362 /* 512 is chosen to give good performance,
2363 and also is the max guaranteed size that
2364 consumers can read atomically through pipes.
2365 Also it's big enough to cater for max line length
2366 even with 128 bit uintmax_t. */
2367 #define FACTOR_PIPE_BUF 512
2375 /* Double to ensure enough space for
2376 previous numbers + next number. */
2377 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2378 lbuf
.end
= lbuf
.buf
;
2381 /* Write complete LBUF to standard output. */
2385 size_t size
= lbuf
.end
- lbuf
.buf
;
2386 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2387 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
2388 lbuf
.end
= lbuf
.buf
;
2391 /* Add a character C to LBUF and if it's a newline
2392 and enough bytes are already buffered,
2393 then write atomically to standard output. */
2401 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2403 /* Provide immediate output for interactive input. */
2404 static int line_buffered
= -1;
2405 if (line_buffered
== -1)
2406 line_buffered
= isatty (STDIN_FILENO
);
2409 else if (buffered
>= FACTOR_PIPE_BUF
)
2411 /* Write output in <= PIPE_BUF chunks
2412 so consumers can read atomically. */
2413 char const *tend
= lbuf
.end
;
2415 /* Since a umaxint_t's factors must fit in 512
2416 we're guaranteed to find a newline here. */
2417 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2418 while (*--tlend
!= '\n');
2424 /* Buffer the remainder. */
2425 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2426 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2431 /* Buffer an int to the internal LBUF. */
2433 lbuf_putint (uintmax_t i
, size_t min_width
)
2435 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2436 char const *umaxstr
= umaxtostr (i
, buf
);
2437 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2440 for (; z
< min_width
; z
++)
2443 memcpy (lbuf
.end
, umaxstr
, width
);
2448 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2453 lbuf_putint (t0
, 0);
2456 /* Use very plain code here since it seems hard to write fast code
2457 without assuming a specific word size. */
2458 q
= t1
/ 1000000000;
2459 r
= t1
% 1000000000;
2460 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2461 print_uintmaxes (q
, t0
);
2466 /* Single-precision factoring */
2468 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2470 struct factors factors
;
2472 print_uintmaxes (t1
, t0
);
2475 factor (t1
, t0
, &factors
);
2477 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2478 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2481 print_uintmaxes (0, factors
.p
[j
]);
2484 if (factors
.plarge
[1])
2487 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2493 /* Emit the factors of the indicated number. If we have the option of using
2494 either algorithm, we select on the basis of the length of the number.
2495 For longer numbers, we prefer the MP algorithm even if the native algorithm
2496 has enough digits, because the algorithm is better. The turnover point
2497 depends on the value. */
2499 print_factors (const char *input
)
2503 /* Try converting the number to one or two words. If it fails, use GMP or
2504 print an error message. The 2nd condition checks that the most
2505 significant bit of the two-word number is clear, in a typesize neutral
2507 strtol_error err
= strto2uintmax (&t1
, &t0
, input
);
2512 if (((t1
<< 1) >> 1) == t1
)
2514 devmsg ("[using single-precision arithmetic] ");
2515 print_factors_single (t1
, t0
);
2520 case LONGINT_OVERFLOW
:
2525 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2530 devmsg ("[using arbitrary-precision arithmetic] ");
2532 struct mp_factors factors
;
2534 mpz_init_set_str (t
, input
, 10);
2536 gmp_printf ("%Zd:", t
);
2537 mp_factor (t
, &factors
);
2539 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2540 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2541 gmp_printf (" %Zd", factors
.p
[j
]);
2543 mp_factor_clear (&factors
);
2549 error (0, 0, _("%s is too large"), quote (input
));
2557 if (status
!= EXIT_SUCCESS
)
2562 Usage: %s [NUMBER]...\n\
2565 program_name
, program_name
);
2567 Print the prime factors of each specified integer NUMBER. If none\n\
2568 are specified on the command line, read them from standard input.\n\
2571 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2572 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2573 emit_ancillary_info (PROGRAM_NAME
);
2582 token_buffer tokenbuffer
;
2584 init_tokenbuffer (&tokenbuffer
);
2588 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2590 if (token_length
== (size_t) -1)
2592 ok
&= print_factors (tokenbuffer
.buffer
);
2594 free (tokenbuffer
.buffer
);
2600 main (int argc
, char **argv
)
2602 initialize_main (&argc
, &argv
);
2603 set_program_name (argv
[0]);
2604 setlocale (LC_ALL
, "");
2605 bindtextdomain (PACKAGE
, LOCALEDIR
);
2606 textdomain (PACKAGE
);
2609 atexit (close_stdout
);
2610 atexit (lbuf_flush
);
2613 while ((c
= getopt_long (argc
, argv
, "", long_options
, NULL
)) != -1)
2617 case DEV_DEBUG_OPTION
:
2621 case_GETOPT_HELP_CHAR
;
2623 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2626 usage (EXIT_FAILURE
);
2631 memset (q_freq
, 0, sizeof (q_freq
));
2640 for (int i
= optind
; i
< argc
; i
++)
2641 if (! print_factors (argv
[i
]))
2649 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2650 for (unsigned int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2652 double f
= (double) q_freq
[i
] / q_freq
[0];
2654 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2655 100.0 * f
, 100.0 * acc_f
);
2660 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;