1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
40 (1) Perform trial division using a small primes table, but without hardware
41 division since the primes table store inverses modulo the word base.
42 (The GMP variant of this code doesn't make use of the precomputed
43 inverses, but instead relies on GMP for fast divisibility testing.)
44 (2) Check the nature of any non-factored part using Miller-Rabin for
45 detecting composites, and Lucas for detecting primes.
46 (3) Factor any remaining composite part using the Pollard-Brent rho
47 algorithm or if USE_SQUFOF is defined to 1, try that first.
48 Status of found factors are checked again using Miller-Rabin and Lucas.
50 We prefer using Hensel norm in the divisions, not the more familiar
51 Euclidean norm, since the former leads to much faster code. In the
52 Pollard-Brent rho code and the prime testing code, we use Montgomery's
53 trick of multiplying all n-residues by the word base, allowing cheap Hensel
56 The GMP code uses an algorithm that can be considerably slower;
57 for example, on a circa-2017 Intel Xeon Silver 4116, factoring
58 2^{127}-3 takes about 50 ms with the two-word algorithm but would
59 take about 750 ms with the GMP code.
63 * Use modular inverses also for exact division in the Lucas code, and
64 elsewhere. A problem is to locate the inverses not from an index, but
65 from a prime. We might instead compute the inverse on-the-fly.
67 * Tune trial division table size (not forgetting that this is a standalone
68 program where the table will be read from secondary storage for
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidean)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
111 #include "full-write.h"
113 #include "readtokens.h"
116 /* The official name of this program (e.g., no 'g' prefix). */
117 #define PROGRAM_NAME "factor"
120 proper_name ("Paul Rubin"), \
121 proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
122 proper_name_lite ("Niels Moller", "Niels M\303\266ller")
124 /* Token delimiters when reading from a file. */
125 #define DELIM "\n\t "
127 #ifndef USE_LONGLONG_H
128 /* With the way we use longlong.h, it's only safe to use
129 when UWtype = UHWtype, as there were various cases
130 (as can be seen in the history for longlong.h) where
131 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
132 to avoid compile time or run time issues. */
133 # if LONG_MAX == INTMAX_MAX
134 # define USE_LONGLONG_H 1
140 /* Make definitions for longlong.h to make it do what it can do for us */
142 /* bitcount for uintmax_t */
143 # if UINTMAX_MAX == UINT32_MAX
144 # define W_TYPE_SIZE 32
145 # elif UINTMAX_MAX == UINT64_MAX
146 # define W_TYPE_SIZE 64
147 # elif UINTMAX_MAX == UINT128_MAX
148 # define W_TYPE_SIZE 128
151 # define UWtype uintmax_t
152 # define UHWtype unsigned long int
154 # if HAVE_ATTRIBUTE_MODE
155 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
156 typedef int SItype
__attribute__ ((mode (SI
)));
157 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
158 typedef int DItype
__attribute__ ((mode (DI
)));
159 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
161 typedef unsigned char UQItype
;
163 typedef unsigned long int USItype
;
164 # if HAVE_LONG_LONG_INT
165 typedef long long int DItype
;
166 typedef unsigned long long int UDItype
;
167 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
168 typedef long int DItype
;
169 typedef unsigned long int UDItype
;
172 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
173 # define ASSERT(x) /* FIXME make longlong.h really standalone */
174 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
175 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
176 # ifndef __GMP_GNUC_PREREQ
177 # define __GMP_GNUC_PREREQ(a,b) 1
180 /* These stub macros are only used in longlong.h in certain system compiler
181 combinations, so ensure usage to avoid -Wunused-macros warnings. */
182 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
188 # define HAVE_HOST_CPU_FAMILY_powerpc 1
190 # include "longlong.h"
191 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
192 const unsigned char factor_clz_tab
[129] =
194 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
195 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
196 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
197 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
202 #else /* not USE_LONGLONG_H */
204 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
205 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
206 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
207 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
211 #if !defined __clz_tab && !defined UHWtype
212 /* Without this seemingly useless conditional, gcc -Wunused-macros
213 warns that each of the two tested macros is unused on Fedora 18.
214 FIXME: this is just an ugly band-aid. Fix it properly. */
217 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
218 #define MAX_NFACTS 26
222 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
225 static struct option
const long_options
[] =
227 {"exponents", no_argument
, nullptr, 'h'},
228 {"-debug", no_argument
, nullptr, DEV_DEBUG_OPTION
},
229 {GETOPT_HELP_OPTION_DECL
},
230 {GETOPT_VERSION_OPTION_DECL
},
231 {nullptr, 0, nullptr, 0}
234 /* If true, use p^e output format. */
235 static bool print_exponents
;
239 uintmax_t plarge
[2]; /* Can have a single large factor */
240 uintmax_t p
[MAX_NFACTS
];
241 unsigned char e
[MAX_NFACTS
];
242 unsigned char nfactors
;
248 unsigned long int *e
;
253 static void factor (uintmax_t, uintmax_t, struct factors
*);
256 # define umul_ppmm(w1, w0, u, v) \
258 uintmax_t __x0, __x1, __x2, __x3; \
259 unsigned long int __ul, __vl, __uh, __vh; \
260 uintmax_t __u = (u), __v = (v); \
262 __ul = __ll_lowpart (__u); \
263 __uh = __ll_highpart (__u); \
264 __vl = __ll_lowpart (__v); \
265 __vh = __ll_highpart (__v); \
267 __x0 = (uintmax_t) __ul * __vl; \
268 __x1 = (uintmax_t) __ul * __vh; \
269 __x2 = (uintmax_t) __uh * __vl; \
270 __x3 = (uintmax_t) __uh * __vh; \
272 __x1 += __ll_highpart (__x0);/* This can't give carry. */ \
273 __x1 += __x2; /* But this indeed can. */ \
274 if (__x1 < __x2) /* Did we get it? */ \
275 __x3 += __ll_B; /* Yes, add it in the proper pos. */ \
277 (w1) = __x3 + __ll_highpart (__x1); \
278 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
282 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
283 /* Define our own, not needing normalization. This function is
284 currently not performance critical, so keep it simple. Similar to
285 the mod macro below. */
287 # define udiv_qrnnd(q, r, n1, n0, d) \
289 uintmax_t __d1, __d0, __q, __r1, __r0; \
291 __d1 = (d); __d0 = 0; \
292 __r1 = (n1); __r0 = (n0); \
293 affirm (__r1 < __d1); \
295 for (int __i = W_TYPE_SIZE; __i > 0; __i--) \
297 rsh2 (__d1, __d0, __d1, __d0, 1); \
299 if (ge2 (__r1, __r0, __d1, __d0)) \
302 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
310 #if !defined add_ssaaaa
311 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
314 _add_x = (al) + (bl); \
315 (sh) = (ah) + (bh) + (_add_x < (al)); \
320 #define rsh2(rh, rl, ah, al, cnt) \
322 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
323 (rh) = (ah) >> (cnt); \
326 #define lsh2(rh, rl, ah, al, cnt) \
328 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
329 (rl) = (al) << (cnt); \
332 #define ge2(ah, al, bh, bl) \
333 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
335 #define gt2(ah, al, bh, bl) \
336 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
339 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
343 (rl) = (al) - (bl); \
344 (rh) = (ah) - (bh) - _cy; \
348 #ifndef count_leading_zeros
349 # define count_leading_zeros(count, x) do { \
350 uintmax_t __clz_x = (x); \
353 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
356 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
362 #ifndef count_trailing_zeros
363 # define count_trailing_zeros(count, x) do { \
364 uintmax_t __ctz_x = (x); \
366 while ((__ctz_x & 1) == 0) \
375 /* Requires that a < n and b <= n */
376 #define submod(r,a,b,n) \
378 uintmax_t _t = - (uintmax_t) (a < b); \
379 (r) = ((n) & _t) + (a) - (b); \
382 #define addmod(r,a,b,n) \
383 submod ((r), (a), ((n) - (b)), (n))
385 /* Modular two-word addition and subtraction. For performance reasons, the
386 most significant bit of n1 must be clear. The destination variables must be
387 distinct from the mod operand. */
388 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
390 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
391 if (ge2 ((r1), (r0), (n1), (n0))) \
392 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
394 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
396 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
397 if ((intmax_t) (r1) < 0) \
398 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
401 #define HIGHBIT_TO_MASK(x) \
402 (((intmax_t)-1 >> 1) < 0 \
403 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
404 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
405 ? UINTMAX_MAX : (uintmax_t) 0))
407 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
408 Requires that d1 != 0. */
410 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
422 count_leading_zeros (cntd
, d1
);
423 count_leading_zeros (cnta
, a1
);
424 int cnt
= cntd
- cnta
;
425 lsh2 (d1
, d0
, d1
, d0
, cnt
);
426 for (int i
= 0; i
< cnt
; i
++)
428 if (ge2 (a1
, a0
, d1
, d0
))
429 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
430 rsh2 (d1
, d0
, d1
, d0
, 1);
439 gcd_odd (uintmax_t a
, uintmax_t b
)
450 /* Take out least significant one bit, to make room for sign */
466 bgta
= HIGHBIT_TO_MASK (t
);
468 /* b <-- min (a, b) */
472 a
= (t
^ bgta
) - bgta
;
477 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
487 while ((a0
& 1) == 0)
488 rsh2 (a1
, a0
, a1
, a0
, 1);
495 return gcd_odd (b0
, a0
);
498 if (gt2 (a1
, a0
, b1
, b0
))
500 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
502 rsh2 (a1
, a0
, a1
, a0
, 1);
503 while ((a0
& 1) == 0);
505 else if (gt2 (b1
, b0
, a1
, a0
))
507 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
509 rsh2 (b1
, b0
, b1
, b0
, 1);
510 while ((b0
& 1) == 0);
521 factor_insert_multiplicity (struct factors
*factors
,
522 uintmax_t prime
, int m
)
524 int nfactors
= factors
->nfactors
;
525 uintmax_t *p
= factors
->p
;
526 unsigned char *e
= factors
->e
;
528 /* Locate position for insert new or increment e. */
530 for (i
= nfactors
- 1; i
>= 0; i
--)
536 if (i
< 0 || p
[i
] != prime
)
538 for (int j
= nfactors
- 1; j
> i
; j
--)
545 factors
->nfactors
= nfactors
+ 1;
553 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
556 factor_insert_large (struct factors
*factors
,
557 uintmax_t p1
, uintmax_t p0
)
561 affirm (factors
->plarge
[1] == 0);
562 factors
->plarge
[0] = p0
;
563 factors
->plarge
[1] = p1
;
566 factor_insert (factors
, p0
);
573 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
574 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
577 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
581 va_start (ap
, mpz_single_init
);
584 while ((mpz
= va_arg (ap
, mpz_t
*)))
585 mpz_single_init (*mpz
);
591 static void mp_factor (mpz_t
, struct mp_factors
*);
594 mp_factor_init (struct mp_factors
*factors
)
596 factors
->p
= nullptr;
597 factors
->e
= nullptr;
598 factors
->nfactors
= 0;
603 mp_factor_clear (struct mp_factors
*factors
)
605 for (idx_t i
= 0; i
< factors
->nfactors
; i
++)
606 mpz_clear (factors
->p
[i
]);
613 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
615 idx_t nfactors
= factors
->nfactors
;
616 mpz_t
*p
= factors
->p
;
617 unsigned long int *e
= factors
->e
;
620 /* Locate position for insert new or increment e. */
621 for (i
= nfactors
- 1; i
>= 0; i
--)
623 if (mpz_cmp (p
[i
], prime
) <= 0)
627 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
629 if (factors
->nfactors
== factors
->nalloc
)
631 p
= xpalloc (p
, &factors
->nalloc
, 1, -1, sizeof *p
);
632 e
= xireallocarray (e
, factors
->nalloc
, sizeof *e
);
635 mpz_init (p
[nfactors
]);
636 for (ptrdiff_t j
= nfactors
- 1; j
> i
; j
--)
638 mpz_set (p
[j
+ 1], p
[j
]);
641 mpz_set (p
[i
+ 1], prime
);
646 factors
->nfactors
= nfactors
+ 1;
655 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
659 mpz_init_set_ui (pz
, prime
);
660 mp_factor_insert (factors
, pz
);
665 /* Number of bits in an uintmax_t. */
666 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
668 /* Verify that uintmax_t does not have holes in its representation. */
669 static_assert (UINTMAX_MAX
>> (W
- 1) == 1);
671 #define P(a,b,c,d) a,
672 static const unsigned char primes_diff
[] = {
674 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
678 #define PRIMES_PTAB_ENTRIES \
679 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
681 #define P(a,b,c,d) b,
682 static const unsigned char primes_diff8
[] = {
684 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
693 #define P(a,b,c,d) {c,d},
694 static const struct primes_dtab primes_dtab
[] = {
696 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
700 /* Verify that uintmax_t is not wider than
701 the integers used to generate primes.h. */
702 static_assert (W
<= WIDE_UINT_BITS
);
704 /* debugging for developers. Enables devmsg().
705 This flag is used only in the GMP code. */
706 static bool dev_debug
= false;
708 /* Prove primality or run probabilistic tests. */
709 static bool flag_prove_primality
= PROVE_PRIMALITY
;
711 /* Number of Miller-Rabin tests to run when not proving primality. */
715 factor_insert_refind (struct factors
*factors
, uintmax_t p
, int i
, int off
)
717 for (int j
= 0; j
< off
; j
++)
718 p
+= primes_diff
[i
+ j
];
719 factor_insert (factors
, p
);
722 /* Trial division with odd primes uses the following trick.
724 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
725 consider the case t < B (this is the second loop below).
727 From our tables we get
729 binv = p^{-1} (mod B)
730 lim = floor ((B-1) / p).
732 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
733 (and all quotients in this range occur for some t).
735 Then t = q * p is true also (mod B), and p is invertible we get
737 q = t * binv (mod B).
739 Next, assume that t is *not* divisible by p. Since multiplication
740 by binv (mod B) is a one-to-one mapping,
742 t * binv (mod B) > lim,
744 because all the smaller values are already taken.
746 This can be summed up by saying that the function
748 q(t) = binv * t (mod B)
750 is a permutation of the range 0 <= t < B, with the curious property
751 that it maps the multiples of p onto the range 0 <= q <= lim, in
752 order, and the non-multiples of p onto the range lim < q < B.
756 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
757 struct factors
*factors
)
765 count_trailing_zeros (cnt
, t1
);
772 count_trailing_zeros (cnt
, t0
);
773 rsh2 (t1
, t0
, t1
, t0
, cnt
);
776 factor_insert_multiplicity (factors
, 2, cnt
);
781 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
785 uintmax_t q1
, q0
, hi
;
786 MAYBE_UNUSED
uintmax_t lo
;
788 q0
= t0
* primes_dtab
[i
].binv
;
789 umul_ppmm (hi
, lo
, q0
, p
);
793 q1
= hi
* primes_dtab
[i
].binv
;
794 if (LIKELY (q1
> primes_dtab
[i
].lim
))
797 factor_insert (factors
, p
);
799 p
+= primes_diff
[i
+ 1];
804 #define DIVBLOCK(I) \
808 q = t0 * pd[I].binv; \
809 if (LIKELY (q > pd[I].lim)) \
812 factor_insert_refind (factors, p, i + 1, I); \
816 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
819 const struct primes_dtab
*pd
= &primes_dtab
[i
];
829 p
+= primes_diff8
[i
];
838 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
843 devmsg ("[trial division] ");
847 p
= mpz_scan1 (t
, 0);
848 mpz_fdiv_q_2exp (t
, t
, p
);
851 mp_factor_insert_ui (factors
, 2);
855 unsigned long int d
= 3;
856 for (idx_t i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
858 if (! mpz_divisible_ui_p (t
, d
))
860 d
+= primes_diff
[i
++];
861 if (mpz_cmp_ui (t
, d
* d
) < 0)
866 mpz_tdiv_q_ui (t
, t
, d
);
867 mp_factor_insert_ui (factors
, d
);
874 /* Entry i contains (2i+1)^(-1) mod 2^8. */
875 static const unsigned char binvert_table
[128] =
877 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
878 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
879 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
880 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
881 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
882 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
883 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
884 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
885 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
886 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
887 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
888 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
889 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
890 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
891 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
892 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
895 /* Compute n^(-1) mod B, using a Newton iteration. */
896 #define binv(inv,n) \
898 uintmax_t __n = (n); \
901 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
902 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
903 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
904 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
906 if (W_TYPE_SIZE > 64) \
908 int __invbits = 64; \
910 __inv = 2 * __inv - __inv * __inv * __n; \
912 } while (__invbits < W_TYPE_SIZE); \
918 /* q = u / d, assuming d|u. */
919 #define divexact_21(q1, q0, u1, u0, d) \
921 uintmax_t _di, _q0; \
927 MAYBE_UNUSED intmax_t _p0; \
928 umul_ppmm (_p1, _p0, _q0, d); \
929 (q1) = ((u1) - _p1) * _di; \
940 #define redcify(r_prim, r, n) \
942 MAYBE_UNUSED uintmax_t _redcify_q; \
943 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
946 /* x B^2 (mod n). Requires x > 0, n1 < B/2. */
947 #define redcify2(r1, r0, x, n1, n0) \
949 uintmax_t _r1, _r0, _i; \
952 _r1 = (x); _r0 = 0; \
957 _r1 = 0; _r0 = (x); \
958 _i = 2 * W_TYPE_SIZE; \
962 lsh2 (_r1, _r0, _r1, _r0, 1); \
963 if (ge2 (_r1, _r0, (n1), (n0))) \
964 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
970 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
971 Both a and b must be in redc form, the result will be in redc form too. */
972 static inline uintmax_t
973 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
975 uintmax_t rh
, rl
, q
, th
, xh
;
976 MAYBE_UNUSED
uintmax_t tl
;
978 umul_ppmm (rh
, rl
, a
, b
);
980 umul_ppmm (th
, tl
, q
, m
);
988 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
989 Both a and b must be in redc form, the result will be in redc form too.
990 For performance reasons, the most significant bit of m must be clear. */
992 mulredc2 (uintmax_t *r1p
,
993 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
994 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
996 uintmax_t r1
, r0
, q
, p1
, t1
, t0
, s1
, s0
;
997 MAYBE_UNUSED
uintmax_t p0
;
999 affirm ((a1
>> (W_TYPE_SIZE
- 1)) == 0);
1000 affirm ((b1
>> (W_TYPE_SIZE
- 1)) == 0);
1001 affirm ((m1
>> (W_TYPE_SIZE
- 1)) == 0);
1003 /* First compute a0 * <b1, b0> B^{-1}
1016 umul_ppmm (t1
, t0
, a0
, b0
);
1017 umul_ppmm (r1
, r0
, a0
, b1
);
1019 umul_ppmm (p1
, p0
, q
, m0
);
1020 umul_ppmm (s1
, s0
, q
, m1
);
1021 r0
+= (t0
!= 0); /* Carry */
1022 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1023 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1024 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1026 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1041 umul_ppmm (t1
, t0
, a1
, b0
);
1042 umul_ppmm (s1
, s0
, a1
, b1
);
1043 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1045 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1046 umul_ppmm (p1
, p0
, q
, m0
);
1047 umul_ppmm (s1
, s0
, q
, m1
);
1048 r0
+= (t0
!= 0); /* Carry */
1049 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1050 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1051 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1053 if (ge2 (r1
, r0
, m1
, m0
))
1054 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1062 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1071 b
= mulredc (b
, b
, n
, ni
);
1075 y
= mulredc (y
, b
, n
, ni
);
1082 powm2 (uintmax_t *r1m
,
1083 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1084 uintmax_t ni
, const uintmax_t *one
)
1086 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1098 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1102 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1105 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1108 for (e
= ep
[1]; e
> 0; e
>>= 1)
1112 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1115 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1124 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1125 int k
, uintmax_t one
)
1127 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1129 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1131 if (y
== one
|| y
== nm1
)
1134 for (int i
= 1; i
< k
; i
++)
1136 y
= mulredc (y
, y
, n
, ni
);
1146 ATTRIBUTE_PURE
static bool
1147 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1148 const uintmax_t *qp
, int k
, const uintmax_t *one
)
1150 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1152 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1155 if (y0
== one
[0] && y1
== one
[1])
1158 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1160 if (y0
== nm1_0
&& y1
== nm1_1
)
1163 for (int i
= 1; i
< k
; i
++)
1165 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1168 if (y0
== nm1_0
&& y1
== nm1_1
)
1170 if (y0
== one
[0] && y1
== one
[1])
1177 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1178 mpz_srcptr q
, mp_bitcnt_t k
)
1180 mpz_powm (y
, x
, q
, n
);
1182 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1185 for (mp_bitcnt_t i
= 1; i
< k
; i
++)
1187 mpz_powm_ui (y
, y
, 2, n
);
1188 if (mpz_cmp (y
, nm1
) == 0)
1190 if (mpz_cmp_ui (y
, 1) == 0)
1196 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1197 have been observed. The average seem to be about 2. */
1198 static bool ATTRIBUTE_PURE
1199 prime_p (uintmax_t n
)
1203 uintmax_t a_prim
, one
, ni
;
1204 struct factors factors
;
1209 /* We have already cast out small primes. */
1210 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1213 /* Precomputation for Miller-Rabin. */
1214 uintmax_t q
= n
- 1;
1215 for (k
= 0; (q
& 1) == 0; k
++)
1219 binv (ni
, n
); /* ni <- 1/n mod B */
1220 redcify (one
, 1, n
);
1221 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1223 /* Perform a Miller-Rabin test, finds most composites quickly. */
1224 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1227 if (flag_prove_primality
)
1229 /* Factor n-1 for Lucas. */
1230 factor (0, n
- 1, &factors
);
1233 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234 number composite. */
1235 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1237 if (flag_prove_primality
)
1240 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1243 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1248 /* After enough Miller-Rabin runs, be content. */
1249 is_prime
= (r
== MR_REPS
- 1);
1255 a
+= primes_diff
[r
]; /* Establish new base. */
1257 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1258 on most processors, since it avoids udiv_qrnnd. If we go down the
1259 udiv_qrnnd_preinv path, this code should be replaced. */
1262 umul_ppmm (s1
, s0
, one
, a
);
1263 if (LIKELY (s1
== 0))
1267 MAYBE_UNUSED
uintmax_t dummy
;
1268 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1272 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1276 affirm (!"Lucas prime test failure. This should not happen");
1279 static bool ATTRIBUTE_PURE
1280 prime2_p (uintmax_t n1
, uintmax_t n0
)
1282 uintmax_t q
[2], nm1
[2];
1283 uintmax_t a_prim
[2];
1288 struct factors factors
;
1291 return prime_p (n0
);
1293 nm1
[1] = n1
- (n0
== 0);
1297 count_trailing_zeros (k
, nm1
[1]);
1305 count_trailing_zeros (k
, nm1
[0]);
1306 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1311 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1312 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1314 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1318 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1321 if (flag_prove_primality
)
1323 /* Factor n-1 for Lucas. */
1324 factor (nm1
[1], nm1
[0], &factors
);
1327 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1328 number composite. */
1329 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1332 uintmax_t e
[2], y
[2];
1334 if (flag_prove_primality
)
1337 if (factors
.plarge
[1])
1340 binv (pi
, factors
.plarge
[0]);
1343 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1344 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1346 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1348 /* FIXME: We always have the factor 2. Do we really need to
1349 handle it here? We have done the same powering as part
1351 if (factors
.p
[i
] == 2)
1352 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1354 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1355 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1356 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1361 /* After enough Miller-Rabin runs, be content. */
1362 is_prime
= (r
== MR_REPS
- 1);
1368 a
+= primes_diff
[r
]; /* Establish new base. */
1369 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1371 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1375 affirm (!"Lucas prime test failure. This should not happen");
1379 mp_prime_p (mpz_t n
)
1382 mpz_t q
, a
, nm1
, tmp
;
1383 struct mp_factors factors
;
1385 if (mpz_cmp_ui (n
, 1) <= 0)
1388 /* We have already cast out small primes. */
1389 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1392 mpz_inits (q
, a
, nm1
, tmp
, nullptr);
1394 /* Precomputation for Miller-Rabin. */
1395 mpz_sub_ui (nm1
, n
, 1);
1397 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1398 mp_bitcnt_t k
= mpz_scan1 (nm1
, 0);
1399 mpz_tdiv_q_2exp (q
, nm1
, k
);
1403 /* Perform a Miller-Rabin test, finds most composites quickly. */
1404 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1410 if (flag_prove_primality
)
1412 /* Factor n-1 for Lucas. */
1414 mp_factor (tmp
, &factors
);
1417 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1418 number composite. */
1419 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1421 if (flag_prove_primality
)
1424 for (idx_t i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1426 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1427 mpz_powm (tmp
, a
, tmp
, n
);
1428 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1433 /* After enough Miller-Rabin runs, be content. */
1434 is_prime
= (r
== MR_REPS
- 1);
1440 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1442 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1449 affirm (!"Lucas prime test failure. This should not happen");
1452 if (flag_prove_primality
)
1453 mp_factor_clear (&factors
);
1455 mpz_clears (q
, a
, nm1
, tmp
, nullptr);
1461 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1462 struct factors
*factors
)
1464 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1466 unsigned long int k
= 1;
1467 unsigned long int l
= 1;
1470 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1477 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1483 x
= mulredc (x
, x
, n
, ni
);
1484 addmod (x
, x
, a
, n
);
1486 submod (t
, z
, x
, n
);
1487 P
= mulredc (P
, t
, n
, ni
);
1491 if (gcd_odd (P
, n
) != 1)
1501 for (unsigned long int i
= 0; i
< k
; i
++)
1503 x
= mulredc (x
, x
, n
, ni
);
1504 addmod (x
, x
, a
, n
);
1512 y
= mulredc (y
, y
, n
, ni
);
1513 addmod (y
, y
, a
, n
);
1515 submod (t
, z
, y
, n
);
1522 /* Found n itself as factor. Restart with different params. */
1523 factor_using_pollard_rho (n
, a
+ 1, factors
);
1530 factor_using_pollard_rho (g
, a
+ 1, factors
);
1532 factor_insert (factors
, g
);
1536 factor_insert (factors
, n
);
1547 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1548 struct factors
*factors
)
1550 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1552 unsigned long int k
= 1;
1553 unsigned long int l
= 1;
1555 redcify2 (P1
, P0
, 1, n1
, n0
);
1556 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1560 while (n1
!= 0 || n0
!= 1)
1568 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1570 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1572 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1573 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1578 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1579 if (g1
!= 0 || g0
!= 1)
1589 for (unsigned long int i
= 0; i
< k
; i
++)
1591 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1593 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1601 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1603 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1605 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1606 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1608 while (g1
== 0 && g0
== 1);
1612 /* The found factor is one word, and > 1. */
1613 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1616 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1618 factor_insert (factors
, g0
);
1622 /* The found factor is two words. This is highly unlikely, thus hard
1623 to trigger. Please be careful before you change this code! */
1626 if (n1
== g1
&& n0
== g0
)
1628 /* Found n itself as factor. Restart with different params. */
1629 factor_using_pollard_rho2 (n1
, n0
, a
+ 1, factors
);
1633 /* Compute n = n / g. Since the result will fit one word,
1634 we can compute the quotient modulo B, ignoring the high
1640 if (!prime2_p (g1
, g0
))
1641 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1643 factor_insert_large (factors
, g1
, g0
);
1650 factor_insert (factors
, n0
);
1654 factor_using_pollard_rho (n0
, a
, factors
);
1658 if (prime2_p (n1
, n0
))
1660 factor_insert_large (factors
, n1
, n0
);
1664 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1665 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1666 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1671 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1672 struct mp_factors
*factors
)
1677 devmsg ("[pollard-rho (%lu)] ", a
);
1679 mpz_inits (t
, t2
, nullptr);
1680 mpz_init_set_si (y
, 2);
1681 mpz_init_set_si (x
, 2);
1682 mpz_init_set_si (z
, 2);
1683 mpz_init_set_ui (P
, 1);
1685 unsigned long long int k
= 1;
1686 unsigned long long int l
= 1;
1688 while (mpz_cmp_ui (n
, 1) != 0)
1696 mpz_add_ui (x
, x
, a
);
1705 if (mpz_cmp_ui (t
, 1) != 0)
1715 for (unsigned long long int i
= 0; i
< k
; i
++)
1719 mpz_add_ui (x
, x
, a
);
1729 mpz_add_ui (y
, y
, a
);
1734 while (mpz_cmp_ui (t
, 1) == 0);
1736 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1738 if (!mp_prime_p (t
))
1740 devmsg ("[composite factor--restarting pollard-rho] ");
1741 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1745 mp_factor_insert (factors
, t
);
1750 mp_factor_insert (factors
, n
);
1759 mpz_clears (P
, t2
, t
, z
, x
, y
, nullptr);
1763 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1764 algorithm is replaced, consider also returning the remainder. */
1774 count_leading_zeros (c
, n
);
1776 /* Make x > sqrt(n). This will be invariant through the loop. */
1777 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) >> 1);
1781 uintmax_t y
= (x
+ n
/ x
) / 2;
1791 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1796 /* Ensures the remainder fits in an uintmax_t. */
1797 affirm (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1802 count_leading_zeros (shift
, nh
);
1805 /* Make x > sqrt (n). */
1806 x
= isqrt ((nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1807 x
<<= (W_TYPE_SIZE
- shift
) >> 1;
1809 /* Do we need more than one iteration? */
1812 MAYBE_UNUSED
uintmax_t r
;
1814 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1820 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1821 affirm (gt2 (hi
, lo
, nh
, nl
));
1823 umul_ppmm (hi
, lo
, x
, x
);
1824 affirm (ge2 (nh
, nl
, hi
, lo
));
1825 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1835 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1836 # define MAGIC64 0x0202021202030213ULL
1837 # define MAGIC63 0x0402483012450293ULL
1838 # define MAGIC65 0x218a019866014613ULL
1839 # define MAGIC11 0x23b
1841 /* Return the square root if the input is a square, otherwise 0. */
1844 is_square (uintmax_t x
)
1846 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1847 computing the square root. */
1848 if (((MAGIC64
>> (x
& 63)) & 1)
1849 && ((MAGIC63
>> (x
% 63)) & 1)
1850 /* Both 0 and 64 are squares mod (65). */
1851 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1852 && ((MAGIC11
>> (x
% 11) & 1)))
1854 uintmax_t r
= isqrt (x
);
1861 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1862 static short const invtab
[0x81] =
1865 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1866 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1867 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1868 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1869 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1870 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1871 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1872 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1873 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1874 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1875 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1876 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1877 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1878 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1879 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1880 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1883 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1884 that q < 0x40; here it instead uses a table of (Euclidean) inverses. */
1885 # define div_smallq(q, r, u, d) \
1887 if ((u) / 0x40 < (d)) \
1890 uintmax_t _dinv, _mask, _q, _r; \
1891 count_leading_zeros (_cnt, (d)); \
1893 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1895 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1896 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1900 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1901 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1905 _mask = -(uintmax_t) (_r >= (d)); \
1906 (r) = _r - (_mask & (d)); \
1908 affirm ((q) * (d) + (r) == u); \
1912 uintmax_t _q = (u) / (d); \
1913 (r) = (u) - _q * (d); \
1918 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1919 = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1922 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1923 representing G_0 = (-55, 2 * 4652, 8653).
1925 In the notation of the paper:
1927 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1931 t_0 = floor([q_0 + R_0] / S0) = 1
1932 R_1 = t_0 * S_0 - R_0 = 4001
1933 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1936 /* Multipliers, in order of efficiency:
1937 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1938 0.7317 3*5*7 = 105 = 1
1939 0.7820 3*5*11 = 165 = 1
1941 0.8101 3*7*11 = 231 = 3
1943 0.8284 5*7*11 = 385 = 1
1945 0.8716 3*11 = 33 = 1
1947 0.8913 5*11 = 55 = 3
1949 0.9233 7*11 = 77 = 1
1953 # define QUEUE_SIZE 50
1957 # define Q_FREQ_SIZE 50
1958 /* Element 0 keeps the total */
1959 static int q_freq
[Q_FREQ_SIZE
+ 1];
1963 /* Return true on success. Expected to fail only for numbers
1964 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1966 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1968 /* Uses algorithm and notation from
1970 SQUARE FORM FACTORIZATION
1971 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1973 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1976 static short const multipliers_1
[] =
1978 105, 165, 21, 385, 33, 5, 77, 1, 0
1980 static short const multipliers_3
[] =
1982 1155, 15, 231, 35, 3, 55, 7, 11, 0
1985 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1987 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1990 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1992 if (n0
== sqrt_n
* sqrt_n
)
1996 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
2001 if (prime_p (sqrt_n
))
2002 factor_insert_multiplicity (factors
, sqrt_n
, 2);
2008 if (!factor_using_squfof (0, sqrt_n
, &f
))
2010 /* Try pollard rho instead */
2011 factor_using_pollard_rho (sqrt_n
, 1, &f
);
2013 /* Duplicate the new factors */
2014 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
2015 factor_insert_multiplicity (factors
, f
.p
[i
], 2 * f
.e
[i
]);
2021 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2022 for (short const *m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2025 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2027 unsigned int mu
= *m
;
2030 affirm (mu
* n0
% 4 == 3);
2032 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2033 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2034 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2037 However, this seems insufficient: With n = 37243139 and mu =
2038 105, we get a trivial factor, from the square 38809 = 197^2,
2039 without any corresponding Q earlier in the iteration.
2041 Requiring 64 mu^3 < n seems sufficient. */
2044 if ((uintmax_t) mu
* mu
* mu
>= n0
/ 64)
2049 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2052 umul_ppmm (Dh
, Dl
, n0
, mu
);
2055 affirm (Dl
% 4 != 1);
2056 affirm (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2058 S
= isqrt2 (Dh
, Dl
);
2063 /* Square root remainder fits in one word, so ignore high part. */
2065 /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))? */
2070 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2073 for (i
= 0; i
<= B
; i
++)
2075 uintmax_t q
, P1
, t
, rem
;
2077 div_smallq (q
, rem
, S
+ P
, Q
);
2078 P1
= S
- rem
; /* P1 = q*Q - P */
2080 affirm (q
> 0 && Q
> 0);
2084 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2094 g
/= gcd_odd (g
, mu
);
2098 if (qpos
>= QUEUE_SIZE
)
2099 error (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2101 queue
[qpos
].P
= P
% g
;
2106 /* I think the difference can be either sign, but mod
2107 2^W_TYPE_SIZE arithmetic should be fine. */
2108 t
= Q1
+ q
* (P
- P1
);
2115 uintmax_t r
= is_square (Q
);
2118 for (int j
= 0; j
< qpos
; j
++)
2120 if (queue
[j
].Q
== r
)
2123 /* Traversed entire cycle. */
2124 goto next_multiplier
;
2126 /* Need the absolute value for divisibility test. */
2127 if (P
>= queue
[j
].P
)
2133 /* Delete entries up to and including entry
2134 j, which matched. */
2135 memmove (queue
, queue
+ j
+ 1,
2136 (qpos
- j
- 1) * sizeof (queue
[0]));
2143 /* We have found a square form, which should give a
2146 affirm (S
>= P
); /* What signs are possible? */
2147 P
+= r
* ((S
- P
) / r
);
2149 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2150 for the case D = 2N. */
2151 /* Compute Q = (D - P*P) / Q1, but we need double
2154 umul_ppmm (hi
, lo
, P
, P
);
2155 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2156 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2161 /* Note: There appears to by a typo in the paper,
2162 Step 4a in the algorithm description says q <--
2163 floor([S+P]/\hat Q), but looking at the equations
2164 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2165 (In this code, \hat Q is Q1). */
2166 div_smallq (q
, rem
, S
+ P
, Q
);
2167 P1
= S
- rem
; /* P1 = q*Q - P */
2171 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2175 t
= Q1
+ q
* (P
- P1
);
2183 Q
/= gcd_odd (Q
, mu
);
2185 affirm (Q
> 1 && (n1
|| Q
< n0
));
2188 factor_insert (factors
, Q
);
2189 else if (!factor_using_squfof (0, Q
, factors
))
2190 factor_using_pollard_rho (Q
, 2, factors
);
2192 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2194 if (prime2_p (n1
, n0
))
2195 factor_insert_large (factors
, n1
, n0
);
2198 if (!factor_using_squfof (n1
, n0
, factors
))
2201 factor_using_pollard_rho (n0
, 1, factors
);
2203 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2218 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2219 results in FACTORS. */
2221 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2223 factors
->nfactors
= 0;
2224 factors
->plarge
[1] = 0;
2226 if (t1
== 0 && t0
< 2)
2229 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2231 if (t1
== 0 && t0
< 2)
2234 if (prime2_p (t1
, t0
))
2235 factor_insert_large (factors
, t1
, t0
);
2239 if (factor_using_squfof (t1
, t0
, factors
))
2244 factor_using_pollard_rho (t0
, 1, factors
);
2246 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2250 /* Use Pollard-rho to compute the prime factors of
2251 arbitrary-precision T, and put the results in FACTORS. */
2253 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2255 mp_factor_init (factors
);
2257 if (mpz_sgn (t
) != 0)
2259 mp_factor_using_division (t
, factors
);
2261 if (mpz_cmp_ui (t
, 1) != 0)
2263 devmsg ("[is number prime?] ");
2265 mp_factor_insert (factors
, t
);
2267 mp_factor_using_pollard_rho (t
, 1, factors
);
2273 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, char const *s
)
2276 uintmax_t hi
= 0, lo
= 0;
2278 strtol_error err
= LONGINT_INVALID
;
2280 /* Initial scan for invalid digits. */
2284 unsigned char c
= *p
++;
2288 if (UNLIKELY (!ISDIGIT (c
)))
2290 err
= LONGINT_INVALID
;
2294 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2297 while (err
== LONGINT_OK
)
2299 unsigned char c
= *s
++;
2305 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2307 err
= LONGINT_OVERFLOW
;
2312 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2313 lo_carry
+= 10 * lo
< 2 * lo
;
2320 if (UNLIKELY (hi
< lo_carry
))
2322 err
= LONGINT_OVERFLOW
;
2333 /* Structure and routines for buffering and outputting full lines,
2334 to support parallel operation efficiently. */
2341 /* 512 is chosen to give good performance,
2342 and also is the max guaranteed size that
2343 consumers can read atomically through pipes.
2344 Also it's big enough to cater for max line length
2345 even with 128 bit uintmax_t. */
2346 #define FACTOR_PIPE_BUF 512
2354 /* Double to ensure enough space for
2355 previous numbers + next number. */
2356 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2357 lbuf
.end
= lbuf
.buf
;
2360 /* Write complete LBUF to standard output. */
2364 size_t size
= lbuf
.end
- lbuf
.buf
;
2365 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2367 lbuf
.end
= lbuf
.buf
;
2370 /* Add a character C to LBUF and if it's a newline
2371 and enough bytes are already buffered,
2372 then write atomically to standard output. */
2380 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2382 /* Provide immediate output for interactive use. */
2383 static int line_buffered
= -1;
2384 if (line_buffered
== -1)
2385 line_buffered
= isatty (STDIN_FILENO
) || isatty (STDOUT_FILENO
);
2388 else if (buffered
>= FACTOR_PIPE_BUF
)
2390 /* Write output in <= PIPE_BUF chunks
2391 so consumers can read atomically. */
2392 char const *tend
= lbuf
.end
;
2394 /* Since a umaxint_t's factors must fit in 512
2395 we're guaranteed to find a newline here. */
2396 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2397 while (*--tlend
!= '\n');
2403 /* Buffer the remainder. */
2404 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2405 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2410 /* Buffer an int to the internal LBUF. */
2412 lbuf_putint (uintmax_t i
, size_t min_width
)
2414 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2415 char const *umaxstr
= umaxtostr (i
, buf
);
2416 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2419 for (; z
< min_width
; z
++)
2422 memcpy (lbuf
.end
, umaxstr
, width
);
2427 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2432 lbuf_putint (t0
, 0);
2435 /* Use very plain code here since it seems hard to write fast code
2436 without assuming a specific word size. */
2437 q
= t1
/ 1000000000;
2438 r
= t1
% 1000000000;
2439 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2440 print_uintmaxes (q
, t0
);
2445 /* Single-precision factoring */
2447 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2449 struct factors factors
;
2451 print_uintmaxes (t1
, t0
);
2454 factor (t1
, t0
, &factors
);
2456 for (int j
= 0; j
< factors
.nfactors
; j
++)
2457 for (int k
= 0; k
< factors
.e
[j
]; k
++)
2460 print_uintmaxes (0, factors
.p
[j
]);
2461 if (print_exponents
&& factors
.e
[j
] > 1)
2464 lbuf_putint (factors
.e
[j
], 0);
2469 if (factors
.plarge
[1])
2472 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2478 /* Emit the factors of the indicated number. If we have the option of using
2479 either algorithm, we select on the basis of the length of the number.
2480 For longer numbers, we prefer the MP algorithm even if the native algorithm
2481 has enough digits, because the algorithm is better. The turnover point
2482 depends on the value. */
2484 print_factors (char const *input
)
2486 /* Skip initial spaces and '+'. */
2487 char const *str
= input
;
2494 /* Try converting the number to one or two words. If it fails, use GMP or
2495 print an error message. The 2nd condition checks that the most
2496 significant bit of the two-word number is clear, in a typesize neutral
2498 strtol_error err
= strto2uintmax (&t1
, &t0
, str
);
2503 if (((t1
<< 1) >> 1) == t1
)
2505 devmsg ("[using single-precision arithmetic] ");
2506 print_factors_single (t1
, t0
);
2511 case LONGINT_OVERFLOW
:
2516 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2520 devmsg ("[using arbitrary-precision arithmetic] ");
2522 struct mp_factors factors
;
2524 mpz_init_set_str (t
, str
, 10);
2526 mpz_out_str (stdout
, 10, t
);
2528 mp_factor (t
, &factors
);
2530 for (idx_t j
= 0; j
< factors
.nfactors
; j
++)
2531 for (unsigned long int k
= 0; k
< factors
.e
[j
]; k
++)
2534 mpz_out_str (stdout
, 10, factors
.p
[j
]);
2535 if (print_exponents
&& factors
.e
[j
] > 1)
2537 printf ("^%lu", factors
.e
[j
]);
2542 mp_factor_clear (&factors
);
2552 if (status
!= EXIT_SUCCESS
)
2557 Usage: %s [OPTION] [NUMBER]...\n\
2561 Print the prime factors of each specified integer NUMBER. If none\n\
2562 are specified on the command line, read them from standard input.\n\
2566 -h, --exponents print repeated factors in form p^e unless e is 1\n\
2568 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2569 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2570 emit_ancillary_info (PROGRAM_NAME
);
2579 token_buffer tokenbuffer
;
2581 init_tokenbuffer (&tokenbuffer
);
2585 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2587 if (token_length
== (size_t) -1)
2590 error (EXIT_FAILURE
, errno
, _("error reading input"));
2594 ok
&= print_factors (tokenbuffer
.buffer
);
2596 free (tokenbuffer
.buffer
);
2602 main (int argc
, char **argv
)
2604 initialize_main (&argc
, &argv
);
2605 set_program_name (argv
[0]);
2606 setlocale (LC_ALL
, "");
2607 bindtextdomain (PACKAGE
, LOCALEDIR
);
2608 textdomain (PACKAGE
);
2611 atexit (close_stdout
);
2612 atexit (lbuf_flush
);
2615 while ((c
= getopt_long (argc
, argv
, "h", long_options
, nullptr)) != -1)
2619 case 'h': /* NetBSD used -h for this functionality first. */
2620 print_exponents
= true;
2623 case DEV_DEBUG_OPTION
:
2627 case_GETOPT_HELP_CHAR
;
2629 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2632 usage (EXIT_FAILURE
);
2637 memset (q_freq
, 0, sizeof (q_freq
));
2646 for (int i
= optind
; i
< argc
; i
++)
2647 if (! print_factors (argv
[i
]))
2655 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2656 for (int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2658 double f
= (double) q_freq
[i
] / q_freq
[0];
2660 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2661 100.0 * f
, 100.0 * acc_f
);
2666 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;