1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
40 (1) Perform trial division using a small primes table, but without hardware
41 division since the primes table store inverses modulo the word base.
42 (The GMP variant of this code doesn't make use of the precomputed
43 inverses, but instead relies on GMP for fast divisibility testing.)
44 (2) Check the nature of any non-factored part using Miller-Rabin for
45 detecting composites, and Lucas for detecting primes.
46 (3) Factor any remaining composite part using the Pollard-Brent rho
47 algorithm or if USE_SQUFOF is defined to 1, try that first.
48 Status of found factors are checked again using Miller-Rabin and Lucas.
50 We prefer using Hensel norm in the divisions, not the more familiar
51 Euclidean norm, since the former leads to much faster code. In the
52 Pollard-Brent rho code and the prime testing code, we use Montgomery's
53 trick of multiplying all n-residues by the word base, allowing cheap Hensel
56 The GMP code uses an algorithm that can be considerably slower;
57 for example, on a circa-2017 Intel Xeon Silver 4116, factoring
58 2^{127}-3 takes about 50 ms with the two-word algorithm but would
59 take about 750 ms with the GMP code.
63 * Use modular inverses also for exact division in the Lucas code, and
64 elsewhere. A problem is to locate the inverses not from an index, but
65 from a prime. We might instead compute the inverse on-the-fly.
67 * Tune trial division table size (not forgetting that this is a standalone
68 program where the table will be read from secondary storage for
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidean)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
112 #include "full-write.h"
114 #include "readtokens.h"
117 /* The official name of this program (e.g., no 'g' prefix). */
118 #define PROGRAM_NAME "factor"
121 proper_name ("Paul Rubin"), \
122 proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
123 proper_name_lite ("Niels Moller", "Niels M\303\266ller")
125 /* Token delimiters when reading from a file. */
126 #define DELIM "\n\t "
128 #ifndef USE_LONGLONG_H
129 /* With the way we use longlong.h, it's only safe to use
130 when UWtype = UHWtype, as there were various cases
131 (as can be seen in the history for longlong.h) where
132 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
133 to avoid compile time or run time issues. */
134 # if LONG_MAX == INTMAX_MAX
135 # define USE_LONGLONG_H 1
141 /* Make definitions for longlong.h to make it do what it can do for us */
143 /* bitcount for uintmax_t */
144 # if UINTMAX_MAX == UINT32_MAX
145 # define W_TYPE_SIZE 32
146 # elif UINTMAX_MAX == UINT64_MAX
147 # define W_TYPE_SIZE 64
148 # elif UINTMAX_MAX == UINT128_MAX
149 # define W_TYPE_SIZE 128
152 # define UWtype uintmax_t
153 # define UHWtype unsigned long int
155 # if HAVE_ATTRIBUTE_MODE
156 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
157 typedef int SItype
__attribute__ ((mode (SI
)));
158 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
159 typedef int DItype
__attribute__ ((mode (DI
)));
160 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
162 typedef unsigned char UQItype
;
164 typedef unsigned long int USItype
;
165 # if HAVE_LONG_LONG_INT
166 typedef long long int DItype
;
167 typedef unsigned long long int UDItype
;
168 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
169 typedef long int DItype
;
170 typedef unsigned long int UDItype
;
173 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
174 # define ASSERT(x) /* FIXME make longlong.h really standalone */
175 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
176 # ifndef __GMP_GNUC_PREREQ
177 # define __GMP_GNUC_PREREQ(a,b) 1
180 /* longlong.h uses these macros only in certain system compiler combinations.
181 Ensure usage to pacify -Wunused-macros. */
182 #if defined ASSERT || defined UHWtype || defined __GMP_DECLSPEC
186 # define HAVE_HOST_CPU_FAMILY_powerpc 1
188 # include "longlong.h"
190 #else /* not USE_LONGLONG_H */
192 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
193 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
194 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
195 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
199 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
200 #define MAX_NFACTS 26
204 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
207 static struct option
const long_options
[] =
209 {"exponents", no_argument
, nullptr, 'h'},
210 {"-debug", no_argument
, nullptr, DEV_DEBUG_OPTION
},
211 {GETOPT_HELP_OPTION_DECL
},
212 {GETOPT_VERSION_OPTION_DECL
},
213 {nullptr, 0, nullptr, 0}
216 /* If true, use p^e output format. */
217 static bool print_exponents
;
219 /* This represents an unsigned integer twice as wide as uintmax_t. */
220 typedef struct { uintmax_t uu
[2]; } uuint
;
222 /* Accessors and constructors for the type. Pprograms should not
223 access the type's internals directly, in case some future version
224 replaces the type with unsigned __int128 or whatever. */
225 static uintmax_t lo (uuint u
) { return u
.uu
[0]; }
226 static uintmax_t hi (uuint u
) { return u
.uu
[1]; }
227 static void hiset (uuint
*u
, uintmax_t hi
) { u
->uu
[1] = hi
; }
229 uuset (uintmax_t *phi
, uintmax_t *plo
, uuint uu
)
235 make_uuint (uintmax_t hi
, uintmax_t lo
)
237 return (uuint
) {{lo
, hi
}};
242 uuint plarge
; /* Can have a single large factor */
243 uintmax_t p
[MAX_NFACTS
];
244 unsigned char e
[MAX_NFACTS
];
245 unsigned char nfactors
;
251 unsigned long int *e
;
256 static void factor (uintmax_t, uintmax_t, struct factors
*);
259 # define umul_ppmm(w1, w0, u, v) \
261 uintmax_t __x0, __x1, __x2, __x3; \
262 unsigned long int __ul, __vl, __uh, __vh; \
263 uintmax_t __u = (u), __v = (v); \
265 __ul = __ll_lowpart (__u); \
266 __uh = __ll_highpart (__u); \
267 __vl = __ll_lowpart (__v); \
268 __vh = __ll_highpart (__v); \
270 __x0 = (uintmax_t) __ul * __vl; \
271 __x1 = (uintmax_t) __ul * __vh; \
272 __x2 = (uintmax_t) __uh * __vl; \
273 __x3 = (uintmax_t) __uh * __vh; \
275 __x1 += __ll_highpart (__x0);/* This can't give carry. */ \
276 __x1 += __x2; /* But this indeed can. */ \
277 if (__x1 < __x2) /* Did we get it? */ \
278 __x3 += __ll_B; /* Yes, add it in the proper pos. */ \
280 (w1) = __x3 + __ll_highpart (__x1); \
281 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
285 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
286 /* Define our own, not needing normalization. This function is
287 currently not performance critical, so keep it simple. Similar to
288 the mod macro below. */
290 # define udiv_qrnnd(q, r, n1, n0, d) \
292 uintmax_t __d1, __d0, __q, __r1, __r0; \
294 __d1 = (d); __d0 = 0; \
295 __r1 = (n1); __r0 = (n0); \
296 affirm (__r1 < __d1); \
298 for (int __i = W_TYPE_SIZE; __i > 0; __i--) \
300 rsh2 (__d1, __d0, __d1, __d0, 1); \
302 if (ge2 (__r1, __r0, __d1, __d0)) \
305 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
313 #if !defined add_ssaaaa
314 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
317 _add_x = (al) + (bl); \
318 (sh) = (ah) + (bh) + (_add_x < (al)); \
323 /* Set (rh,rl) = (ah,al) >> cnt, where 0 < cnt < W_TYPE_SIZE. */
324 #define rsh2(rh, rl, ah, al, cnt) \
326 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
327 (rh) = (ah) >> (cnt); \
330 /* Set (rh,rl) = (ah,al) << cnt, where 0 < cnt < W_TYPE_SIZE. */
331 #define lsh2(rh, rl, ah, al, cnt) \
333 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
334 (rl) = (al) << (cnt); \
337 #define ge2(ah, al, bh, bl) \
338 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
340 #define gt2(ah, al, bh, bl) \
341 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
344 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
348 (rl) = (al) - (bl); \
349 (rh) = (ah) - (bh) - _cy; \
353 /* Requires that a < n and b <= n */
354 #define submod(r,a,b,n) \
356 uintmax_t _t = - (uintmax_t) (a < b); \
357 (r) = ((n) & _t) + (a) - (b); \
360 #define addmod(r,a,b,n) \
361 submod ((r), (a), ((n) - (b)), (n))
363 /* Modular two-word addition and subtraction. For performance reasons, the
364 most significant bit of n1 must be clear. The destination variables must be
365 distinct from the mod operand. */
366 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
368 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
369 if (ge2 ((r1), (r0), (n1), (n0))) \
370 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
372 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
374 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
375 if ((intmax_t) (r1) < 0) \
376 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
379 #define HIGHBIT_TO_MASK(x) \
380 (((intmax_t)-1 >> 1) < 0 \
381 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
382 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
383 ? UINTMAX_MAX : (uintmax_t) 0))
385 /* Return r = a mod d, where a = <a1,a0>, d = <d1,d0>.
386 Requires that d1 != 0. */
387 ATTRIBUTE_PURE
static uuint
388 mod2 (uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
394 int cntd
= stdc_leading_zeros (d1
);
395 int cnta
= stdc_leading_zeros (a1
);
396 int cnt
= cntd
- cnta
;
399 lsh2 (d1
, d0
, d1
, d0
, cnt
);
400 for (int i
= 0; i
< cnt
; i
++)
402 if (ge2 (a1
, a0
, d1
, d0
))
403 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
404 rsh2 (d1
, d0
, d1
, d0
, 1);
409 return make_uuint (a1
, a0
);
414 gcd_odd (uintmax_t a
, uintmax_t b
)
425 /* Take out least significant one bit, to make room for sign */
434 a
>>= stdc_trailing_zeros (a
);
441 bgta
= HIGHBIT_TO_MASK (t
);
443 /* b <-- min (a, b) */
447 a
= (t
^ bgta
) - bgta
;
452 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
464 int ctz
= stdc_trailing_zeros (a0
);
466 rsh2 (a1
, a0
, a1
, a0
, ctz
);
473 return gcd_odd (b0
, a0
);
476 if (gt2 (a1
, a0
, b1
, b0
))
478 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
482 ctz
= stdc_trailing_zeros (a0
);
484 rsh2 (a1
, a0
, a1
, a0
, ctz
);
486 else if (gt2 (b1
, b0
, a1
, a0
))
488 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
492 ctz
= stdc_trailing_zeros (b0
);
494 rsh2 (b1
, b0
, b1
, b0
, ctz
);
505 factor_insert_multiplicity (struct factors
*factors
,
506 uintmax_t prime
, int m
)
508 int nfactors
= factors
->nfactors
;
509 uintmax_t *p
= factors
->p
;
510 unsigned char *e
= factors
->e
;
512 /* Locate position for insert new or increment e. */
514 for (i
= nfactors
- 1; i
>= 0; i
--)
520 if (i
< 0 || p
[i
] != prime
)
522 for (int j
= nfactors
- 1; j
> i
; j
--)
529 factors
->nfactors
= nfactors
+ 1;
537 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
540 factor_insert_large (struct factors
*factors
,
541 uintmax_t p1
, uintmax_t p0
)
545 affirm (hi (factors
->plarge
) == 0);
546 factors
->plarge
= make_uuint (p1
, p0
);
549 factor_insert (factors
, p0
);
556 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
557 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
560 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
564 va_start (ap
, mpz_single_init
);
567 while ((mpz
= va_arg (ap
, mpz_t
*)))
568 mpz_single_init (*mpz
);
574 static void mp_factor (mpz_t
, struct mp_factors
*);
577 mp_factor_init (struct mp_factors
*factors
)
579 factors
->p
= nullptr;
580 factors
->e
= nullptr;
581 factors
->nfactors
= 0;
586 mp_factor_clear (struct mp_factors
*factors
)
588 for (idx_t i
= 0; i
< factors
->nfactors
; i
++)
589 mpz_clear (factors
->p
[i
]);
596 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
598 idx_t nfactors
= factors
->nfactors
;
599 mpz_t
*p
= factors
->p
;
600 unsigned long int *e
= factors
->e
;
603 /* Locate position for insert new or increment e. */
604 for (i
= nfactors
- 1; i
>= 0; i
--)
606 if (mpz_cmp (p
[i
], prime
) <= 0)
610 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
612 if (factors
->nfactors
== factors
->nalloc
)
614 p
= xpalloc (p
, &factors
->nalloc
, 1, -1, sizeof *p
);
615 e
= xireallocarray (e
, factors
->nalloc
, sizeof *e
);
618 mpz_init (p
[nfactors
]);
619 for (ptrdiff_t j
= nfactors
- 1; j
> i
; j
--)
621 mpz_set (p
[j
+ 1], p
[j
]);
624 mpz_set (p
[i
+ 1], prime
);
629 factors
->nfactors
= nfactors
+ 1;
638 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
642 mpz_init_set_ui (pz
, prime
);
643 mp_factor_insert (factors
, pz
);
648 /* Number of bits in an uintmax_t. */
649 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
651 /* Verify that uintmax_t does not have holes in its representation. */
652 static_assert (UINTMAX_MAX
>> (W
- 1) == 1);
654 #define P(a,b,c,d) a,
655 static const unsigned char primes_diff
[] = {
657 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
661 #define PRIMES_PTAB_ENTRIES \
662 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
664 #define P(a,b,c,d) b,
665 static const unsigned char primes_diff8
[] = {
667 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
676 #define P(a,b,c,d) {c,d},
677 static const struct primes_dtab primes_dtab
[] = {
679 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
683 /* Verify that uintmax_t is not wider than
684 the integers used to generate primes.h. */
685 static_assert (W
<= WIDE_UINT_BITS
);
687 /* debugging for developers. Enables devmsg().
688 This flag is used only in the GMP code. */
689 static bool dev_debug
= false;
691 /* Prove primality or run probabilistic tests. */
692 static bool flag_prove_primality
= PROVE_PRIMALITY
;
694 /* Number of Miller-Rabin tests to run when not proving primality. */
698 factor_insert_refind (struct factors
*factors
, uintmax_t p
, int i
, int off
)
700 for (int j
= 0; j
< off
; j
++)
701 p
+= primes_diff
[i
+ j
];
702 factor_insert (factors
, p
);
705 /* Trial division with odd primes uses the following trick.
707 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
708 consider the case t < B (this is the second loop below).
710 From our tables we get
712 binv = p^{-1} (mod B)
713 lim = floor ((B-1) / p).
715 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
716 (and all quotients in this range occur for some t).
718 Then t = q * p is true also (mod B), and p is invertible we get
720 q = t * binv (mod B).
722 Next, assume that t is *not* divisible by p. Since multiplication
723 by binv (mod B) is a one-to-one mapping,
725 t * binv (mod B) > lim,
727 because all the smaller values are already taken.
729 This can be summed up by saying that the function
731 q(t) = binv * t (mod B)
733 is a permutation of the range 0 <= t < B, with the curious property
734 that it maps the multiples of p onto the range 0 <= q <= lim, in
735 order, and the non-multiples of p onto the range lim < q < B.
739 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
740 struct factors
*factors
)
749 cnt
= stdc_trailing_zeros (t1
);
756 cnt
= stdc_trailing_zeros (t0
);
757 rsh2 (t1
, t0
, t1
, t0
, cnt
);
760 factor_insert_multiplicity (factors
, 2, cnt
);
765 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
769 uintmax_t q1
, q0
, hi
;
770 MAYBE_UNUSED
uintmax_t lo
;
772 q0
= t0
* primes_dtab
[i
].binv
;
773 umul_ppmm (hi
, lo
, q0
, p
);
777 q1
= hi
* primes_dtab
[i
].binv
;
778 if (LIKELY (q1
> primes_dtab
[i
].lim
))
781 factor_insert (factors
, p
);
783 p
+= primes_diff
[i
+ 1];
788 #define DIVBLOCK(I) \
792 q = t0 * pd[I].binv; \
793 if (LIKELY (q > pd[I].lim)) \
796 factor_insert_refind (factors, p, i + 1, I); \
800 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
803 const struct primes_dtab
*pd
= &primes_dtab
[i
];
813 p
+= primes_diff8
[i
];
822 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
827 devmsg ("[trial division] ");
831 p
= mpz_scan1 (t
, 0);
832 mpz_fdiv_q_2exp (t
, t
, p
);
835 mp_factor_insert_ui (factors
, 2);
839 unsigned long int d
= 3;
840 for (idx_t i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
842 if (! mpz_divisible_ui_p (t
, d
))
844 d
+= primes_diff
[i
++];
845 if (mpz_cmp_ui (t
, d
* d
) < 0)
850 mpz_tdiv_q_ui (t
, t
, d
);
851 mp_factor_insert_ui (factors
, d
);
858 /* Entry i contains (2i+1)^(-1) mod 2^8. */
859 static const unsigned char binvert_table
[128] =
861 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
862 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
863 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
864 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
865 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
866 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
867 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
868 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
869 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
870 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
871 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
872 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
873 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
874 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
875 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
876 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
879 /* Compute n^(-1) mod B, using a Newton iteration. */
880 #define binv(inv,n) \
882 uintmax_t __n = (n); \
885 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
886 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
887 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
888 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
890 if (W_TYPE_SIZE > 64) \
892 int __invbits = 64; \
894 __inv = 2 * __inv - __inv * __inv * __n; \
896 } while (__invbits < W_TYPE_SIZE); \
902 /* q = u / d, assuming d|u. */
903 #define divexact_21(q1, q0, u1, u0, d) \
905 uintmax_t _di, _q0; \
911 MAYBE_UNUSED intmax_t _p0; \
912 umul_ppmm (_p1, _p0, _q0, d); \
913 (q1) = ((u1) - _p1) * _di; \
924 #define redcify(r_prim, r, n) \
926 MAYBE_UNUSED uintmax_t _redcify_q; \
927 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
930 /* x B^2 (mod n). Requires x > 0, n1 < B/2. */
931 #define redcify2(r1, r0, x, n1, n0) \
933 uintmax_t _r1, _r0, _i; \
936 _r1 = (x); _r0 = 0; \
941 _r1 = 0; _r0 = (x); \
942 _i = 2 * W_TYPE_SIZE; \
946 lsh2 (_r1, _r0, _r1, _r0, 1); \
947 if (ge2 (_r1, _r0, (n1), (n0))) \
948 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
954 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
955 Both a and b must be in redc form, the result will be in redc form too. */
956 static inline uintmax_t
957 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
959 uintmax_t rh
, rl
, q
, th
, xh
;
960 MAYBE_UNUSED
uintmax_t tl
;
962 umul_ppmm (rh
, rl
, a
, b
);
964 umul_ppmm (th
, tl
, q
, m
);
972 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
973 Both a and b must be in redc form, the result will be in redc form too.
974 For performance reasons, the most significant bit of m must be clear. */
976 mulredc2 (uintmax_t *r1p
,
977 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
978 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
980 uintmax_t r1
, r0
, q
, p1
, t1
, t0
, s1
, s0
;
981 MAYBE_UNUSED
uintmax_t p0
;
983 affirm ((a1
>> (W_TYPE_SIZE
- 1)) == 0);
984 affirm ((b1
>> (W_TYPE_SIZE
- 1)) == 0);
985 affirm ((m1
>> (W_TYPE_SIZE
- 1)) == 0);
987 /* First compute a0 * <b1, b0> B^{-1}
1000 umul_ppmm (t1
, t0
, a0
, b0
);
1001 umul_ppmm (r1
, r0
, a0
, b1
);
1003 umul_ppmm (p1
, p0
, q
, m0
);
1004 umul_ppmm (s1
, s0
, q
, m1
);
1005 r0
+= (t0
!= 0); /* Carry */
1006 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1007 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1008 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1010 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1025 umul_ppmm (t1
, t0
, a1
, b0
);
1026 umul_ppmm (s1
, s0
, a1
, b1
);
1027 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1029 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1030 umul_ppmm (p1
, p0
, q
, m0
);
1031 umul_ppmm (s1
, s0
, q
, m1
);
1032 r0
+= (t0
!= 0); /* Carry */
1033 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1034 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1035 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1037 if (ge2 (r1
, r0
, m1
, m0
))
1038 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1046 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1055 b
= mulredc (b
, b
, n
, ni
);
1059 y
= mulredc (y
, b
, n
, ni
);
1066 powm2 (uintmax_t *r1m
,
1067 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1068 uintmax_t ni
, const uintmax_t *one
)
1070 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1082 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1086 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1089 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1092 for (e
= ep
[1]; e
> 0; e
>>= 1)
1096 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1099 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1108 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1109 int k
, uintmax_t one
)
1111 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1113 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1115 if (y
== one
|| y
== nm1
)
1118 for (int i
= 1; i
< k
; i
++)
1120 y
= mulredc (y
, y
, n
, ni
);
1130 ATTRIBUTE_PURE
static bool
1131 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1132 const uintmax_t *qp
, int k
, const uintmax_t *one
)
1134 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1136 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1139 if (y0
== one
[0] && y1
== one
[1])
1142 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1144 if (y0
== nm1_0
&& y1
== nm1_1
)
1147 for (int i
= 1; i
< k
; i
++)
1149 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1152 if (y0
== nm1_0
&& y1
== nm1_1
)
1154 if (y0
== one
[0] && y1
== one
[1])
1161 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1162 mpz_srcptr q
, mp_bitcnt_t k
)
1164 mpz_powm (y
, x
, q
, n
);
1166 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1169 for (mp_bitcnt_t i
= 1; i
< k
; i
++)
1171 mpz_powm_ui (y
, y
, 2, n
);
1172 if (mpz_cmp (y
, nm1
) == 0)
1174 if (mpz_cmp_ui (y
, 1) == 0)
1180 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1181 have been observed. The average seem to be about 2. */
1182 static bool ATTRIBUTE_PURE
1183 prime_p (uintmax_t n
)
1186 uintmax_t a_prim
, one
, ni
;
1187 struct factors factors
;
1192 /* We have already cast out small primes. */
1193 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1196 /* Precomputation for Miller-Rabin. */
1197 int k
= stdc_trailing_zeros (n
- 1);
1198 uintmax_t q
= (n
- 1) >> k
;
1201 binv (ni
, n
); /* ni <- 1/n mod B */
1202 redcify (one
, 1, n
);
1203 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1205 /* Perform a Miller-Rabin test, finds most composites quickly. */
1206 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1209 if (flag_prove_primality
)
1211 /* Factor n-1 for Lucas. */
1212 factor (0, n
- 1, &factors
);
1215 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1216 number composite. */
1217 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1219 if (flag_prove_primality
)
1222 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1225 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1230 /* After enough Miller-Rabin runs, be content. */
1231 is_prime
= (r
== MR_REPS
- 1);
1237 a
+= primes_diff
[r
]; /* Establish new base. */
1239 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1240 on most processors, since it avoids udiv_qrnnd. If we go down the
1241 udiv_qrnnd_preinv path, this code should be replaced. */
1244 umul_ppmm (s1
, s0
, one
, a
);
1245 if (LIKELY (s1
== 0))
1249 MAYBE_UNUSED
uintmax_t dummy
;
1250 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1254 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1258 affirm (!"Lucas prime test failure. This should not happen");
1261 static bool ATTRIBUTE_PURE
1262 prime2_p (uintmax_t n1
, uintmax_t n0
)
1264 uintmax_t q
[2], nm1
[2];
1265 uintmax_t a_prim
[2];
1270 struct factors factors
;
1273 return prime_p (n0
);
1275 nm1
[1] = n1
- (n0
== 0);
1280 k
= stdc_trailing_zeros (nm1
[1]);
1288 k
= stdc_trailing_zeros (nm1
[0]);
1289 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1294 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1295 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1297 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1301 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1304 if (flag_prove_primality
)
1306 /* Factor n-1 for Lucas. */
1307 factor (nm1
[1], nm1
[0], &factors
);
1310 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1311 number composite. */
1312 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1315 uintmax_t e
[2], y
[2];
1317 if (flag_prove_primality
)
1320 if (hi (factors
.plarge
))
1323 binv (pi
, lo (factors
.plarge
));
1326 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1327 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1329 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1331 /* FIXME: We always have the factor 2. Do we really need to
1332 handle it here? We have done the same powering as part
1334 if (factors
.p
[i
] == 2)
1335 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1337 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1338 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1339 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1344 /* After enough Miller-Rabin runs, be content. */
1345 is_prime
= (r
== MR_REPS
- 1);
1351 a
+= primes_diff
[r
]; /* Establish new base. */
1352 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1354 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1358 affirm (!"Lucas prime test failure. This should not happen");
1362 mp_prime_p (mpz_t n
)
1365 mpz_t q
, a
, nm1
, tmp
;
1366 struct mp_factors factors
;
1368 if (mpz_cmp_ui (n
, 1) <= 0)
1371 /* We have already cast out small primes. */
1372 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1375 mpz_inits (q
, a
, nm1
, tmp
, nullptr);
1377 /* Precomputation for Miller-Rabin. */
1378 mpz_sub_ui (nm1
, n
, 1);
1380 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1381 mp_bitcnt_t k
= mpz_scan1 (nm1
, 0);
1382 mpz_tdiv_q_2exp (q
, nm1
, k
);
1386 /* Perform a Miller-Rabin test, finds most composites quickly. */
1387 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1393 if (flag_prove_primality
)
1395 /* Factor n-1 for Lucas. */
1397 mp_factor (tmp
, &factors
);
1400 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1401 number composite. */
1402 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1404 if (flag_prove_primality
)
1407 for (idx_t i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1409 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1410 mpz_powm (tmp
, a
, tmp
, n
);
1411 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1416 /* After enough Miller-Rabin runs, be content. */
1417 is_prime
= (r
== MR_REPS
- 1);
1423 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1425 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1432 affirm (!"Lucas prime test failure. This should not happen");
1435 if (flag_prove_primality
)
1436 mp_factor_clear (&factors
);
1438 mpz_clears (q
, a
, nm1
, tmp
, nullptr);
1444 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1445 struct factors
*factors
)
1447 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1449 unsigned long int k
= 1;
1450 unsigned long int l
= 1;
1453 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1460 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1466 x
= mulredc (x
, x
, n
, ni
);
1467 addmod (x
, x
, a
, n
);
1469 submod (t
, z
, x
, n
);
1470 P
= mulredc (P
, t
, n
, ni
);
1474 if (gcd_odd (P
, n
) != 1)
1484 for (unsigned long int i
= 0; i
< k
; i
++)
1486 x
= mulredc (x
, x
, n
, ni
);
1487 addmod (x
, x
, a
, n
);
1495 y
= mulredc (y
, y
, n
, ni
);
1496 addmod (y
, y
, a
, n
);
1498 submod (t
, z
, y
, n
);
1505 /* Found n itself as factor. Restart with different params. */
1506 factor_using_pollard_rho (n
, a
+ 1, factors
);
1513 factor_using_pollard_rho (g
, a
+ 1, factors
);
1515 factor_insert (factors
, g
);
1519 factor_insert (factors
, n
);
1530 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1531 struct factors
*factors
)
1533 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1535 unsigned long int k
= 1;
1536 unsigned long int l
= 1;
1538 redcify2 (P1
, P0
, 1, n1
, n0
);
1539 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1543 while (n1
!= 0 || n0
!= 1)
1551 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1553 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1555 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1556 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1561 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1562 if (g1
!= 0 || g0
!= 1)
1572 for (unsigned long int i
= 0; i
< k
; i
++)
1574 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1576 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1584 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1586 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1588 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1589 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1591 while (g1
== 0 && g0
== 1);
1595 /* The found factor is one word, and > 1. */
1596 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1599 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1601 factor_insert (factors
, g0
);
1605 /* The found factor is two words. This is highly unlikely, thus hard
1606 to trigger. Please be careful before you change this code! */
1609 if (n1
== g1
&& n0
== g0
)
1611 /* Found n itself as factor. Restart with different params. */
1612 factor_using_pollard_rho2 (n1
, n0
, a
+ 1, factors
);
1616 /* Compute n = n / g. Since the result will fit one word,
1617 we can compute the quotient modulo B, ignoring the high
1623 if (!prime2_p (g1
, g0
))
1624 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1626 factor_insert_large (factors
, g1
, g0
);
1633 factor_insert (factors
, n0
);
1637 factor_using_pollard_rho (n0
, a
, factors
);
1641 if (prime2_p (n1
, n0
))
1643 factor_insert_large (factors
, n1
, n0
);
1647 uuset (&x1
, &x0
, mod2 (x1
, x0
, n1
, n0
));
1648 uuset (&z1
, &z0
, mod2 (z1
, z0
, n1
, n0
));
1649 uuset (&y1
, &y0
, mod2 (y1
, y0
, n1
, n0
));
1654 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1655 struct mp_factors
*factors
)
1660 devmsg ("[pollard-rho (%lu)] ", a
);
1662 mpz_inits (t
, t2
, nullptr);
1663 mpz_init_set_si (y
, 2);
1664 mpz_init_set_si (x
, 2);
1665 mpz_init_set_si (z
, 2);
1666 mpz_init_set_ui (P
, 1);
1668 unsigned long long int k
= 1;
1669 unsigned long long int l
= 1;
1671 while (mpz_cmp_ui (n
, 1) != 0)
1679 mpz_add_ui (x
, x
, a
);
1688 if (mpz_cmp_ui (t
, 1) != 0)
1698 for (unsigned long long int i
= 0; i
< k
; i
++)
1702 mpz_add_ui (x
, x
, a
);
1712 mpz_add_ui (y
, y
, a
);
1717 while (mpz_cmp_ui (t
, 1) == 0);
1719 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1721 if (!mp_prime_p (t
))
1723 devmsg ("[composite factor--restarting pollard-rho] ");
1724 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1728 mp_factor_insert (factors
, t
);
1733 mp_factor_insert (factors
, n
);
1742 mpz_clears (P
, t2
, t
, z
, x
, y
, nullptr);
1746 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1747 algorithm is replaced, consider also returning the remainder. */
1755 int c
= stdc_leading_zeros (n
);
1757 /* Make x > sqrt(n). This will be invariant through the loop. */
1758 uintmax_t x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) >> 1);
1762 uintmax_t y
= (x
+ n
/ x
) / 2;
1772 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1774 /* Ensures the remainder fits in an uintmax_t. */
1775 affirm (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1780 int shift
= stdc_leading_zeros (nh
) & ~1;
1782 /* Make x > sqrt (n). */
1783 uintmax_t x
= isqrt ((nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1784 x
<<= (W_TYPE_SIZE
- shift
) >> 1;
1786 /* Do we need more than one iteration? */
1789 MAYBE_UNUSED
uintmax_t r
;
1791 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1797 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1798 affirm (gt2 (hi
, lo
, nh
, nl
));
1800 umul_ppmm (hi
, lo
, x
, x
);
1801 affirm (ge2 (nh
, nl
, hi
, lo
));
1802 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1812 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1813 # define MAGIC64 0x0202021202030213ULL
1814 # define MAGIC63 0x0402483012450293ULL
1815 # define MAGIC65 0x218a019866014613ULL
1816 # define MAGIC11 0x23b
1818 /* Return the square root if the input is a square, otherwise 0. */
1821 is_square (uintmax_t x
)
1823 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1824 computing the square root. */
1825 if (((MAGIC64
>> (x
& 63)) & 1)
1826 && ((MAGIC63
>> (x
% 63)) & 1)
1827 /* Both 0 and 64 are squares mod (65). */
1828 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1829 && ((MAGIC11
>> (x
% 11) & 1)))
1831 uintmax_t r
= isqrt (x
);
1838 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1839 static short const invtab
[0x81] =
1842 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1843 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1844 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1845 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1846 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1847 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1848 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1849 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1850 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1851 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1852 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1853 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1854 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1855 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1856 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1857 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1860 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1861 that q < 0x40; here it instead uses a table of (Euclidean) inverses. */
1862 # define div_smallq(q, r, u, d) \
1864 if ((u) / 0x40 < (d)) \
1866 uintmax_t _dinv, _mask, _q, _r; \
1867 int _cnt = stdc_leading_zeros (d); \
1869 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1871 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1872 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1876 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1877 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1881 _mask = -(uintmax_t) (_r >= (d)); \
1882 (r) = _r - (_mask & (d)); \
1884 affirm ((q) * (d) + (r) == u); \
1888 uintmax_t _q = (u) / (d); \
1889 (r) = (u) - _q * (d); \
1894 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1895 = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1898 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1899 representing G_0 = (-55, 2 * 4652, 8653).
1901 In the notation of the paper:
1903 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1907 t_0 = floor([q_0 + R_0] / S0) = 1
1908 R_1 = t_0 * S_0 - R_0 = 4001
1909 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1912 /* Multipliers, in order of efficiency:
1913 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1914 0.7317 3*5*7 = 105 = 1
1915 0.7820 3*5*11 = 165 = 1
1917 0.8101 3*7*11 = 231 = 3
1919 0.8284 5*7*11 = 385 = 1
1921 0.8716 3*11 = 33 = 1
1923 0.8913 5*11 = 55 = 3
1925 0.9233 7*11 = 77 = 1
1929 # define QUEUE_SIZE 50
1933 # define Q_FREQ_SIZE 50
1934 /* Element 0 keeps the total */
1935 static int q_freq
[Q_FREQ_SIZE
+ 1];
1939 /* Return true on success. Expected to fail only for numbers
1940 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1942 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1944 /* Uses algorithm and notation from
1946 SQUARE FORM FACTORIZATION
1947 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1949 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1952 static short const multipliers_1
[] =
1954 105, 165, 21, 385, 33, 5, 77, 1, 0
1956 static short const multipliers_3
[] =
1958 1155, 15, 231, 35, 3, 55, 7, 11, 0
1961 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1963 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1966 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1968 if (n0
== sqrt_n
* sqrt_n
)
1972 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
1977 if (prime_p (sqrt_n
))
1978 factor_insert_multiplicity (factors
, sqrt_n
, 2);
1984 if (!factor_using_squfof (0, sqrt_n
, &f
))
1986 /* Try pollard rho instead */
1987 factor_using_pollard_rho (sqrt_n
, 1, &f
);
1989 /* Duplicate the new factors */
1990 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
1991 factor_insert_multiplicity (factors
, f
.p
[i
], 2 * f
.e
[i
]);
1997 /* Select multipliers so we always get n * mu = 3 (mod 4) */
1998 for (short const *m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2001 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2003 unsigned int mu
= *m
;
2006 affirm (mu
* n0
% 4 == 3);
2008 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2009 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2010 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2013 However, this seems insufficient: With n = 37243139 and mu =
2014 105, we get a trivial factor, from the square 38809 = 197^2,
2015 without any corresponding Q earlier in the iteration.
2017 Requiring 64 mu^3 < n seems sufficient. */
2020 if ((uintmax_t) mu
* mu
* mu
>= n0
/ 64)
2025 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2028 umul_ppmm (Dh
, Dl
, n0
, mu
);
2031 affirm (Dl
% 4 != 1);
2032 affirm (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2034 S
= isqrt2 (Dh
, Dl
);
2039 /* Square root remainder fits in one word, so ignore high part. */
2041 /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))? */
2046 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2049 for (i
= 0; i
<= B
; i
++)
2051 uintmax_t q
, P1
, t
, rem
;
2053 div_smallq (q
, rem
, S
+ P
, Q
);
2054 P1
= S
- rem
; /* P1 = q*Q - P */
2056 affirm (q
> 0 && Q
> 0);
2060 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2070 g
/= gcd_odd (g
, mu
);
2074 if (qpos
>= QUEUE_SIZE
)
2075 error (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2077 queue
[qpos
].P
= P
% g
;
2082 /* I think the difference can be either sign, but mod
2083 2^W_TYPE_SIZE arithmetic should be fine. */
2084 t
= Q1
+ q
* (P
- P1
);
2091 uintmax_t r
= is_square (Q
);
2094 for (int j
= 0; j
< qpos
; j
++)
2096 if (queue
[j
].Q
== r
)
2099 /* Traversed entire cycle. */
2100 goto next_multiplier
;
2102 /* Need the absolute value for divisibility test. */
2103 if (P
>= queue
[j
].P
)
2109 /* Delete entries up to and including entry
2110 j, which matched. */
2111 memmove (queue
, queue
+ j
+ 1,
2112 (qpos
- j
- 1) * sizeof (queue
[0]));
2119 /* We have found a square form, which should give a
2122 affirm (S
>= P
); /* What signs are possible? */
2123 P
+= r
* ((S
- P
) / r
);
2125 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2126 for the case D = 2N. */
2127 /* Compute Q = (D - P*P) / Q1, but we need double
2130 umul_ppmm (hi
, lo
, P
, P
);
2131 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2132 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2137 /* Note: There appears to by a typo in the paper,
2138 Step 4a in the algorithm description says q <--
2139 floor([S+P]/\hat Q), but looking at the equations
2140 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2141 (In this code, \hat Q is Q1). */
2142 div_smallq (q
, rem
, S
+ P
, Q
);
2143 P1
= S
- rem
; /* P1 = q*Q - P */
2147 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2151 t
= Q1
+ q
* (P
- P1
);
2159 Q
/= gcd_odd (Q
, mu
);
2161 affirm (Q
> 1 && (n1
|| Q
< n0
));
2164 factor_insert (factors
, Q
);
2165 else if (!factor_using_squfof (0, Q
, factors
))
2166 factor_using_pollard_rho (Q
, 2, factors
);
2168 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2170 if (prime2_p (n1
, n0
))
2171 factor_insert_large (factors
, n1
, n0
);
2174 if (!factor_using_squfof (n1
, n0
, factors
))
2177 factor_using_pollard_rho (n0
, 1, factors
);
2179 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2194 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2195 results in FACTORS. */
2197 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2199 factors
->nfactors
= 0;
2200 hiset (&factors
->plarge
, 0);
2202 if (t1
== 0 && t0
< 2)
2205 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2207 if (t1
== 0 && t0
< 2)
2210 if (prime2_p (t1
, t0
))
2211 factor_insert_large (factors
, t1
, t0
);
2215 if (factor_using_squfof (t1
, t0
, factors
))
2220 factor_using_pollard_rho (t0
, 1, factors
);
2222 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2226 /* Use Pollard-rho to compute the prime factors of
2227 arbitrary-precision T, and put the results in FACTORS. */
2229 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2231 mp_factor_init (factors
);
2233 if (mpz_sgn (t
) != 0)
2235 mp_factor_using_division (t
, factors
);
2237 if (mpz_cmp_ui (t
, 1) != 0)
2239 devmsg ("[is number prime?] ");
2241 mp_factor_insert (factors
, t
);
2243 mp_factor_using_pollard_rho (t
, 1, factors
);
2249 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, char const *s
)
2252 uintmax_t hi
= 0, lo
= 0;
2254 strtol_error err
= LONGINT_INVALID
;
2256 /* Initial scan for invalid digits. */
2260 unsigned char c
= *p
++;
2264 if (UNLIKELY (!ISDIGIT (c
)))
2266 err
= LONGINT_INVALID
;
2270 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2273 while (err
== LONGINT_OK
)
2275 unsigned char c
= *s
++;
2281 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2283 err
= LONGINT_OVERFLOW
;
2288 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2289 lo_carry
+= 10 * lo
< 2 * lo
;
2296 if (UNLIKELY (hi
< lo_carry
))
2298 err
= LONGINT_OVERFLOW
;
2309 /* Structure and routines for buffering and outputting full lines,
2310 to support parallel operation efficiently. */
2317 /* 512 is chosen to give good performance,
2318 and also is the max guaranteed size that
2319 consumers can read atomically through pipes.
2320 Also it's big enough to cater for max line length
2321 even with 128 bit uintmax_t. */
2322 #define FACTOR_PIPE_BUF 512
2330 /* Double to ensure enough space for
2331 previous numbers + next number. */
2332 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2333 lbuf
.end
= lbuf
.buf
;
2336 /* Write complete LBUF to standard output. */
2340 size_t size
= lbuf
.end
- lbuf
.buf
;
2341 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2343 lbuf
.end
= lbuf
.buf
;
2346 /* Add a character C to LBUF and if it's a newline
2347 and enough bytes are already buffered,
2348 then write atomically to standard output. */
2356 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2358 /* Provide immediate output for interactive use. */
2359 static int line_buffered
= -1;
2360 if (line_buffered
== -1)
2361 line_buffered
= isatty (STDIN_FILENO
) || isatty (STDOUT_FILENO
);
2364 else if (buffered
>= FACTOR_PIPE_BUF
)
2366 /* Write output in <= PIPE_BUF chunks
2367 so consumers can read atomically. */
2368 char const *tend
= lbuf
.end
;
2370 /* Since a umaxint_t's factors must fit in 512
2371 we're guaranteed to find a newline here. */
2372 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2373 while (*--tlend
!= '\n');
2379 /* Buffer the remainder. */
2380 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2381 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2386 /* Buffer an int to the internal LBUF. */
2388 lbuf_putint (uintmax_t i
, size_t min_width
)
2390 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2391 char const *umaxstr
= umaxtostr (i
, buf
);
2392 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2395 for (; z
< min_width
; z
++)
2398 memcpy (lbuf
.end
, umaxstr
, width
);
2403 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2408 lbuf_putint (t0
, 0);
2411 /* Use very plain code here since it seems hard to write fast code
2412 without assuming a specific word size. */
2413 q
= t1
/ 1000000000;
2414 r
= t1
% 1000000000;
2415 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2416 print_uintmaxes (q
, t0
);
2421 /* Single-precision factoring */
2423 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2425 struct factors factors
;
2427 print_uintmaxes (t1
, t0
);
2430 factor (t1
, t0
, &factors
);
2432 for (int j
= 0; j
< factors
.nfactors
; j
++)
2433 for (int k
= 0; k
< factors
.e
[j
]; k
++)
2436 print_uintmaxes (0, factors
.p
[j
]);
2437 if (print_exponents
&& factors
.e
[j
] > 1)
2440 lbuf_putint (factors
.e
[j
], 0);
2445 if (hi (factors
.plarge
))
2448 print_uintmaxes (hi (factors
.plarge
), lo (factors
.plarge
));
2454 /* Emit the factors of the indicated number. If we have the option of using
2455 either algorithm, we select on the basis of the length of the number.
2456 For longer numbers, we prefer the MP algorithm even if the native algorithm
2457 has enough digits, because the algorithm is better. The turnover point
2458 depends on the value. */
2460 print_factors (char const *input
)
2462 /* Skip initial spaces and '+'. */
2463 char const *str
= input
;
2470 /* Try converting the number to one or two words. If it fails, use GMP or
2471 print an error message. The 2nd condition checks that the most
2472 significant bit of the two-word number is clear, in a typesize neutral
2474 strtol_error err
= strto2uintmax (&t1
, &t0
, str
);
2479 if (((t1
<< 1) >> 1) == t1
)
2481 devmsg ("[using single-precision arithmetic] ");
2482 print_factors_single (t1
, t0
);
2487 case LONGINT_OVERFLOW
:
2492 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2496 devmsg ("[using arbitrary-precision arithmetic] ");
2498 struct mp_factors factors
;
2500 mpz_init_set_str (t
, str
, 10);
2502 mpz_out_str (stdout
, 10, t
);
2504 mp_factor (t
, &factors
);
2506 for (idx_t j
= 0; j
< factors
.nfactors
; j
++)
2507 for (unsigned long int k
= 0; k
< factors
.e
[j
]; k
++)
2510 mpz_out_str (stdout
, 10, factors
.p
[j
]);
2511 if (print_exponents
&& factors
.e
[j
] > 1)
2513 printf ("^%lu", factors
.e
[j
]);
2518 mp_factor_clear (&factors
);
2528 if (status
!= EXIT_SUCCESS
)
2533 Usage: %s [OPTION] [NUMBER]...\n\
2537 Print the prime factors of each specified integer NUMBER. If none\n\
2538 are specified on the command line, read them from standard input.\n\
2542 -h, --exponents print repeated factors in form p^e unless e is 1\n\
2544 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2545 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2546 emit_ancillary_info (PROGRAM_NAME
);
2555 token_buffer tokenbuffer
;
2557 init_tokenbuffer (&tokenbuffer
);
2561 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2563 if (token_length
== (size_t) -1)
2566 error (EXIT_FAILURE
, errno
, _("error reading input"));
2570 ok
&= print_factors (tokenbuffer
.buffer
);
2572 free (tokenbuffer
.buffer
);
2578 main (int argc
, char **argv
)
2580 initialize_main (&argc
, &argv
);
2581 set_program_name (argv
[0]);
2582 setlocale (LC_ALL
, "");
2583 bindtextdomain (PACKAGE
, LOCALEDIR
);
2584 textdomain (PACKAGE
);
2587 atexit (close_stdout
);
2588 atexit (lbuf_flush
);
2591 while ((c
= getopt_long (argc
, argv
, "h", long_options
, nullptr)) != -1)
2595 case 'h': /* NetBSD used -h for this functionality first. */
2596 print_exponents
= true;
2599 case DEV_DEBUG_OPTION
:
2603 case_GETOPT_HELP_CHAR
;
2605 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2608 usage (EXIT_FAILURE
);
2613 memset (q_freq
, 0, sizeof (q_freq
));
2622 for (int i
= optind
; i
< argc
; i
++)
2623 if (! print_factors (argv
[i
]))
2631 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2632 for (int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2634 double f
= (double) q_freq
[i
] / q_freq
[0];
2636 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2637 100.0 * f
, 100.0 * acc_f
);
2642 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;