1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
38 Using GMP is optional. Define HAVE_GMP to make this code include GMP
39 factoring code. The GMP factoring code is based on GMP's demos/factorize.c
40 (last synced 2012-09-07). The GMP-based factoring code will stay in GMP
41 factoring code even if numbers get small enough for using the two-word
46 (1) Perform trial division using a small primes table, but without hardware
47 division since the primes table store inverses modulo the word base.
48 (The GMP variant of this code doesn't make use of the precomputed
49 inverses, but instead relies on GMP for fast divisibility testing.)
50 (2) Check the nature of any non-factored part using Miller-Rabin for
51 detecting composites, and Lucas for detecting primes.
52 (3) Factor any remaining composite part using the Pollard-Brent rho
53 algorithm or if USE_SQUFOF is defined to 1, try that first.
54 Status of found factors are checked again using Miller-Rabin and Lucas.
56 We prefer using Hensel norm in the divisions, not the more familiar
57 Euclidian norm, since the former leads to much faster code. In the
58 Pollard-Brent rho code and the prime testing code, we use Montgomery's
59 trick of multiplying all n-residues by the word base, allowing cheap Hensel
64 * Use modular inverses also for exact division in the Lucas code, and
65 elsewhere. A problem is to locate the inverses not from an index, but
66 from a prime. We might instead compute the inverse on-the-fly.
68 * Tune trial division table size (not forgetting that this is a standalone
69 program where the table will be read from disk for each invocation).
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
109 # if !HAVE_DECL_MPZ_INITS
119 #include "full-write.h"
121 #include "readtokens.h"
124 /* The official name of this program (e.g., no 'g' prefix). */
125 #define PROGRAM_NAME "factor"
128 proper_name ("Paul Rubin"), \
129 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
130 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
132 /* Token delimiters when reading from a file. */
133 #define DELIM "\n\t "
135 #ifndef USE_LONGLONG_H
136 /* With the way we use longlong.h, it's only safe to use
137 when UWtype = UHWtype, as there were various cases
138 (as can be seen in the history for longlong.h) where
139 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
140 to avoid compile time or run time issues. */
141 # if LONG_MAX == INTMAX_MAX
142 # define USE_LONGLONG_H 1
148 /* Make definitions for longlong.h to make it do what it can do for us */
150 /* bitcount for uintmax_t */
151 # if UINTMAX_MAX == UINT32_MAX
152 # define W_TYPE_SIZE 32
153 # elif UINTMAX_MAX == UINT64_MAX
154 # define W_TYPE_SIZE 64
155 # elif UINTMAX_MAX == UINT128_MAX
156 # define W_TYPE_SIZE 128
159 # define UWtype uintmax_t
160 # define UHWtype unsigned long int
162 # if HAVE_ATTRIBUTE_MODE
163 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
164 typedef int SItype
__attribute__ ((mode (SI
)));
165 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
166 typedef int DItype
__attribute__ ((mode (DI
)));
167 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
169 typedef unsigned char UQItype
;
171 typedef unsigned long int USItype
;
172 # if HAVE_LONG_LONG_INT
173 typedef long long int DItype
;
174 typedef unsigned long long int UDItype
;
175 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
176 typedef long int DItype
;
177 typedef unsigned long int UDItype
;
180 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
181 # define ASSERT(x) /* FIXME make longlong.h really standalone */
182 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
183 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
184 # ifndef __GMP_GNUC_PREREQ
185 # define __GMP_GNUC_PREREQ(a,b) 1
188 /* These stub macros are only used in longlong.h in certain system compiler
189 combinations, so ensure usage to avoid -Wunused-macros warnings. */
190 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
196 # define HAVE_HOST_CPU_FAMILY_powerpc 1
198 # include "longlong.h"
199 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
200 const unsigned char factor_clz_tab
[129] =
202 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
203 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
204 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
205 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
210 #else /* not USE_LONGLONG_H */
212 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
213 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
214 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
215 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
219 #if !defined __clz_tab && !defined UHWtype
220 /* Without this seemingly useless conditional, gcc -Wunused-macros
221 warns that each of the two tested macros is unused on Fedora 18.
222 FIXME: this is just an ugly band-aid. Fix it properly. */
225 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
226 #define MAX_NFACTS 26
230 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
233 static struct option
const long_options
[] =
235 {"-debug", no_argument
, NULL
, DEV_DEBUG_OPTION
},
236 {GETOPT_HELP_OPTION_DECL
},
237 {GETOPT_VERSION_OPTION_DECL
},
243 uintmax_t plarge
[2]; /* Can have a single large factor */
244 uintmax_t p
[MAX_NFACTS
];
245 unsigned char e
[MAX_NFACTS
];
246 unsigned char nfactors
;
253 unsigned long int *e
;
254 unsigned long int nfactors
;
258 static void factor (uintmax_t, uintmax_t, struct factors
*);
261 # define umul_ppmm(w1, w0, u, v) \
263 uintmax_t __x0, __x1, __x2, __x3; \
264 unsigned long int __ul, __vl, __uh, __vh; \
265 uintmax_t __u = (u), __v = (v); \
267 __ul = __ll_lowpart (__u); \
268 __uh = __ll_highpart (__u); \
269 __vl = __ll_lowpart (__v); \
270 __vh = __ll_highpart (__v); \
272 __x0 = (uintmax_t) __ul * __vl; \
273 __x1 = (uintmax_t) __ul * __vh; \
274 __x2 = (uintmax_t) __uh * __vl; \
275 __x3 = (uintmax_t) __uh * __vh; \
277 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
278 __x1 += __x2; /* but this indeed can */ \
279 if (__x1 < __x2) /* did we get it? */ \
280 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
282 (w1) = __x3 + __ll_highpart (__x1); \
283 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
287 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
288 /* Define our own, not needing normalization. This function is
289 currently not performance critical, so keep it simple. Similar to
290 the mod macro below. */
292 # define udiv_qrnnd(q, r, n1, n0, d) \
294 uintmax_t __d1, __d0, __q, __r1, __r0; \
296 assert ((n1) < (d)); \
297 __d1 = (d); __d0 = 0; \
298 __r1 = (n1); __r0 = (n0); \
300 for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \
302 rsh2 (__d1, __d0, __d1, __d0, 1); \
304 if (ge2 (__r1, __r0, __d1, __d0)) \
307 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
315 #if !defined add_ssaaaa
316 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
319 _add_x = (al) + (bl); \
320 (sh) = (ah) + (bh) + (_add_x < (al)); \
325 #define rsh2(rh, rl, ah, al, cnt) \
327 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
328 (rh) = (ah) >> (cnt); \
331 #define lsh2(rh, rl, ah, al, cnt) \
333 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
334 (rl) = (al) << (cnt); \
337 #define ge2(ah, al, bh, bl) \
338 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
340 #define gt2(ah, al, bh, bl) \
341 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
344 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
348 (rl) = (al) - (bl); \
349 (rh) = (ah) - (bh) - _cy; \
353 #ifndef count_leading_zeros
354 # define count_leading_zeros(count, x) do { \
355 uintmax_t __clz_x = (x); \
356 unsigned int __clz_c; \
358 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
361 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
367 #ifndef count_trailing_zeros
368 # define count_trailing_zeros(count, x) do { \
369 uintmax_t __ctz_x = (x); \
370 unsigned int __ctz_c = 0; \
371 while ((__ctz_x & 1) == 0) \
380 /* Requires that a < n and b <= n */
381 #define submod(r,a,b,n) \
383 uintmax_t _t = - (uintmax_t) (a < b); \
384 (r) = ((n) & _t) + (a) - (b); \
387 #define addmod(r,a,b,n) \
388 submod ((r), (a), ((n) - (b)), (n))
390 /* Modular two-word addition and subtraction. For performance reasons, the
391 most significant bit of n1 must be clear. The destination variables must be
392 distinct from the mod operand. */
393 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
395 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
396 if (ge2 ((r1), (r0), (n1), (n0))) \
397 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
399 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
401 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
402 if ((intmax_t) (r1) < 0) \
403 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
406 #define HIGHBIT_TO_MASK(x) \
407 (((intmax_t)-1 >> 1) < 0 \
408 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
409 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
410 ? UINTMAX_MAX : (uintmax_t) 0))
412 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
413 Requires that d1 != 0. */
415 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
427 count_leading_zeros (cntd
, d1
);
428 count_leading_zeros (cnta
, a1
);
429 int cnt
= cntd
- cnta
;
430 lsh2 (d1
, d0
, d1
, d0
, cnt
);
431 for (int i
= 0; i
< cnt
; i
++)
433 if (ge2 (a1
, a0
, d1
, d0
))
434 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
435 rsh2 (d1
, d0
, d1
, d0
, 1);
442 static uintmax_t _GL_ATTRIBUTE_CONST
443 gcd_odd (uintmax_t a
, uintmax_t b
)
454 /* Take out least significant one bit, to make room for sign */
470 bgta
= HIGHBIT_TO_MASK (t
);
472 /* b <-- min (a, b) */
476 a
= (t
^ bgta
) - bgta
;
481 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
483 while ((a0
& 1) == 0)
484 rsh2 (a1
, a0
, a1
, a0
, 1);
485 while ((b0
& 1) == 0)
486 rsh2 (b1
, b0
, b1
, b0
, 1);
493 return gcd_odd (b0
, a0
);
496 if (gt2 (a1
, a0
, b1
, b0
))
498 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
500 rsh2 (a1
, a0
, a1
, a0
, 1);
501 while ((a0
& 1) == 0);
503 else if (gt2 (b1
, b0
, a1
, a0
))
505 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
507 rsh2 (b1
, b0
, b1
, b0
, 1);
508 while ((b0
& 1) == 0);
519 factor_insert_multiplicity (struct factors
*factors
,
520 uintmax_t prime
, unsigned int m
)
522 unsigned int nfactors
= factors
->nfactors
;
523 uintmax_t *p
= factors
->p
;
524 unsigned char *e
= factors
->e
;
526 /* Locate position for insert new or increment e. */
528 for (i
= nfactors
- 1; i
>= 0; i
--)
534 if (i
< 0 || p
[i
] != prime
)
536 for (int j
= nfactors
- 1; j
> i
; j
--)
543 factors
->nfactors
= nfactors
+ 1;
551 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
554 factor_insert_large (struct factors
*factors
,
555 uintmax_t p1
, uintmax_t p0
)
559 assert (factors
->plarge
[1] == 0);
560 factors
->plarge
[0] = p0
;
561 factors
->plarge
[1] = p1
;
564 factor_insert (factors
, p0
);
569 # if !HAVE_DECL_MPZ_INITS
571 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
572 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
575 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
579 va_start (ap
, mpz_single_init
);
582 while ((mpz
= va_arg (ap
, mpz_t
*)))
583 mpz_single_init (*mpz
);
589 static void mp_factor (mpz_t
, struct mp_factors
*);
592 mp_factor_init (struct mp_factors
*factors
)
596 factors
->nfactors
= 0;
600 mp_factor_clear (struct mp_factors
*factors
)
602 for (unsigned int i
= 0; i
< factors
->nfactors
; i
++)
603 mpz_clear (factors
->p
[i
]);
610 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
612 unsigned long int nfactors
= factors
->nfactors
;
613 mpz_t
*p
= factors
->p
;
614 unsigned long int *e
= factors
->e
;
617 /* Locate position for insert new or increment e. */
618 for (i
= nfactors
- 1; i
>= 0; i
--)
620 if (mpz_cmp (p
[i
], prime
) <= 0)
624 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
626 p
= xrealloc (p
, (nfactors
+ 1) * sizeof p
[0]);
627 e
= xrealloc (e
, (nfactors
+ 1) * sizeof e
[0]);
629 mpz_init (p
[nfactors
]);
630 for (long j
= nfactors
- 1; j
> i
; j
--)
632 mpz_set (p
[j
+ 1], p
[j
]);
635 mpz_set (p
[i
+ 1], prime
);
640 factors
->nfactors
= nfactors
+ 1;
649 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
653 mpz_init_set_ui (pz
, prime
);
654 mp_factor_insert (factors
, pz
);
657 #endif /* HAVE_GMP */
660 /* Number of bits in an uintmax_t. */
661 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
663 /* Verify that uintmax_t does not have holes in its representation. */
664 verify (UINTMAX_MAX
>> (W
- 1) == 1);
666 #define P(a,b,c,d) a,
667 static const unsigned char primes_diff
[] = {
669 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
673 #define PRIMES_PTAB_ENTRIES \
674 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
676 #define P(a,b,c,d) b,
677 static const unsigned char primes_diff8
[] = {
679 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
688 #define P(a,b,c,d) {c,d},
689 static const struct primes_dtab primes_dtab
[] = {
691 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
695 /* Verify that uintmax_t is not wider than
696 the integers used to generate primes.h. */
697 verify (W
<= WIDE_UINT_BITS
);
699 /* debugging for developers. Enables devmsg().
700 This flag is used only in the GMP code. */
701 static bool dev_debug
= false;
703 /* Prove primality or run probabilistic tests. */
704 static bool flag_prove_primality
= PROVE_PRIMALITY
;
706 /* Number of Miller-Rabin tests to run when not proving primality. */
710 factor_insert_refind (struct factors
*factors
, uintmax_t p
, unsigned int i
,
713 for (unsigned int j
= 0; j
< off
; j
++)
714 p
+= primes_diff
[i
+ j
];
715 factor_insert (factors
, p
);
718 /* Trial division with odd primes uses the following trick.
720 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
721 consider the case t < B (this is the second loop below).
723 From our tables we get
725 binv = p^{-1} (mod B)
726 lim = floor ( (B-1) / p ).
728 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
729 (and all quotients in this range occur for some t).
731 Then t = q * p is true also (mod B), and p is invertible we get
733 q = t * binv (mod B).
735 Next, assume that t is *not* divisible by p. Since multiplication
736 by binv (mod B) is a one-to-one mapping,
738 t * binv (mod B) > lim,
740 because all the smaller values are already taken.
742 This can be summed up by saying that the function
744 q(t) = binv * t (mod B)
746 is a permutation of the range 0 <= t < B, with the curious property
747 that it maps the multiples of p onto the range 0 <= q <= lim, in
748 order, and the non-multiples of p onto the range lim < q < B.
752 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
753 struct factors
*factors
)
761 count_trailing_zeros (cnt
, t1
);
768 count_trailing_zeros (cnt
, t0
);
769 rsh2 (t1
, t0
, t1
, t0
, cnt
);
772 factor_insert_multiplicity (factors
, 2, cnt
);
777 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
781 uintmax_t q1
, q0
, hi
, lo _GL_UNUSED
;
783 q0
= t0
* primes_dtab
[i
].binv
;
784 umul_ppmm (hi
, lo
, q0
, p
);
788 q1
= hi
* primes_dtab
[i
].binv
;
789 if (LIKELY (q1
> primes_dtab
[i
].lim
))
792 factor_insert (factors
, p
);
794 p
+= primes_diff
[i
+ 1];
799 #define DIVBLOCK(I) \
803 q = t0 * pd[I].binv; \
804 if (LIKELY (q > pd[I].lim)) \
807 factor_insert_refind (factors, p, i + 1, I); \
811 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
814 const struct primes_dtab
*pd
= &primes_dtab
[i
];
824 p
+= primes_diff8
[i
];
834 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
839 devmsg ("[trial division] ");
843 p
= mpz_scan1 (t
, 0);
844 mpz_div_2exp (t
, t
, p
);
847 mp_factor_insert_ui (factors
, 2);
852 for (unsigned int i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
854 if (! mpz_divisible_ui_p (t
, p
))
856 p
+= primes_diff
[i
++];
857 if (mpz_cmp_ui (t
, p
* p
) < 0)
862 mpz_tdiv_q_ui (t
, t
, p
);
863 mp_factor_insert_ui (factors
, p
);
871 /* Entry i contains (2i+1)^(-1) mod 2^8. */
872 static const unsigned char binvert_table
[128] =
874 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
875 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
876 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
877 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
878 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
879 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
880 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
881 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
882 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
883 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
884 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
885 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
886 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
887 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
888 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
889 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
892 /* Compute n^(-1) mod B, using a Newton iteration. */
893 #define binv(inv,n) \
895 uintmax_t __n = (n); \
898 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
899 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
900 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
901 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
903 if (W_TYPE_SIZE > 64) \
905 int __invbits = 64; \
907 __inv = 2 * __inv - __inv * __inv * __n; \
909 } while (__invbits < W_TYPE_SIZE); \
915 /* q = u / d, assuming d|u. */
916 #define divexact_21(q1, q0, u1, u0, d) \
918 uintmax_t _di, _q0; \
923 uintmax_t _p1, _p0 _GL_UNUSED; \
924 umul_ppmm (_p1, _p0, _q0, d); \
925 (q1) = ((u1) - _p1) * _di; \
936 #define redcify(r_prim, r, n) \
938 uintmax_t _redcify_q _GL_UNUSED; \
939 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
942 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
943 #define redcify2(r1, r0, x, n1, n0) \
945 uintmax_t _r1, _r0, _i; \
948 _r1 = (x); _r0 = 0; \
953 _r1 = 0; _r0 = (x); \
954 _i = 2*W_TYPE_SIZE; \
958 lsh2 (_r1, _r0, _r1, _r0, 1); \
959 if (ge2 (_r1, _r0, (n1), (n0))) \
960 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
966 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
967 Both a and b must be in redc form, the result will be in redc form too. */
968 static inline uintmax_t
969 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
971 uintmax_t rh
, rl
, q
, th
, tl _GL_UNUSED
, xh
;
973 umul_ppmm (rh
, rl
, a
, b
);
975 umul_ppmm (th
, tl
, q
, m
);
983 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
984 Both a and b must be in redc form, the result will be in redc form too.
985 For performance reasons, the most significant bit of m must be clear. */
987 mulredc2 (uintmax_t *r1p
,
988 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
989 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
991 uintmax_t r1
, r0
, q
, p1
, p0 _GL_UNUSED
, t1
, t0
, s1
, s0
;
993 assert ( (a1
>> (W_TYPE_SIZE
- 1)) == 0);
994 assert ( (b1
>> (W_TYPE_SIZE
- 1)) == 0);
995 assert ( (m1
>> (W_TYPE_SIZE
- 1)) == 0);
997 /* First compute a0 * <b1, b0> B^{-1}
1010 umul_ppmm (t1
, t0
, a0
, b0
);
1011 umul_ppmm (r1
, r0
, a0
, b1
);
1013 umul_ppmm (p1
, p0
, q
, m0
);
1014 umul_ppmm (s1
, s0
, q
, m1
);
1015 r0
+= (t0
!= 0); /* Carry */
1016 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1017 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1018 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1020 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1035 umul_ppmm (t1
, t0
, a1
, b0
);
1036 umul_ppmm (s1
, s0
, a1
, b1
);
1037 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1039 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1040 umul_ppmm (p1
, p0
, q
, m0
);
1041 umul_ppmm (s1
, s0
, q
, m1
);
1042 r0
+= (t0
!= 0); /* Carry */
1043 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1044 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1045 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1047 if (ge2 (r1
, r0
, m1
, m0
))
1048 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1054 static uintmax_t _GL_ATTRIBUTE_CONST
1055 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1064 b
= mulredc (b
, b
, n
, ni
);
1068 y
= mulredc (y
, b
, n
, ni
);
1075 powm2 (uintmax_t *r1m
,
1076 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1077 uintmax_t ni
, const uintmax_t *one
)
1079 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1091 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1095 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1098 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1101 for (e
= ep
[1]; e
> 0; e
>>= 1)
1105 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1108 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1115 static bool _GL_ATTRIBUTE_CONST
1116 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1117 unsigned int k
, uintmax_t one
)
1119 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1121 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1123 if (y
== one
|| y
== nm1
)
1126 for (unsigned int i
= 1; i
< k
; i
++)
1128 y
= mulredc (y
, y
, n
, ni
);
1139 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1140 const uintmax_t *qp
, unsigned int k
, const uintmax_t *one
)
1142 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1144 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1147 if (y0
== one
[0] && y1
== one
[1])
1150 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1152 if (y0
== nm1_0
&& y1
== nm1_1
)
1155 for (unsigned int i
= 1; i
< k
; i
++)
1157 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1160 if (y0
== nm1_0
&& y1
== nm1_1
)
1162 if (y0
== one
[0] && y1
== one
[1])
1170 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1171 mpz_srcptr q
, unsigned long int k
)
1173 mpz_powm (y
, x
, q
, n
);
1175 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1178 for (unsigned long int i
= 1; i
< k
; i
++)
1180 mpz_powm_ui (y
, y
, 2, n
);
1181 if (mpz_cmp (y
, nm1
) == 0)
1183 if (mpz_cmp_ui (y
, 1) == 0)
1190 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1191 have been observed. The average seem to be about 2. */
1193 prime_p (uintmax_t n
)
1197 uintmax_t a_prim
, one
, ni
;
1198 struct factors factors
;
1203 /* We have already casted out small primes. */
1204 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1207 /* Precomputation for Miller-Rabin. */
1208 uintmax_t q
= n
- 1;
1209 for (k
= 0; (q
& 1) == 0; k
++)
1213 binv (ni
, n
); /* ni <- 1/n mod B */
1214 redcify (one
, 1, n
);
1215 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1217 /* Perform a Miller-Rabin test, finds most composites quickly. */
1218 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1221 if (flag_prove_primality
)
1223 /* Factor n-1 for Lucas. */
1224 factor (0, n
- 1, &factors
);
1227 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1228 number composite. */
1229 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1231 if (flag_prove_primality
)
1234 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1237 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1242 /* After enough Miller-Rabin runs, be content. */
1243 is_prime
= (r
== MR_REPS
- 1);
1249 a
+= primes_diff
[r
]; /* Establish new base. */
1251 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1252 on most processors, since it avoids udiv_qrnnd. If we go down the
1253 udiv_qrnnd_preinv path, this code should be replaced. */
1256 umul_ppmm (s1
, s0
, one
, a
);
1257 if (LIKELY (s1
== 0))
1261 uintmax_t dummy _GL_UNUSED
;
1262 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1266 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1270 error (0, 0, _("Lucas prime test failure. This should not happen"));
1275 prime2_p (uintmax_t n1
, uintmax_t n0
)
1277 uintmax_t q
[2], nm1
[2];
1278 uintmax_t a_prim
[2];
1283 struct factors factors
;
1286 return prime_p (n0
);
1288 nm1
[1] = n1
- (n0
== 0);
1292 count_trailing_zeros (k
, nm1
[1]);
1300 count_trailing_zeros (k
, nm1
[0]);
1301 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1306 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1307 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1309 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1313 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1316 if (flag_prove_primality
)
1318 /* Factor n-1 for Lucas. */
1319 factor (nm1
[1], nm1
[0], &factors
);
1322 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323 number composite. */
1324 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1327 uintmax_t e
[2], y
[2];
1329 if (flag_prove_primality
)
1332 if (factors
.plarge
[1])
1335 binv (pi
, factors
.plarge
[0]);
1338 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1339 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1341 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1343 /* FIXME: We always have the factor 2. Do we really need to
1344 handle it here? We have done the same powering as part
1346 if (factors
.p
[i
] == 2)
1347 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1349 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1350 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1351 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1356 /* After enough Miller-Rabin runs, be content. */
1357 is_prime
= (r
== MR_REPS
- 1);
1363 a
+= primes_diff
[r
]; /* Establish new base. */
1364 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1366 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1370 error (0, 0, _("Lucas prime test failure. This should not happen"));
1376 mp_prime_p (mpz_t n
)
1379 mpz_t q
, a
, nm1
, tmp
;
1380 struct mp_factors factors
;
1382 if (mpz_cmp_ui (n
, 1) <= 0)
1385 /* We have already casted out small primes. */
1386 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1389 mpz_inits (q
, a
, nm1
, tmp
, NULL
);
1391 /* Precomputation for Miller-Rabin. */
1392 mpz_sub_ui (nm1
, n
, 1);
1394 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1395 unsigned long int k
= mpz_scan1 (nm1
, 0);
1396 mpz_tdiv_q_2exp (q
, nm1
, k
);
1400 /* Perform a Miller-Rabin test, finds most composites quickly. */
1401 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1407 if (flag_prove_primality
)
1409 /* Factor n-1 for Lucas. */
1411 mp_factor (tmp
, &factors
);
1414 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1415 number composite. */
1416 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1418 if (flag_prove_primality
)
1421 for (unsigned long int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1423 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1424 mpz_powm (tmp
, a
, tmp
, n
);
1425 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1430 /* After enough Miller-Rabin runs, be content. */
1431 is_prime
= (r
== MR_REPS
- 1);
1437 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1439 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1446 error (0, 0, _("Lucas prime test failure. This should not happen"));
1450 if (flag_prove_primality
)
1451 mp_factor_clear (&factors
);
1453 mpz_clears (q
, a
, nm1
, tmp
, NULL
);
1460 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1461 struct factors
*factors
)
1463 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1465 unsigned long int k
= 1;
1466 unsigned long int l
= 1;
1469 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1476 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1482 x
= mulredc (x
, x
, n
, ni
);
1483 addmod (x
, x
, a
, n
);
1485 submod (t
, z
, x
, n
);
1486 P
= mulredc (P
, t
, n
, ni
);
1490 if (gcd_odd (P
, n
) != 1)
1500 for (unsigned long int i
= 0; i
< k
; i
++)
1502 x
= mulredc (x
, x
, n
, ni
);
1503 addmod (x
, x
, a
, n
);
1511 y
= mulredc (y
, y
, n
, ni
);
1512 addmod (y
, y
, a
, n
);
1514 submod (t
, z
, y
, n
);
1522 factor_using_pollard_rho (g
, a
+ 1, factors
);
1524 factor_insert (factors
, g
);
1528 factor_insert (factors
, n
);
1539 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1540 struct factors
*factors
)
1542 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1544 unsigned long int k
= 1;
1545 unsigned long int l
= 1;
1547 redcify2 (P1
, P0
, 1, n1
, n0
);
1548 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1552 while (n1
!= 0 || n0
!= 1)
1560 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1562 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1564 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1565 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1570 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1571 if (g1
!= 0 || g0
!= 1)
1581 for (unsigned long int i
= 0; i
< k
; i
++)
1583 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1585 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1593 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1595 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1597 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1598 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1600 while (g1
== 0 && g0
== 1);
1604 /* The found factor is one word. */
1605 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1608 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1610 factor_insert (factors
, g0
);
1614 /* The found factor is two words. This is highly unlikely, thus hard
1615 to trigger. Please be careful before you change this code! */
1618 binv (ginv
, g0
); /* Compute n = n / g. Since the result will */
1619 n0
= ginv
* n0
; /* fit one word, we can compute the quotient */
1620 n1
= 0; /* modulo B, ignoring the high divisor word. */
1622 if (!prime2_p (g1
, g0
))
1623 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1625 factor_insert_large (factors
, g1
, g0
);
1632 factor_insert (factors
, n0
);
1636 factor_using_pollard_rho (n0
, a
, factors
);
1640 if (prime2_p (n1
, n0
))
1642 factor_insert_large (factors
, n1
, n0
);
1646 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1647 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1648 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1654 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1655 struct mp_factors
*factors
)
1660 devmsg ("[pollard-rho (%lu)] ", a
);
1662 mpz_inits (t
, t2
, NULL
);
1663 mpz_init_set_si (y
, 2);
1664 mpz_init_set_si (x
, 2);
1665 mpz_init_set_si (z
, 2);
1666 mpz_init_set_ui (P
, 1);
1668 unsigned long long int k
= 1;
1669 unsigned long long int l
= 1;
1671 while (mpz_cmp_ui (n
, 1) != 0)
1679 mpz_add_ui (x
, x
, a
);
1688 if (mpz_cmp_ui (t
, 1) != 0)
1698 for (unsigned long long int i
= 0; i
< k
; i
++)
1702 mpz_add_ui (x
, x
, a
);
1712 mpz_add_ui (y
, y
, a
);
1717 while (mpz_cmp_ui (t
, 1) == 0);
1719 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1721 if (!mp_prime_p (t
))
1723 devmsg ("[composite factor--restarting pollard-rho] ");
1724 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1728 mp_factor_insert (factors
, t
);
1733 mp_factor_insert (factors
, n
);
1742 mpz_clears (P
, t2
, t
, z
, x
, y
, NULL
);
1747 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1748 algorithm is replaced, consider also returning the remainder. */
1749 static uintmax_t _GL_ATTRIBUTE_CONST
1757 count_leading_zeros (c
, n
);
1759 /* Make x > sqrt(n). This will be invariant through the loop. */
1760 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) / 2);
1764 uintmax_t y
= (x
+ n
/x
) / 2;
1772 static uintmax_t _GL_ATTRIBUTE_CONST
1773 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1778 /* Ensures the remainder fits in an uintmax_t. */
1779 assert (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1784 count_leading_zeros (shift
, nh
);
1787 /* Make x > sqrt(n) */
1788 x
= isqrt ( (nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1789 x
<<= (W_TYPE_SIZE
- shift
) / 2;
1791 /* Do we need more than one iteration? */
1794 uintmax_t r _GL_UNUSED
;
1796 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1802 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1803 assert (gt2 (hi
, lo
, nh
, nl
));
1805 umul_ppmm (hi
, lo
, x
, x
);
1806 assert (ge2 (nh
, nl
, hi
, lo
));
1807 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1817 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1818 # define MAGIC64 0x0202021202030213ULL
1819 # define MAGIC63 0x0402483012450293ULL
1820 # define MAGIC65 0x218a019866014613ULL
1821 # define MAGIC11 0x23b
1823 /* Return the square root if the input is a square, otherwise 0. */
1824 static uintmax_t _GL_ATTRIBUTE_CONST
1825 is_square (uintmax_t x
)
1827 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1828 computing the square root. */
1829 if (((MAGIC64
>> (x
& 63)) & 1)
1830 && ((MAGIC63
>> (x
% 63)) & 1)
1831 /* Both 0 and 64 are squares mod (65) */
1832 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1833 && ((MAGIC11
>> (x
% 11) & 1)))
1835 uintmax_t r
= isqrt (x
);
1842 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1843 static const unsigned short invtab
[0x81] =
1846 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1847 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1848 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1849 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1850 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1851 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1852 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1853 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1854 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1855 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1856 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1857 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1858 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1859 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1860 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1861 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1864 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1865 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1866 # define div_smallq(q, r, u, d) \
1868 if ((u) / 0x40 < (d)) \
1871 uintmax_t _dinv, _mask, _q, _r; \
1872 count_leading_zeros (_cnt, (d)); \
1874 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1876 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1877 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1881 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1882 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1886 _mask = -(uintmax_t) (_r >= (d)); \
1887 (r) = _r - (_mask & (d)); \
1889 assert ( (q) * (d) + (r) == u); \
1893 uintmax_t _q = (u) / (d); \
1894 (r) = (u) - _q * (d); \
1899 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1900 = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1903 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1904 representing G_0 = (-55, 2*4652, 8653).
1906 In the notation of the paper:
1908 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1912 t_0 = floor([q_0 + R_0] / S0) = 1
1913 R_1 = t_0 * S_0 - R_0 = 4001
1914 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1917 /* Multipliers, in order of efficiency:
1918 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1919 0.7317 3*5*7 = 105 = 1
1920 0.7820 3*5*11 = 165 = 1
1922 0.8101 3*7*11 = 231 = 3
1924 0.8284 5*7*11 = 385 = 1
1926 0.8716 3*11 = 33 = 1
1928 0.8913 5*11 = 55 = 3
1930 0.9233 7*11 = 77 = 1
1934 # define QUEUE_SIZE 50
1938 # define Q_FREQ_SIZE 50
1939 /* Element 0 keeps the total */
1940 static unsigned int q_freq
[Q_FREQ_SIZE
+ 1];
1941 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1945 /* Return true on success. Expected to fail only for numbers
1946 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1948 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1950 /* Uses algorithm and notation from
1952 SQUARE FORM FACTORIZATION
1953 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1955 http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1958 static const unsigned int multipliers_1
[] =
1960 105, 165, 21, 385, 33, 5, 77, 1, 0
1962 static const unsigned int multipliers_3
[] =
1964 1155, 15, 231, 35, 3, 55, 7, 11, 0
1967 const unsigned int *m
;
1969 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1971 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1974 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1976 if (n0
== sqrt_n
* sqrt_n
)
1980 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
1985 if (prime_p (sqrt_n
))
1986 factor_insert_multiplicity (factors
, sqrt_n
, 2);
1992 if (!factor_using_squfof (0, sqrt_n
, &f
))
1994 /* Try pollard rho instead */
1995 factor_using_pollard_rho (sqrt_n
, 1, &f
);
1997 /* Duplicate the new factors */
1998 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
1999 factor_insert_multiplicity (factors
, f
.p
[i
], 2*f
.e
[i
]);
2005 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2006 for (m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2009 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2011 unsigned int mu
= *m
;
2012 unsigned int qpos
= 0;
2014 assert (mu
* n0
% 4 == 3);
2016 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2017 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2018 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2021 However, this seems insufficient: With n = 37243139 and mu =
2022 105, we get a trivial factor, from the square 38809 = 197^2,
2023 without any corresponding Q earlier in the iteration.
2025 Requiring 64 mu^3 < n seems sufficient. */
2028 if ((uintmax_t) mu
*mu
*mu
>= n0
/ 64)
2033 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2036 umul_ppmm (Dh
, Dl
, n0
, mu
);
2039 assert (Dl
% 4 != 1);
2040 assert (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2042 S
= isqrt2 (Dh
, Dl
);
2047 /* Square root remainder fits in one word, so ignore high part. */
2049 /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2054 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2057 for (i
= 0; i
<= B
; i
++)
2059 uintmax_t q
, P1
, t
, rem
;
2061 div_smallq (q
, rem
, S
+P
, Q
);
2062 P1
= S
- rem
; /* P1 = q*Q - P */
2064 IF_LINT (assert (q
> 0 && Q
> 0));
2068 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2078 g
/= gcd_odd (g
, mu
);
2082 if (qpos
>= QUEUE_SIZE
)
2083 die (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2085 queue
[qpos
].P
= P
% g
;
2090 /* I think the difference can be either sign, but mod
2091 2^W_TYPE_SIZE arithmetic should be fine. */
2092 t
= Q1
+ q
* (P
- P1
);
2099 uintmax_t r
= is_square (Q
);
2102 for (unsigned int j
= 0; j
< qpos
; j
++)
2104 if (queue
[j
].Q
== r
)
2107 /* Traversed entire cycle. */
2108 goto next_multiplier
;
2110 /* Need the absolute value for divisibility test. */
2111 if (P
>= queue
[j
].P
)
2117 /* Delete entries up to and including entry
2118 j, which matched. */
2119 memmove (queue
, queue
+ j
+ 1,
2120 (qpos
- j
- 1) * sizeof (queue
[0]));
2127 /* We have found a square form, which should give a
2130 assert (S
>= P
); /* What signs are possible? */
2131 P
+= r
* ((S
- P
) / r
);
2133 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2134 for the case D = 2N. */
2135 /* Compute Q = (D - P*P) / Q1, but we need double
2138 umul_ppmm (hi
, lo
, P
, P
);
2139 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2140 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2145 /* Note: There appears to by a typo in the paper,
2146 Step 4a in the algorithm description says q <--
2147 floor([S+P]/\hat Q), but looking at the equations
2148 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2149 (In this code, \hat Q is Q1). */
2150 div_smallq (q
, rem
, S
+P
, Q
);
2151 P1
= S
- rem
; /* P1 = q*Q - P */
2155 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2159 t
= Q1
+ q
* (P
- P1
);
2167 Q
/= gcd_odd (Q
, mu
);
2169 assert (Q
> 1 && (n1
|| Q
< n0
));
2172 factor_insert (factors
, Q
);
2173 else if (!factor_using_squfof (0, Q
, factors
))
2174 factor_using_pollard_rho (Q
, 2, factors
);
2176 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2178 if (prime2_p (n1
, n0
))
2179 factor_insert_large (factors
, n1
, n0
);
2182 if (!factor_using_squfof (n1
, n0
, factors
))
2185 factor_using_pollard_rho (n0
, 1, factors
);
2187 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2202 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2203 results in FACTORS. */
2205 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2207 factors
->nfactors
= 0;
2208 factors
->plarge
[1] = 0;
2210 if (t1
== 0 && t0
< 2)
2213 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2215 if (t1
== 0 && t0
< 2)
2218 if (prime2_p (t1
, t0
))
2219 factor_insert_large (factors
, t1
, t0
);
2223 if (factor_using_squfof (t1
, t0
, factors
))
2228 factor_using_pollard_rho (t0
, 1, factors
);
2230 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2235 /* Use Pollard-rho to compute the prime factors of
2236 arbitrary-precision T, and put the results in FACTORS. */
2238 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2240 mp_factor_init (factors
);
2242 if (mpz_sgn (t
) != 0)
2244 mp_factor_using_division (t
, factors
);
2246 if (mpz_cmp_ui (t
, 1) != 0)
2248 devmsg ("[is number prime?] ");
2250 mp_factor_insert (factors
, t
);
2252 mp_factor_using_pollard_rho (t
, 1, factors
);
2259 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, const char *s
)
2261 unsigned int lo_carry
;
2262 uintmax_t hi
= 0, lo
= 0;
2264 strtol_error err
= LONGINT_INVALID
;
2266 /* Skip initial spaces and '+'. */
2281 /* Initial scan for invalid digits. */
2285 unsigned int c
= *p
++;
2289 if (UNLIKELY (!ISDIGIT (c
)))
2291 err
= LONGINT_INVALID
;
2295 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2298 for (;err
== LONGINT_OK
;)
2300 unsigned int c
= *s
++;
2306 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2308 err
= LONGINT_OVERFLOW
;
2313 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2314 lo_carry
+= 10 * lo
< 2 * lo
;
2321 if (UNLIKELY (hi
< lo_carry
))
2323 err
= LONGINT_OVERFLOW
;
2334 /* Structure and routines for buffering and outputting full lines,
2335 to support parallel operation efficiently. */
2342 /* 512 is chosen to give good performance,
2343 and also is the max guaranteed size that
2344 consumers can read atomically through pipes.
2345 Also it's big enough to cater for max line length
2346 even with 128 bit uintmax_t. */
2347 #define FACTOR_PIPE_BUF 512
2355 /* Double to ensure enough space for
2356 previous numbers + next number. */
2357 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2358 lbuf
.end
= lbuf
.buf
;
2361 /* Write complete LBUF to standard output. */
2365 size_t size
= lbuf
.end
- lbuf
.buf
;
2366 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2367 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
2368 lbuf
.end
= lbuf
.buf
;
2371 /* Add a character C to LBUF and if it's a newline
2372 and enough bytes are already buffered,
2373 then write atomically to standard output. */
2381 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2383 /* Provide immediate output for interactive input. */
2384 static int line_buffered
= -1;
2385 if (line_buffered
== -1)
2386 line_buffered
= isatty (STDIN_FILENO
);
2389 else if (buffered
>= FACTOR_PIPE_BUF
)
2391 /* Write output in <= PIPE_BUF chunks
2392 so consumers can read atomically. */
2393 char const *tend
= lbuf
.end
;
2395 /* Since a umaxint_t's factors must fit in 512
2396 we're guaranteed to find a newline here. */
2397 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2398 while (*--tlend
!= '\n');
2404 /* Buffer the remainder. */
2405 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2406 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2411 /* Buffer an int to the internal LBUF. */
2413 lbuf_putint (uintmax_t i
, size_t min_width
)
2415 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2416 char const *umaxstr
= umaxtostr (i
, buf
);
2417 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2420 for (; z
< min_width
; z
++)
2423 memcpy (lbuf
.end
, umaxstr
, width
);
2428 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2433 lbuf_putint (t0
, 0);
2436 /* Use very plain code here since it seems hard to write fast code
2437 without assuming a specific word size. */
2438 q
= t1
/ 1000000000;
2439 r
= t1
% 1000000000;
2440 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2441 print_uintmaxes (q
, t0
);
2446 /* Single-precision factoring */
2448 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2450 struct factors factors
;
2452 print_uintmaxes (t1
, t0
);
2455 factor (t1
, t0
, &factors
);
2457 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2458 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2461 print_uintmaxes (0, factors
.p
[j
]);
2464 if (factors
.plarge
[1])
2467 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2473 /* Emit the factors of the indicated number. If we have the option of using
2474 either algorithm, we select on the basis of the length of the number.
2475 For longer numbers, we prefer the MP algorithm even if the native algorithm
2476 has enough digits, because the algorithm is better. The turnover point
2477 depends on the value. */
2479 print_factors (const char *input
)
2483 /* Try converting the number to one or two words. If it fails, use GMP or
2484 print an error message. The 2nd condition checks that the most
2485 significant bit of the two-word number is clear, in a typesize neutral
2487 strtol_error err
= strto2uintmax (&t1
, &t0
, input
);
2492 if (((t1
<< 1) >> 1) == t1
)
2494 devmsg ("[using single-precision arithmetic] ");
2495 print_factors_single (t1
, t0
);
2500 case LONGINT_OVERFLOW
:
2505 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2510 devmsg ("[using arbitrary-precision arithmetic] ");
2512 struct mp_factors factors
;
2514 mpz_init_set_str (t
, input
, 10);
2516 gmp_printf ("%Zd:", t
);
2517 mp_factor (t
, &factors
);
2519 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2520 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2521 gmp_printf (" %Zd", factors
.p
[j
]);
2523 mp_factor_clear (&factors
);
2529 error (0, 0, _("%s is too large"), quote (input
));
2537 if (status
!= EXIT_SUCCESS
)
2542 Usage: %s [NUMBER]...\n\
2545 program_name
, program_name
);
2547 Print the prime factors of each specified integer NUMBER. If none\n\
2548 are specified on the command line, read them from standard input.\n\
2551 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2552 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2553 emit_ancillary_info (PROGRAM_NAME
);
2562 token_buffer tokenbuffer
;
2564 init_tokenbuffer (&tokenbuffer
);
2568 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2570 if (token_length
== (size_t) -1)
2572 ok
&= print_factors (tokenbuffer
.buffer
);
2574 free (tokenbuffer
.buffer
);
2580 main (int argc
, char **argv
)
2582 initialize_main (&argc
, &argv
);
2583 set_program_name (argv
[0]);
2584 setlocale (LC_ALL
, "");
2585 bindtextdomain (PACKAGE
, LOCALEDIR
);
2586 textdomain (PACKAGE
);
2589 atexit (close_stdout
);
2590 atexit (lbuf_flush
);
2593 while ((c
= getopt_long (argc
, argv
, "", long_options
, NULL
)) != -1)
2597 case DEV_DEBUG_OPTION
:
2601 case_GETOPT_HELP_CHAR
;
2603 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2606 usage (EXIT_FAILURE
);
2611 memset (q_freq
, 0, sizeof (q_freq
));
2620 for (int i
= optind
; i
< argc
; i
++)
2621 if (! print_factors (argv
[i
]))
2629 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2630 for (unsigned int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2632 double f
= (double) q_freq
[i
] / q_freq
[0];
2634 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2635 100.0 * f
, 100.0 * acc_f
);
2640 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;