src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2024 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organization:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38   Algorithm:
  39
  40     (1) Perform trial division using a small primes table, but without hardware
  41         division since the primes table store inverses modulo the word base.
  42         (The GMP variant of this code doesn't make use of the precomputed
  43         inverses, but instead relies on GMP for fast divisibility testing.)
  44     (2) Check the nature of any non-factored part using Miller-Rabin for
  45         detecting composites, and Lucas for detecting primes.
  46     (3) Factor any remaining composite part using the Pollard-Brent rho
  47         algorithm or if USE_SQUFOF is defined to 1, try that first.
  48         Status of found factors are checked again using Miller-Rabin and Lucas.
  49
  50     We prefer using Hensel norm in the divisions, not the more familiar
  51     Euclidean norm, since the former leads to much faster code.  In the
  52     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  53     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  54     reductions mod n.
  55
  56     The GMP code uses an algorithm that can be considerably slower;
  57     for example, on a circa-2017 Intel Xeon Silver 4116, factoring
  58     2^{127}-3 takes about 50 ms with the two-word algorithm but would
  59     take about 750 ms with the GMP code.
  60
  61   Improvements:
  62
  63     * Use modular inverses also for exact division in the Lucas code, and
  64       elsewhere.  A problem is to locate the inverses not from an index, but
  65       from a prime.  We might instead compute the inverse on-the-fly.
  66
  67     * Tune trial division table size (not forgetting that this is a standalone
  68       program where the table will be read from secondary storage for
  69       each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidean)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #include <gmp.h>
 108
 109 #include "system.h"
 110 #include "assure.h"
 111 #include "full-write.h"
 112 #include "quote.h"
 113 #include "readtokens.h"
 114 #include "xstrtol.h"
 115
 116 /* The official name of this program (e.g., no 'g' prefix).  */
 117 #define PROGRAM_NAME "factor"
 118
 119 #define AUTHORS \
 120   proper_name ("Paul Rubin"),                                           \
 121   proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 122   proper_name_lite ("Niels Moller", "Niels M\303\266ller")
 123
 124 /* Token delimiters when reading from a file.  */
 125 #define DELIM "\n\t "
 126
 127 #ifndef USE_LONGLONG_H
 128 /* With the way we use longlong.h, it's only safe to use
 129    when UWtype = UHWtype, as there were various cases
 130    (as can be seen in the history for longlong.h) where
 131    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 132    to avoid compile time or run time issues.  */
 133 # if LONG_MAX == INTMAX_MAX
 134 #  define USE_LONGLONG_H 1
 135 # endif
 136 #endif
 137
 138 #if USE_LONGLONG_H
 139
 140 /* Make definitions for longlong.h to make it do what it can do for us */
 141
 142 /* bitcount for uintmax_t */
 143 # if UINTMAX_MAX == UINT32_MAX
 144 #  define W_TYPE_SIZE 32
 145 # elif UINTMAX_MAX == UINT64_MAX
 146 #  define W_TYPE_SIZE 64
 147 # elif UINTMAX_MAX == UINT128_MAX
 148 #  define W_TYPE_SIZE 128
 149 # endif
 150
 151 # define UWtype  uintmax_t
 152 # define UHWtype unsigned long int
 153 # undef UDWtype
 154 # if HAVE_ATTRIBUTE_MODE
 155 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 156 typedef          int SItype     __attribute__ ((mode (SI)));
 157 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 158 typedef          int DItype     __attribute__ ((mode (DI)));
 159 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 160 # else
 161 typedef unsigned char UQItype;
 162 typedef          long SItype;
 163 typedef unsigned long int USItype;
 164 #  if HAVE_LONG_LONG_INT
 165 typedef long long int DItype;
 166 typedef unsigned long long int UDItype;
 167 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 168 typedef long int DItype;
 169 typedef unsigned long int UDItype;
 170 #  endif
 171 # endif
 172 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 173 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 174 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 175 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 176 # ifndef __GMP_GNUC_PREREQ
 177 #  define __GMP_GNUC_PREREQ(a,b) 1
 178 # endif
 179
 180 /* These stub macros are only used in longlong.h in certain system compiler
 181    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 182 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 183 ASSERT (1)
 184 __GMP_DECLSPEC
 185 # endif
 186
 187 # if _ARCH_PPC
 188 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 189 # endif
 190 # include "longlong.h"
 191 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 192 const unsigned char factor_clz_tab[129] =
 193 {
 194   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 195   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 196   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 197   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 198   9
 199 };
 200 # endif
 201
 202 #else /* not USE_LONGLONG_H */
 203
 204 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 205 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 206 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 207 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 208
 209 #endif
 210
 211 #if !defined __clz_tab && !defined UHWtype
 212 /* Without this seemingly useless conditional, gcc -Wunused-macros
 213    warns that each of the two tested macros is unused on Fedora 18.
 214    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 215 #endif
 216
 217 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 218 #define MAX_NFACTS 26
 219
 220 enum
 221 {
 222   DEV_DEBUG_OPTION = CHAR_MAX + 1
 223 };
 224
 225 static struct option const long_options[] =
 226 {
 227   {"exponents", no_argument, nullptr, 'h'},
 228   {"-debug", no_argument, nullptr, DEV_DEBUG_OPTION},
 229   {GETOPT_HELP_OPTION_DECL},
 230   {GETOPT_VERSION_OPTION_DECL},
 231   {nullptr, 0, nullptr, 0}
 232 };
 233
 234 /* If true, use p^e output format.  */
 235 static bool print_exponents;
 236
 237 struct factors
 238 {
 239   uintmax_t     plarge[2]; /* Can have a single large factor */
 240   uintmax_t     p[MAX_NFACTS];
 241   unsigned char e[MAX_NFACTS];
 242   unsigned char nfactors;
 243 };
 244
 245 struct mp_factors
 246 {
 247   mpz_t             *p;
 248   unsigned long int *e;
 249   idx_t nfactors;
 250   idx_t nalloc;
 251 };
 252
 253 static void factor (uintmax_t, uintmax_t, struct factors *);
 254
 255 #ifndef umul_ppmm
 256 # define umul_ppmm(w1, w0, u, v)                                        \
 257   do {                                                                  \
 258     uintmax_t __x0, __x1, __x2, __x3;                                   \
 259     unsigned long int __ul, __vl, __uh, __vh;                           \
 260     uintmax_t __u = (u), __v = (v);                                     \
 261                                                                         \
 262     __ul = __ll_lowpart (__u);                                          \
 263     __uh = __ll_highpart (__u);                                         \
 264     __vl = __ll_lowpart (__v);                                          \
 265     __vh = __ll_highpart (__v);                                         \
 266                                                                         \
 267     __x0 = (uintmax_t) __ul * __vl;                                     \
 268     __x1 = (uintmax_t) __ul * __vh;                                     \
 269     __x2 = (uintmax_t) __uh * __vl;                                     \
 270     __x3 = (uintmax_t) __uh * __vh;                                     \
 271                                                                         \
 272     __x1 += __ll_highpart (__x0);/* This can't give carry.  */          \
 273     __x1 += __x2;               /* But this indeed can.  */             \
 274     if (__x1 < __x2)            /* Did we get it?  */                   \
 275       __x3 += __ll_B;           /* Yes, add it in the proper pos.  */   \
 276                                                                         \
 277     (w1) = __x3 + __ll_highpart (__x1);                                 \
 278     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 279   } while (0)
 280 #endif
 281
 282 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 283 /* Define our own, not needing normalization.  This function is
 284    currently not performance critical, so keep it simple.  Similar to
 285    the mod macro below.  */
 286 # undef udiv_qrnnd
 287 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 288   do {                                                                  \
 289     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 290                                                                         \
 291     __d1 = (d); __d0 = 0;                                               \
 292     __r1 = (n1); __r0 = (n0);                                           \
 293     affirm (__r1 < __d1);                                               \
 294     __q = 0;                                                            \
 295     for (int __i = W_TYPE_SIZE; __i > 0; __i--)                         \
 296       {                                                                 \
 297         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 298         __q <<= 1;                                                      \
 299         if (ge2 (__r1, __r0, __d1, __d0))                               \
 300           {                                                             \
 301             __q++;                                                      \
 302             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 303           }                                                             \
 304       }                                                                 \
 305     (r) = __r0;                                                         \
 306     (q) = __q;                                                          \
 307   } while (0)
 308 #endif
 309
 310 #if !defined add_ssaaaa
 311 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 312   do {                                                                  \
 313     uintmax_t _add_x;                                                   \
 314     _add_x = (al) + (bl);                                               \
 315     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 316     (sl) = _add_x;                                                      \
 317   } while (0)
 318 #endif
 319
 320 #define rsh2(rh, rl, ah, al, cnt)                                       \
 321   do {                                                                  \
 322     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 323     (rh) = (ah) >> (cnt);                                               \
 324   } while (0)
 325
 326 #define lsh2(rh, rl, ah, al, cnt)                                       \
 327   do {                                                                  \
 328     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 329     (rl) = (al) << (cnt);                                               \
 330   } while (0)
 331
 332 #define ge2(ah, al, bh, bl)                                             \
 333   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 334
 335 #define gt2(ah, al, bh, bl)                                             \
 336   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 337
 338 #ifndef sub_ddmmss
 339 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 340   do {                                                                  \
 341     uintmax_t _cy;                                                      \
 342     _cy = (al) < (bl);                                                  \
 343     (rl) = (al) - (bl);                                                 \
 344     (rh) = (ah) - (bh) - _cy;                                           \
 345   } while (0)
 346 #endif
 347
 348 #ifndef count_leading_zeros
 349 # define count_leading_zeros(count, x) do {                             \
 350     uintmax_t __clz_x = (x);                                            \
 351     int __clz_c;                                                        \
 352     for (__clz_c = 0;                                                   \
 353          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 354          __clz_c += 8)                                                  \
 355       __clz_x <<= 8;                                                    \
 356     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 357       __clz_x <<= 1;                                                    \
 358     (count) = __clz_c;                                                  \
 359   } while (0)
 360 #endif
 361
 362 #ifndef count_trailing_zeros
 363 # define count_trailing_zeros(count, x) do {                            \
 364     uintmax_t __ctz_x = (x);                                            \
 365     int __ctz_c = 0;                                                    \
 366     while ((__ctz_x & 1) == 0)                                          \
 367       {                                                                 \
 368         __ctz_x >>= 1;                                                  \
 369         __ctz_c++;                                                      \
 370       }                                                                 \
 371     (count) = __ctz_c;                                                  \
 372   } while (0)
 373 #endif
 374
 375 /* Requires that a < n and b <= n */
 376 #define submod(r,a,b,n)                                                 \
 377   do {                                                                  \
 378     uintmax_t _t = - (uintmax_t) (a < b);                               \
 379     (r) = ((n) & _t) + (a) - (b);                                       \
 380   } while (0)
 381
 382 #define addmod(r,a,b,n)                                                 \
 383   submod ((r), (a), ((n) - (b)), (n))
 384
 385 /* Modular two-word addition and subtraction.  For performance reasons, the
 386    most significant bit of n1 must be clear.  The destination variables must be
 387    distinct from the mod operand.  */
 388 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 389   do {                                                                  \
 390     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 391     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 392       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 393   } while (0)
 394 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 395   do {                                                                  \
 396     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 397     if ((intmax_t) (r1) < 0)                                            \
 398       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 399   } while (0)
 400
 401 #define HIGHBIT_TO_MASK(x)                                              \
 402   (((intmax_t)-1 >> 1) < 0                                              \
 403    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 404    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 405       ? UINTMAX_MAX : (uintmax_t) 0))
 406
 407 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 408    Requires that d1 != 0.  */
 409 static uintmax_t
 410 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 411 {
 412   int cntd, cnta;
 413
 414   affirm (d1 != 0);
 415
 416   if (a1 == 0)
 417     {
 418       *r1 = 0;
 419       return a0;
 420     }
 421
 422   count_leading_zeros (cntd, d1);
 423   count_leading_zeros (cnta, a1);
 424   int cnt = cntd - cnta;
 425   lsh2 (d1, d0, d1, d0, cnt);
 426   for (int i = 0; i < cnt; i++)
 427     {
 428       if (ge2 (a1, a0, d1, d0))
 429         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 430       rsh2 (d1, d0, d1, d0, 1);
 431     }
 432
 433   *r1 = a1;
 434   return a0;
 435 }
 436
 437 ATTRIBUTE_CONST
 438 static uintmax_t
 439 gcd_odd (uintmax_t a, uintmax_t b)
 440 {
 441   if ((b & 1) == 0)
 442     {
 443       uintmax_t t = b;
 444       b = a;
 445       a = t;
 446     }
 447   if (a == 0)
 448     return b;
 449
 450   /* Take out least significant one bit, to make room for sign */
 451   b >>= 1;
 452
 453   for (;;)
 454     {
 455       uintmax_t t;
 456       uintmax_t bgta;
 457
 458       while ((a & 1) == 0)
 459         a >>= 1;
 460       a >>= 1;
 461
 462       t = a - b;
 463       if (t == 0)
 464         return (a << 1) + 1;
 465
 466       bgta = HIGHBIT_TO_MASK (t);
 467
 468       /* b <-- min (a, b) */
 469       b += (bgta & t);
 470
 471       /* a <-- |a - b| */
 472       a = (t ^ bgta) - bgta;
 473     }
 474 }
 475
 476 static uintmax_t
 477 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 478 {
 479   affirm (b0 & 1);
 480
 481   if ((a0 | a1) == 0)
 482     {
 483       *r1 = b1;
 484       return b0;
 485     }
 486
 487   while ((a0 & 1) == 0)
 488     rsh2 (a1, a0, a1, a0, 1);
 489
 490   for (;;)
 491     {
 492       if ((b1 | a1) == 0)
 493         {
 494           *r1 = 0;
 495           return gcd_odd (b0, a0);
 496         }
 497
 498       if (gt2 (a1, a0, b1, b0))
 499         {
 500           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 501           do
 502             rsh2 (a1, a0, a1, a0, 1);
 503           while ((a0 & 1) == 0);
 504         }
 505       else if (gt2 (b1, b0, a1, a0))
 506         {
 507           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 508           do
 509             rsh2 (b1, b0, b1, b0, 1);
 510           while ((b0 & 1) == 0);
 511         }
 512       else
 513         break;
 514     }
 515
 516   *r1 = a1;
 517   return a0;
 518 }
 519
 520 static void
 521 factor_insert_multiplicity (struct factors *factors,
 522                             uintmax_t prime, int m)
 523 {
 524   int nfactors = factors->nfactors;
 525   uintmax_t *p = factors->p;
 526   unsigned char *e = factors->e;
 527
 528   /* Locate position for insert new or increment e.  */
 529   int i;
 530   for (i = nfactors - 1; i >= 0; i--)
 531     {
 532       if (p[i] <= prime)
 533         break;
 534     }
 535
 536   if (i < 0 || p[i] != prime)
 537     {
 538       for (int j = nfactors - 1; j > i; j--)
 539         {
 540           p[j + 1] = p[j];
 541           e[j + 1] = e[j];
 542         }
 543       p[i + 1] = prime;
 544       e[i + 1] = m;
 545       factors->nfactors = nfactors + 1;
 546     }
 547   else
 548     {
 549       e[i] += m;
 550     }
 551 }
 552
 553 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 554
 555 static void
 556 factor_insert_large (struct factors *factors,
 557                      uintmax_t p1, uintmax_t p0)
 558 {
 559   if (p1 > 0)
 560     {
 561       affirm (factors->plarge[1] == 0);
 562       factors->plarge[0] = p0;
 563       factors->plarge[1] = p1;
 564     }
 565   else
 566     factor_insert (factors, p0);
 567 }
 568
 569 #ifndef mpz_inits
 570
 571 # include <stdarg.h>
 572
 573 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 574 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 575
 576 static void
 577 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 578 {
 579   va_list ap;
 580
 581   va_start (ap, mpz_single_init);
 582
 583   mpz_t *mpz;
 584   while ((mpz = va_arg (ap, mpz_t *)))
 585     mpz_single_init (*mpz);
 586
 587   va_end (ap);
 588 }
 589 #endif
 590
 591 static void mp_factor (mpz_t, struct mp_factors *);
 592
 593 static void
 594 mp_factor_init (struct mp_factors *factors)
 595 {
 596   factors->p = nullptr;
 597   factors->e = nullptr;
 598   factors->nfactors = 0;
 599   factors->nalloc = 0;
 600 }
 601
 602 static void
 603 mp_factor_clear (struct mp_factors *factors)
 604 {
 605   for (idx_t i = 0; i < factors->nfactors; i++)
 606     mpz_clear (factors->p[i]);
 607
 608   free (factors->p);
 609   free (factors->e);
 610 }
 611
 612 static void
 613 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 614 {
 615   idx_t nfactors = factors->nfactors;
 616   mpz_t *p = factors->p;
 617   unsigned long int *e = factors->e;
 618   ptrdiff_t i;
 619
 620   /* Locate position for insert new or increment e.  */
 621   for (i = nfactors - 1; i >= 0; i--)
 622     {
 623       if (mpz_cmp (p[i], prime) <= 0)
 624         break;
 625     }
 626
 627   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 628     {
 629       if (factors->nfactors == factors->nalloc)
 630         {
 631           p = xpalloc (p, &factors->nalloc, 1, -1, sizeof *p);
 632           e = xireallocarray (e, factors->nalloc, sizeof *e);
 633         }
 634
 635       mpz_init (p[nfactors]);
 636       for (ptrdiff_t j = nfactors - 1; j > i; j--)
 637         {
 638           mpz_set (p[j + 1], p[j]);
 639           e[j + 1] = e[j];
 640         }
 641       mpz_set (p[i + 1], prime);
 642       e[i + 1] = 1;
 643
 644       factors->p = p;
 645       factors->e = e;
 646       factors->nfactors = nfactors + 1;
 647     }
 648   else
 649     {
 650       e[i] += 1;
 651     }
 652 }
 653
 654 static void
 655 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 656 {
 657   mpz_t pz;
 658
 659   mpz_init_set_ui (pz, prime);
 660   mp_factor_insert (factors, pz);
 661   mpz_clear (pz);
 662 }
 663
 664
 665 /* Number of bits in an uintmax_t.  */
 666 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 667
 668 /* Verify that uintmax_t does not have holes in its representation.  */
 669 static_assert (UINTMAX_MAX >> (W - 1) == 1);
 670
 671 #define P(a,b,c,d) a,
 672 static const unsigned char primes_diff[] = {
 673 #include "primes.h"
 674 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 675 };
 676 #undef P
 677
 678 #define PRIMES_PTAB_ENTRIES \
 679   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 680
 681 #define P(a,b,c,d) b,
 682 static const unsigned char primes_diff8[] = {
 683 #include "primes.h"
 684 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 685 };
 686 #undef P
 687
 688 struct primes_dtab
 689 {
 690   uintmax_t binv, lim;
 691 };
 692
 693 #define P(a,b,c,d) {c,d},
 694 static const struct primes_dtab primes_dtab[] = {
 695 #include "primes.h"
 696 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 697 };
 698 #undef P
 699
 700 /* Verify that uintmax_t is not wider than
 701    the integers used to generate primes.h.  */
 702 static_assert (W <= WIDE_UINT_BITS);
 703
 704 /* debugging for developers.  Enables devmsg().
 705    This flag is used only in the GMP code.  */
 706 static bool dev_debug = false;
 707
 708 /* Prove primality or run probabilistic tests.  */
 709 static bool flag_prove_primality = PROVE_PRIMALITY;
 710
 711 /* Number of Miller-Rabin tests to run when not proving primality.  */
 712 #define MR_REPS 25
 713
 714 static void
 715 factor_insert_refind (struct factors *factors, uintmax_t p, int i, int off)
 716 {
 717   for (int j = 0; j < off; j++)
 718     p += primes_diff[i + j];
 719   factor_insert (factors, p);
 720 }
 721
 722 /* Trial division with odd primes uses the following trick.
 723
 724    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}.  For simplicity,
 725    consider the case t < B (this is the second loop below).
 726
 727    From our tables we get
 728
 729      binv = p^{-1} (mod B)
 730      lim = floor ((B-1) / p).
 731
 732    First assume that t is a multiple of p, t = q * p.  Then 0 <= q <= lim
 733    (and all quotients in this range occur for some t).
 734
 735    Then t = q * p is true also (mod B), and p is invertible we get
 736
 737      q = t * binv (mod B).
 738
 739    Next, assume that t is *not* divisible by p.  Since multiplication
 740    by binv (mod B) is a one-to-one mapping,
 741
 742      t * binv (mod B) > lim,
 743
 744    because all the smaller values are already taken.
 745
 746    This can be summed up by saying that the function
 747
 748      q(t) = binv * t (mod B)
 749
 750    is a permutation of the range 0 <= t < B, with the curious property
 751    that it maps the multiples of p onto the range 0 <= q <= lim, in
 752    order, and the non-multiples of p onto the range lim < q < B.
 753  */
 754
 755 static uintmax_t
 756 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 757                        struct factors *factors)
 758 {
 759   if (t0 % 2 == 0)
 760     {
 761       int cnt;
 762
 763       if (t0 == 0)
 764         {
 765           count_trailing_zeros (cnt, t1);
 766           t0 = t1 >> cnt;
 767           t1 = 0;
 768           cnt += W_TYPE_SIZE;
 769         }
 770       else
 771         {
 772           count_trailing_zeros (cnt, t0);
 773           rsh2 (t1, t0, t1, t0, cnt);
 774         }
 775
 776       factor_insert_multiplicity (factors, 2, cnt);
 777     }
 778
 779   uintmax_t p = 3;
 780   idx_t i;
 781   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 782     {
 783       for (;;)
 784         {
 785           uintmax_t q1, q0, hi;
 786           MAYBE_UNUSED uintmax_t lo;
 787
 788           q0 = t0 * primes_dtab[i].binv;
 789           umul_ppmm (hi, lo, q0, p);
 790           if (hi > t1)
 791             break;
 792           hi = t1 - hi;
 793           q1 = hi * primes_dtab[i].binv;
 794           if (LIKELY (q1 > primes_dtab[i].lim))
 795             break;
 796           t1 = q1; t0 = q0;
 797           factor_insert (factors, p);
 798         }
 799       p += primes_diff[i + 1];
 800     }
 801   if (t1p)
 802     *t1p = t1;
 803
 804 #define DIVBLOCK(I)                                                     \
 805   do {                                                                  \
 806     for (;;)                                                            \
 807       {                                                                 \
 808         q = t0 * pd[I].binv;                                            \
 809         if (LIKELY (q > pd[I].lim))                                     \
 810           break;                                                        \
 811         t0 = q;                                                         \
 812         factor_insert_refind (factors, p, i + 1, I);                    \
 813       }                                                                 \
 814   } while (0)
 815
 816   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 817     {
 818       uintmax_t q;
 819       const struct primes_dtab *pd = &primes_dtab[i];
 820       DIVBLOCK (0);
 821       DIVBLOCK (1);
 822       DIVBLOCK (2);
 823       DIVBLOCK (3);
 824       DIVBLOCK (4);
 825       DIVBLOCK (5);
 826       DIVBLOCK (6);
 827       DIVBLOCK (7);
 828
 829       p += primes_diff8[i];
 830       if (p * p > t0)
 831         break;
 832     }
 833
 834   return t0;
 835 }
 836
 837 static void
 838 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 839 {
 840   mpz_t q;
 841   mp_bitcnt_t p;
 842
 843   devmsg ("[trial division] ");
 844
 845   mpz_init (q);
 846
 847   p = mpz_scan1 (t, 0);
 848   mpz_fdiv_q_2exp (t, t, p);
 849   while (p)
 850     {
 851       mp_factor_insert_ui (factors, 2);
 852       --p;
 853     }
 854
 855   unsigned long int d = 3;
 856   for (idx_t i = 1; i <= PRIMES_PTAB_ENTRIES;)
 857     {
 858       if (! mpz_divisible_ui_p (t, d))
 859         {
 860           d += primes_diff[i++];
 861           if (mpz_cmp_ui (t, d * d) < 0)
 862             break;
 863         }
 864       else
 865         {
 866           mpz_tdiv_q_ui (t, t, d);
 867           mp_factor_insert_ui (factors, d);
 868         }
 869     }
 870
 871   mpz_clear (q);
 872 }
 873
 874 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 875 static const unsigned char  binvert_table[128] =
 876 {
 877   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 878   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 879   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 880   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 881   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 882   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 883   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 884   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 885   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 886   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 887   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 888   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 889   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 890   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 891   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 892   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 893 };
 894
 895 /* Compute n^(-1) mod B, using a Newton iteration.  */
 896 #define binv(inv,n)                                                     \
 897   do {                                                                  \
 898     uintmax_t  __n = (n);                                               \
 899     uintmax_t  __inv;                                                   \
 900                                                                         \
 901     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 902     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 903     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 904     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 905                                                                         \
 906     if (W_TYPE_SIZE > 64)                                               \
 907       {                                                                 \
 908         int  __invbits = 64;                                            \
 909         do {                                                            \
 910           __inv = 2 * __inv - __inv * __inv * __n;                      \
 911           __invbits *= 2;                                               \
 912         } while (__invbits < W_TYPE_SIZE);                              \
 913       }                                                                 \
 914                                                                         \
 915     (inv) = __inv;                                                      \
 916   } while (0)
 917
 918 /* q = u / d, assuming d|u.  */
 919 #define divexact_21(q1, q0, u1, u0, d)                                  \
 920   do {                                                                  \
 921     uintmax_t _di, _q0;                                                 \
 922     binv (_di, (d));                                                    \
 923     _q0 = (u0) * _di;                                                   \
 924     if ((u1) >= (d))                                                    \
 925       {                                                                 \
 926         uintmax_t _p1;                                                  \
 927         MAYBE_UNUSED intmax_t _p0;                                      \
 928         umul_ppmm (_p1, _p0, _q0, d);                                   \
 929         (q1) = ((u1) - _p1) * _di;                                      \
 930         (q0) = _q0;                                                     \
 931       }                                                                 \
 932     else                                                                \
 933       {                                                                 \
 934         (q0) = _q0;                                                     \
 935         (q1) = 0;                                                       \
 936       }                                                                 \
 937   } while (0)
 938
 939 /* x B (mod n).  */
 940 #define redcify(r_prim, r, n)                                           \
 941   do {                                                                  \
 942     MAYBE_UNUSED uintmax_t _redcify_q;                                  \
 943     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 944   } while (0)
 945
 946 /* x B^2 (mod n).  Requires x > 0, n1 < B/2.  */
 947 #define redcify2(r1, r0, x, n1, n0)                                     \
 948   do {                                                                  \
 949     uintmax_t _r1, _r0, _i;                                             \
 950     if ((x) < (n1))                                                     \
 951       {                                                                 \
 952         _r1 = (x); _r0 = 0;                                             \
 953         _i = W_TYPE_SIZE;                                               \
 954       }                                                                 \
 955     else                                                                \
 956       {                                                                 \
 957         _r1 = 0; _r0 = (x);                                             \
 958         _i = 2 * W_TYPE_SIZE;                                           \
 959       }                                                                 \
 960     while (_i-- > 0)                                                    \
 961       {                                                                 \
 962         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 963         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 964           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 965       }                                                                 \
 966     (r1) = _r1;                                                         \
 967     (r0) = _r0;                                                         \
 968   } while (0)
 969
 970 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 971    Both a and b must be in redc form, the result will be in redc form too.  */
 972 static inline uintmax_t
 973 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 974 {
 975   uintmax_t rh, rl, q, th, xh;
 976   MAYBE_UNUSED uintmax_t tl;
 977
 978   umul_ppmm (rh, rl, a, b);
 979   q = rl * mi;
 980   umul_ppmm (th, tl, q, m);
 981   xh = rh - th;
 982   if (rh < th)
 983     xh += m;
 984
 985   return xh;
 986 }
 987
 988 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 989    Both a and b must be in redc form, the result will be in redc form too.
 990    For performance reasons, the most significant bit of m must be clear.  */
 991 static uintmax_t
 992 mulredc2 (uintmax_t *r1p,
 993           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 994           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 995 {
 996   uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
 997   MAYBE_UNUSED uintmax_t p0;
 998   mi = -mi;
 999   affirm ((a1 >> (W_TYPE_SIZE - 1)) == 0);
1000   affirm ((b1 >> (W_TYPE_SIZE - 1)) == 0);
1001   affirm ((m1 >> (W_TYPE_SIZE - 1)) == 0);
1002
1003   /* First compute a0 * <b1, b0> B^{-1}
1004         +-----+
1005         |a0 b0|
1006      +--+--+--+
1007      |a0 b1|
1008      +--+--+--+
1009         |q0 m0|
1010      +--+--+--+
1011      |q0 m1|
1012     -+--+--+--+
1013      |r1|r0| 0|
1014      +--+--+--+
1015   */
1016   umul_ppmm (t1, t0, a0, b0);
1017   umul_ppmm (r1, r0, a0, b1);
1018   q = mi * t0;
1019   umul_ppmm (p1, p0, q, m0);
1020   umul_ppmm (s1, s0, q, m1);
1021   r0 += (t0 != 0); /* Carry */
1022   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1023   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1024   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1025
1026   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1027         +-----+
1028         |a1 b0|
1029         +--+--+
1030         |r1|r0|
1031      +--+--+--+
1032      |a1 b1|
1033      +--+--+--+
1034         |q1 m0|
1035      +--+--+--+
1036      |q1 m1|
1037     -+--+--+--+
1038      |r1|r0| 0|
1039      +--+--+--+
1040   */
1041   umul_ppmm (t1, t0, a1, b0);
1042   umul_ppmm (s1, s0, a1, b1);
1043   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1044   q = mi * t0;
1045   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1046   umul_ppmm (p1, p0, q, m0);
1047   umul_ppmm (s1, s0, q, m1);
1048   r0 += (t0 != 0); /* Carry */
1049   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1050   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1051   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1052
1053   if (ge2 (r1, r0, m1, m0))
1054     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1055
1056   *r1p = r1;
1057   return r0;
1058 }
1059
1060 ATTRIBUTE_CONST
1061 static uintmax_t
1062 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1063 {
1064   uintmax_t y = one;
1065
1066   if (e & 1)
1067     y = b;
1068
1069   while (e != 0)
1070     {
1071       b = mulredc (b, b, n, ni);
1072       e >>= 1;
1073
1074       if (e & 1)
1075         y = mulredc (y, b, n, ni);
1076     }
1077
1078   return y;
1079 }
1080
1081 static uintmax_t
1082 powm2 (uintmax_t *r1m,
1083        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1084        uintmax_t ni, const uintmax_t *one)
1085 {
1086   uintmax_t r1, r0, b1, b0, n1, n0;
1087   int i;
1088   uintmax_t e;
1089
1090   b0 = bp[0];
1091   b1 = bp[1];
1092   n0 = np[0];
1093   n1 = np[1];
1094
1095   r0 = one[0];
1096   r1 = one[1];
1097
1098   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1099     {
1100       if (e & 1)
1101         {
1102           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1103           r1 = *r1m;
1104         }
1105       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1106       b1 = *r1m;
1107     }
1108   for (e = ep[1]; e > 0; e >>= 1)
1109     {
1110       if (e & 1)
1111         {
1112           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1113           r1 = *r1m;
1114         }
1115       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1116       b1 = *r1m;
1117     }
1118   *r1m = r1;
1119   return r0;
1120 }
1121
1122 ATTRIBUTE_CONST
1123 static bool
1124 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1125              int k, uintmax_t one)
1126 {
1127   uintmax_t y = powm (b, q, n, ni, one);
1128
1129   uintmax_t nm1 = n - one;      /* -1, but in redc representation.  */
1130
1131   if (y == one || y == nm1)
1132     return true;
1133
1134   for (int i = 1; i < k; i++)
1135     {
1136       y = mulredc (y, y, n, ni);
1137
1138       if (y == nm1)
1139         return true;
1140       if (y == one)
1141         return false;
1142     }
1143   return false;
1144 }
1145
1146 ATTRIBUTE_PURE static bool
1147 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1148               const uintmax_t *qp, int k, const uintmax_t *one)
1149 {
1150   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1151
1152   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1153   y1 = r1m;
1154
1155   if (y0 == one[0] && y1 == one[1])
1156     return true;
1157
1158   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1159
1160   if (y0 == nm1_0 && y1 == nm1_1)
1161     return true;
1162
1163   for (int i = 1; i < k; i++)
1164     {
1165       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1166       y1 = r1m;
1167
1168       if (y0 == nm1_0 && y1 == nm1_1)
1169         return true;
1170       if (y0 == one[0] && y1 == one[1])
1171         return false;
1172     }
1173   return false;
1174 }
1175
1176 static bool
1177 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1178                 mpz_srcptr q, mp_bitcnt_t k)
1179 {
1180   mpz_powm (y, x, q, n);
1181
1182   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1183     return true;
1184
1185   for (mp_bitcnt_t i = 1; i < k; i++)
1186     {
1187       mpz_powm_ui (y, y, 2, n);
1188       if (mpz_cmp (y, nm1) == 0)
1189         return true;
1190       if (mpz_cmp_ui (y, 1) == 0)
1191         return false;
1192     }
1193   return false;
1194 }
1195
1196 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1197    have been observed.  The average seem to be about 2.  */
1198 static bool ATTRIBUTE_PURE
1199 prime_p (uintmax_t n)
1200 {
1201   mp_bitcnt_t k;
1202   bool is_prime;
1203   uintmax_t a_prim, one, ni;
1204   struct factors factors;
1205
1206   if (n <= 1)
1207     return false;
1208
1209   /* We have already cast out small primes.  */
1210   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1211     return true;
1212
1213   /* Precomputation for Miller-Rabin.  */
1214   uintmax_t q = n - 1;
1215   for (k = 0; (q & 1) == 0; k++)
1216     q >>= 1;
1217
1218   uintmax_t a = 2;
1219   binv (ni, n);                 /* ni <- 1/n mod B */
1220   redcify (one, 1, n);
1221   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1222
1223   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1224   if (!millerrabin (n, ni, a_prim, q, k, one))
1225     return false;
1226
1227   if (flag_prove_primality)
1228     {
1229       /* Factor n-1 for Lucas.  */
1230       factor (0, n - 1, &factors);
1231     }
1232
1233   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234      number composite.  */
1235   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1236     {
1237       if (flag_prove_primality)
1238         {
1239           is_prime = true;
1240           for (int i = 0; i < factors.nfactors && is_prime; i++)
1241             {
1242               is_prime
1243                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1244             }
1245         }
1246       else
1247         {
1248           /* After enough Miller-Rabin runs, be content.  */
1249           is_prime = (r == MR_REPS - 1);
1250         }
1251
1252       if (is_prime)
1253         return true;
1254
1255       a += primes_diff[r];      /* Establish new base.  */
1256
1257       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1258          on most processors, since it avoids udiv_qrnnd.  If we go down the
1259          udiv_qrnnd_preinv path, this code should be replaced.  */
1260       {
1261         uintmax_t s1, s0;
1262         umul_ppmm (s1, s0, one, a);
1263         if (LIKELY (s1 == 0))
1264           a_prim = s0 % n;
1265         else
1266           {
1267             MAYBE_UNUSED uintmax_t dummy;
1268             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1269           }
1270       }
1271
1272       if (!millerrabin (n, ni, a_prim, q, k, one))
1273         return false;
1274     }
1275
1276   affirm (!"Lucas prime test failure.  This should not happen");
1277 }
1278
1279 static bool ATTRIBUTE_PURE
1280 prime2_p (uintmax_t n1, uintmax_t n0)
1281 {
1282   uintmax_t q[2], nm1[2];
1283   uintmax_t a_prim[2];
1284   uintmax_t one[2];
1285   uintmax_t na[2];
1286   uintmax_t ni;
1287   int k;
1288   struct factors factors;
1289
1290   if (n1 == 0)
1291     return prime_p (n0);
1292
1293   nm1[1] = n1 - (n0 == 0);
1294   nm1[0] = n0 - 1;
1295   if (nm1[0] == 0)
1296     {
1297       count_trailing_zeros (k, nm1[1]);
1298
1299       q[0] = nm1[1] >> k;
1300       q[1] = 0;
1301       k += W_TYPE_SIZE;
1302     }
1303   else
1304     {
1305       count_trailing_zeros (k, nm1[0]);
1306       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1307     }
1308
1309   uintmax_t a = 2;
1310   binv (ni, n0);
1311   redcify2 (one[1], one[0], 1, n1, n0);
1312   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1313
1314   /* FIXME: Use scalars or pointers in arguments?  Some consistency needed.  */
1315   na[0] = n0;
1316   na[1] = n1;
1317
1318   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1319     return false;
1320
1321   if (flag_prove_primality)
1322     {
1323       /* Factor n-1 for Lucas.  */
1324       factor (nm1[1], nm1[0], &factors);
1325     }
1326
1327   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1328      number composite.  */
1329   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1330     {
1331       bool is_prime;
1332       uintmax_t e[2], y[2];
1333
1334       if (flag_prove_primality)
1335         {
1336           is_prime = true;
1337           if (factors.plarge[1])
1338             {
1339               uintmax_t pi;
1340               binv (pi, factors.plarge[0]);
1341               e[0] = pi * nm1[0];
1342               e[1] = 0;
1343               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1344               is_prime = (y[0] != one[0] || y[1] != one[1]);
1345             }
1346           for (int i = 0; i < factors.nfactors && is_prime; i++)
1347             {
1348               /* FIXME: We always have the factor 2.  Do we really need to
1349                  handle it here?  We have done the same powering as part
1350                  of millerrabin.  */
1351               if (factors.p[i] == 2)
1352                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1353               else
1354                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1355               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1356               is_prime = (y[0] != one[0] || y[1] != one[1]);
1357             }
1358         }
1359       else
1360         {
1361           /* After enough Miller-Rabin runs, be content.  */
1362           is_prime = (r == MR_REPS - 1);
1363         }
1364
1365       if (is_prime)
1366         return true;
1367
1368       a += primes_diff[r];      /* Establish new base.  */
1369       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1370
1371       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1372         return false;
1373     }
1374
1375   affirm (!"Lucas prime test failure.  This should not happen");
1376 }
1377
1378 static bool
1379 mp_prime_p (mpz_t n)
1380 {
1381   bool is_prime;
1382   mpz_t q, a, nm1, tmp;
1383   struct mp_factors factors;
1384
1385   if (mpz_cmp_ui (n, 1) <= 0)
1386     return false;
1387
1388   /* We have already cast out small primes.  */
1389   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1390     return true;
1391
1392   mpz_inits (q, a, nm1, tmp, nullptr);
1393
1394   /* Precomputation for Miller-Rabin.  */
1395   mpz_sub_ui (nm1, n, 1);
1396
1397   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1398   mp_bitcnt_t k = mpz_scan1 (nm1, 0);
1399   mpz_tdiv_q_2exp (q, nm1, k);
1400
1401   mpz_set_ui (a, 2);
1402
1403   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1404   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1405     {
1406       is_prime = false;
1407       goto ret2;
1408     }
1409
1410   if (flag_prove_primality)
1411     {
1412       /* Factor n-1 for Lucas.  */
1413       mpz_set (tmp, nm1);
1414       mp_factor (tmp, &factors);
1415     }
1416
1417   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1418      number composite.  */
1419   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1420     {
1421       if (flag_prove_primality)
1422         {
1423           is_prime = true;
1424           for (idx_t i = 0; i < factors.nfactors && is_prime; i++)
1425             {
1426               mpz_divexact (tmp, nm1, factors.p[i]);
1427               mpz_powm (tmp, a, tmp, n);
1428               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1429             }
1430         }
1431       else
1432         {
1433           /* After enough Miller-Rabin runs, be content.  */
1434           is_prime = (r == MR_REPS - 1);
1435         }
1436
1437       if (is_prime)
1438         goto ret1;
1439
1440       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1441
1442       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1443         {
1444           is_prime = false;
1445           goto ret1;
1446         }
1447     }
1448
1449   affirm (!"Lucas prime test failure.  This should not happen");
1450
1451  ret1:
1452   if (flag_prove_primality)
1453     mp_factor_clear (&factors);
1454  ret2:
1455   mpz_clears (q, a, nm1, tmp, nullptr);
1456
1457   return is_prime;
1458 }
1459
1460 static void
1461 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1462                           struct factors *factors)
1463 {
1464   uintmax_t x, z, y, P, t, ni, g;
1465
1466   unsigned long int k = 1;
1467   unsigned long int l = 1;
1468
1469   redcify (P, 1, n);
1470   addmod (x, P, P, n);          /* i.e., redcify(2) */
1471   y = z = x;
1472
1473   while (n != 1)
1474     {
1475       affirm (a < n);
1476
1477       binv (ni, n);             /* FIXME: when could we use old 'ni' value?  */
1478
1479       for (;;)
1480         {
1481           do
1482             {
1483               x = mulredc (x, x, n, ni);
1484               addmod (x, x, a, n);
1485
1486               submod (t, z, x, n);
1487               P = mulredc (P, t, n, ni);
1488
1489               if (k % 32 == 1)
1490                 {
1491                   if (gcd_odd (P, n) != 1)
1492                     goto factor_found;
1493                   y = x;
1494                 }
1495             }
1496           while (--k != 0);
1497
1498           z = x;
1499           k = l;
1500           l = 2 * l;
1501           for (unsigned long int i = 0; i < k; i++)
1502             {
1503               x = mulredc (x, x, n, ni);
1504               addmod (x, x, a, n);
1505             }
1506           y = x;
1507         }
1508
1509     factor_found:
1510       do
1511         {
1512           y = mulredc (y, y, n, ni);
1513           addmod (y, y, a, n);
1514
1515           submod (t, z, y, n);
1516           g = gcd_odd (t, n);
1517         }
1518       while (g == 1);
1519
1520       if (n == g)
1521         {
1522           /* Found n itself as factor.  Restart with different params.  */
1523           factor_using_pollard_rho (n, a + 1, factors);
1524           return;
1525         }
1526
1527       n = n / g;
1528
1529       if (!prime_p (g))
1530         factor_using_pollard_rho (g, a + 1, factors);
1531       else
1532         factor_insert (factors, g);
1533
1534       if (prime_p (n))
1535         {
1536           factor_insert (factors, n);
1537           break;
1538         }
1539
1540       x = x % n;
1541       z = z % n;
1542       y = y % n;
1543     }
1544 }
1545
1546 static void
1547 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1548                            struct factors *factors)
1549 {
1550   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1551
1552   unsigned long int k = 1;
1553   unsigned long int l = 1;
1554
1555   redcify2 (P1, P0, 1, n1, n0);
1556   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1557   y1 = z1 = x1;
1558   y0 = z0 = x0;
1559
1560   while (n1 != 0 || n0 != 1)
1561     {
1562       binv (ni, n0);
1563
1564       for (;;)
1565         {
1566           do
1567             {
1568               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1569               x1 = r1m;
1570               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1571
1572               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1573               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1574               P1 = r1m;
1575
1576               if (k % 32 == 1)
1577                 {
1578                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1579                   if (g1 != 0 || g0 != 1)
1580                     goto factor_found;
1581                   y1 = x1; y0 = x0;
1582                 }
1583             }
1584           while (--k != 0);
1585
1586           z1 = x1; z0 = x0;
1587           k = l;
1588           l = 2 * l;
1589           for (unsigned long int i = 0; i < k; i++)
1590             {
1591               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1592               x1 = r1m;
1593               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1594             }
1595           y1 = x1; y0 = x0;
1596         }
1597
1598     factor_found:
1599       do
1600         {
1601           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1602           y1 = r1m;
1603           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1604
1605           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1606           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1607         }
1608       while (g1 == 0 && g0 == 1);
1609
1610       if (g1 == 0)
1611         {
1612           /* The found factor is one word, and > 1.  */
1613           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1614
1615           if (!prime_p (g0))
1616             factor_using_pollard_rho (g0, a + 1, factors);
1617           else
1618             factor_insert (factors, g0);
1619         }
1620       else
1621         {
1622           /* The found factor is two words.  This is highly unlikely, thus hard
1623              to trigger.  Please be careful before you change this code!  */
1624           uintmax_t ginv;
1625
1626           if (n1 == g1 && n0 == g0)
1627             {
1628               /* Found n itself as factor.  Restart with different params.  */
1629               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1630               return;
1631             }
1632
1633           /* Compute n = n / g.  Since the result will fit one word,
1634              we can compute the quotient modulo B, ignoring the high
1635              divisor word.  */
1636           binv (ginv, g0);
1637           n0 = ginv * n0;
1638           n1 = 0;
1639
1640           if (!prime2_p (g1, g0))
1641             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1642           else
1643             factor_insert_large (factors, g1, g0);
1644         }
1645
1646       if (n1 == 0)
1647         {
1648           if (prime_p (n0))
1649             {
1650               factor_insert (factors, n0);
1651               break;
1652             }
1653
1654           factor_using_pollard_rho (n0, a, factors);
1655           return;
1656         }
1657
1658       if (prime2_p (n1, n0))
1659         {
1660           factor_insert_large (factors, n1, n0);
1661           break;
1662         }
1663
1664       x0 = mod2 (&x1, x1, x0, n1, n0);
1665       z0 = mod2 (&z1, z1, z0, n1, n0);
1666       y0 = mod2 (&y1, y1, y0, n1, n0);
1667     }
1668 }
1669
1670 static void
1671 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1672                              struct mp_factors *factors)
1673 {
1674   mpz_t x, z, y, P;
1675   mpz_t t, t2;
1676
1677   devmsg ("[pollard-rho (%lu)] ", a);
1678
1679   mpz_inits (t, t2, nullptr);
1680   mpz_init_set_si (y, 2);
1681   mpz_init_set_si (x, 2);
1682   mpz_init_set_si (z, 2);
1683   mpz_init_set_ui (P, 1);
1684
1685   unsigned long long int k = 1;
1686   unsigned long long int l = 1;
1687
1688   while (mpz_cmp_ui (n, 1) != 0)
1689     {
1690       for (;;)
1691         {
1692           do
1693             {
1694               mpz_mul (t, x, x);
1695               mpz_mod (x, t, n);
1696               mpz_add_ui (x, x, a);
1697
1698               mpz_sub (t, z, x);
1699               mpz_mul (t2, P, t);
1700               mpz_mod (P, t2, n);
1701
1702               if (k % 32 == 1)
1703                 {
1704                   mpz_gcd (t, P, n);
1705                   if (mpz_cmp_ui (t, 1) != 0)
1706                     goto factor_found;
1707                   mpz_set (y, x);
1708                 }
1709             }
1710           while (--k != 0);
1711
1712           mpz_set (z, x);
1713           k = l;
1714           l = 2 * l;
1715           for (unsigned long long int i = 0; i < k; i++)
1716             {
1717               mpz_mul (t, x, x);
1718               mpz_mod (x, t, n);
1719               mpz_add_ui (x, x, a);
1720             }
1721           mpz_set (y, x);
1722         }
1723
1724     factor_found:
1725       do
1726         {
1727           mpz_mul (t, y, y);
1728           mpz_mod (y, t, n);
1729           mpz_add_ui (y, y, a);
1730
1731           mpz_sub (t, z, y);
1732           mpz_gcd (t, t, n);
1733         }
1734       while (mpz_cmp_ui (t, 1) == 0);
1735
1736       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1737
1738       if (!mp_prime_p (t))
1739         {
1740           devmsg ("[composite factor--restarting pollard-rho] ");
1741           mp_factor_using_pollard_rho (t, a + 1, factors);
1742         }
1743       else
1744         {
1745           mp_factor_insert (factors, t);
1746         }
1747
1748       if (mp_prime_p (n))
1749         {
1750           mp_factor_insert (factors, n);
1751           break;
1752         }
1753
1754       mpz_mod (x, x, n);
1755       mpz_mod (z, z, n);
1756       mpz_mod (y, y, n);
1757     }
1758
1759   mpz_clears (P, t2, t, z, x, y, nullptr);
1760 }
1761
1762 #if USE_SQUFOF
1763 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1764    algorithm is replaced, consider also returning the remainder.  */
1765 ATTRIBUTE_CONST
1766 static uintmax_t
1767 isqrt (uintmax_t n)
1768 {
1769   uintmax_t x;
1770   int c;
1771   if (n == 0)
1772     return 0;
1773
1774   count_leading_zeros (c, n);
1775
1776   /* Make x > sqrt(n).  This will be invariant through the loop.  */
1777   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) >> 1);
1778
1779   for (;;)
1780     {
1781       uintmax_t y = (x + n / x) / 2;
1782       if (y >= x)
1783         return x;
1784
1785       x = y;
1786     }
1787 }
1788
1789 ATTRIBUTE_CONST
1790 static uintmax_t
1791 isqrt2 (uintmax_t nh, uintmax_t nl)
1792 {
1793   int shift;
1794   uintmax_t x;
1795
1796   /* Ensures the remainder fits in an uintmax_t.  */
1797   affirm (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1798
1799   if (nh == 0)
1800     return isqrt (nl);
1801
1802   count_leading_zeros (shift, nh);
1803   shift &= ~1;
1804
1805   /* Make x > sqrt (n).  */
1806   x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1807   x <<= (W_TYPE_SIZE - shift) >> 1;
1808
1809   /* Do we need more than one iteration?  */
1810   for (;;)
1811     {
1812       MAYBE_UNUSED uintmax_t r;
1813       uintmax_t q, y;
1814       udiv_qrnnd (q, r, nh, nl, x);
1815       y = (x + q) / 2;
1816
1817       if (y >= x)
1818         {
1819           uintmax_t hi, lo;
1820           umul_ppmm (hi, lo, x + 1, x + 1);
1821           affirm (gt2 (hi, lo, nh, nl));
1822
1823           umul_ppmm (hi, lo, x, x);
1824           affirm (ge2 (nh, nl, hi, lo));
1825           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1826           affirm (hi == 0);
1827
1828           return x;
1829         }
1830
1831       x = y;
1832     }
1833 }
1834
1835 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N.  */
1836 # define MAGIC64 0x0202021202030213ULL
1837 # define MAGIC63 0x0402483012450293ULL
1838 # define MAGIC65 0x218a019866014613ULL
1839 # define MAGIC11 0x23b
1840
1841 /* Return the square root if the input is a square, otherwise 0.  */
1842 ATTRIBUTE_CONST
1843 static uintmax_t
1844 is_square (uintmax_t x)
1845 {
1846   /* Uses the tests suggested by Cohen.  Excludes 99% of the non-squares before
1847      computing the square root.  */
1848   if (((MAGIC64 >> (x & 63)) & 1)
1849       && ((MAGIC63 >> (x % 63)) & 1)
1850       /* Both 0 and 64 are squares mod (65).  */
1851       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1852       && ((MAGIC11 >> (x % 11) & 1)))
1853     {
1854       uintmax_t r = isqrt (x);
1855       if (r * r == x)
1856         return r;
1857     }
1858   return 0;
1859 }
1860
1861 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1862 static short const invtab[0x81] =
1863   {
1864     0x200,
1865     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1866     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1867     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1868     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1869     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1870     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1871     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1872     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1873     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1874     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1875     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1876     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1877     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1878     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1879     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1880     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1881   };
1882
1883 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1884    that q < 0x40; here it instead uses a table of (Euclidean) inverses.  */
1885 # define div_smallq(q, r, u, d)                                          \
1886   do {                                                                  \
1887     if ((u) / 0x40 < (d))                                               \
1888       {                                                                 \
1889         int _cnt;                                                       \
1890         uintmax_t _dinv, _mask, _q, _r;                                 \
1891         count_leading_zeros (_cnt, (d));                                \
1892         _r = (u);                                                       \
1893         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1894           {                                                             \
1895             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1896             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1897           }                                                             \
1898         else                                                            \
1899           {                                                             \
1900             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1901             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1902           }                                                             \
1903         _r -= _q * (d);                                                 \
1904                                                                         \
1905         _mask = -(uintmax_t) (_r >= (d));                               \
1906         (r) = _r - (_mask & (d));                                       \
1907         (q) = _q - _mask;                                               \
1908         affirm ((q) * (d) + (r) == u);                                  \
1909       }                                                                 \
1910     else                                                                \
1911       {                                                                 \
1912         uintmax_t _q = (u) / (d);                                       \
1913         (r) = (u) - _q * (d);                                           \
1914         (q) = _q;                                                       \
1915       }                                                                 \
1916   } while (0)
1917
1918 /* Notes: Example N = 22117019.  After first phase we find Q1 = 6314, Q
1919    = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1920    with 3025 = 55^2.
1921
1922    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1923    representing G_0 = (-55, 2 * 4652, 8653).
1924
1925    In the notation of the paper:
1926
1927    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1928
1929    Put
1930
1931      t_0 = floor([q_0 + R_0] / S0) = 1
1932      R_1 = t_0 * S_0 - R_0 = 4001
1933      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1934 */
1935
1936 /* Multipliers, in order of efficiency:
1937    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1938    0.7317  3*5*7    =  105 = 1
1939    0.7820  3*5*11   =  165 = 1
1940    0.7872  3*5      =   15 = 3
1941    0.8101  3*7*11   =  231 = 3
1942    0.8155  3*7      =   21 = 1
1943    0.8284  5*7*11   =  385 = 1
1944    0.8339  5*7      =   35 = 3
1945    0.8716  3*11     =   33 = 1
1946    0.8774  3        =    3 = 3
1947    0.8913  5*11     =   55 = 3
1948    0.8972  5        =    5 = 1
1949    0.9233  7*11     =   77 = 1
1950    0.9295  7        =    7 = 3
1951    0.9934  11       =   11 = 3
1952 */
1953 # define QUEUE_SIZE 50
1954 #endif
1955
1956 #if STAT_SQUFOF
1957 # define Q_FREQ_SIZE 50
1958 /* Element 0 keeps the total */
1959 static int q_freq[Q_FREQ_SIZE + 1];
1960 #endif
1961
1962 #if USE_SQUFOF
1963 /* Return true on success.  Expected to fail only for numbers
1964    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit.  */
1965 static bool
1966 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1967 {
1968   /* Uses algorithm and notation from
1969
1970      SQUARE FORM FACTORIZATION
1971      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1972
1973      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1974    */
1975
1976   static short const multipliers_1[] =
1977     { /* = 1 (mod 4) */
1978       105, 165, 21, 385, 33, 5, 77, 1, 0
1979     };
1980   static short const multipliers_3[] =
1981     { /* = 3 (mod 4) */
1982       1155, 15, 231, 35, 3, 55, 7, 11, 0
1983     };
1984
1985   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1986
1987   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1988     return false;
1989
1990   uintmax_t sqrt_n = isqrt2 (n1, n0);
1991
1992   if (n0 == sqrt_n * sqrt_n)
1993     {
1994       uintmax_t p1, p0;
1995
1996       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1997       affirm (p0 == n0);
1998
1999       if (n1 == p1)
2000         {
2001           if (prime_p (sqrt_n))
2002             factor_insert_multiplicity (factors, sqrt_n, 2);
2003           else
2004             {
2005               struct factors f;
2006
2007               f.nfactors = 0;
2008               if (!factor_using_squfof (0, sqrt_n, &f))
2009                 {
2010                   /* Try pollard rho instead */
2011                   factor_using_pollard_rho (sqrt_n, 1, &f);
2012                 }
2013               /* Duplicate the new factors */
2014               for (unsigned int i = 0; i < f.nfactors; i++)
2015                 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
2016             }
2017           return true;
2018         }
2019     }
2020
2021   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2022   for (short const *m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2023        *m; m++)
2024     {
2025       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2026       unsigned int i;
2027       unsigned int mu = *m;
2028       int qpos = 0;
2029
2030       affirm (mu * n0 % 4 == 3);
2031
2032       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2033          get \Delta = 4 mu * n, and the paper's \mu is 2 mu.  As far as
2034          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2035          mu^3 < n.
2036
2037          However, this seems insufficient: With n = 37243139 and mu =
2038          105, we get a trivial factor, from the square 38809 = 197^2,
2039          without any corresponding Q earlier in the iteration.
2040
2041          Requiring 64 mu^3 < n seems sufficient.  */
2042       if (n1 == 0)
2043         {
2044           if ((uintmax_t) mu * mu * mu >= n0 / 64)
2045             continue;
2046         }
2047       else
2048         {
2049           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2050             continue;
2051         }
2052       umul_ppmm (Dh, Dl, n0, mu);
2053       Dh += n1 * mu;
2054
2055       affirm (Dl % 4 != 1);
2056       affirm (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2057
2058       S = isqrt2 (Dh, Dl);
2059
2060       Q1 = 1;
2061       P = S;
2062
2063       /* Square root remainder fits in one word, so ignore high part.  */
2064       Q = Dl - P * P;
2065       /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))?  */
2066       L = isqrt (2 * S);
2067       B = 2 * L;
2068       L1 = mu * 2 * L;
2069
2070       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2071          4 D.  */
2072
2073       for (i = 0; i <= B; i++)
2074         {
2075           uintmax_t q, P1, t, rem;
2076
2077           div_smallq (q, rem, S + P, Q);
2078           P1 = S - rem; /* P1 = q*Q - P */
2079
2080           affirm (q > 0 && Q > 0);
2081
2082 # if STAT_SQUFOF
2083           q_freq[0]++;
2084           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2085 # endif
2086
2087           if (Q <= L1)
2088             {
2089               uintmax_t g = Q;
2090
2091               if ((Q & 1) == 0)
2092                 g /= 2;
2093
2094               g /= gcd_odd (g, mu);
2095
2096               if (g <= L)
2097                 {
2098                   if (qpos >= QUEUE_SIZE)
2099                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2100                   queue[qpos].Q = g;
2101                   queue[qpos].P = P % g;
2102                   qpos++;
2103                 }
2104             }
2105
2106           /* I think the difference can be either sign, but mod
2107              2^W_TYPE_SIZE arithmetic should be fine.  */
2108           t = Q1 + q * (P - P1);
2109           Q1 = Q;
2110           Q = t;
2111           P = P1;
2112
2113           if ((i & 1) == 0)
2114             {
2115               uintmax_t r = is_square (Q);
2116               if (r)
2117                 {
2118                   for (int j = 0; j < qpos; j++)
2119                     {
2120                       if (queue[j].Q == r)
2121                         {
2122                           if (r == 1)
2123                             /* Traversed entire cycle.  */
2124                             goto next_multiplier;
2125
2126                           /* Need the absolute value for divisibility test.  */
2127                           if (P >= queue[j].P)
2128                             t = P - queue[j].P;
2129                           else
2130                             t = queue[j].P - P;
2131                           if (t % r == 0)
2132                             {
2133                               /* Delete entries up to and including entry
2134                                  j, which matched.  */
2135                               memmove (queue, queue + j + 1,
2136                                        (qpos - j - 1) * sizeof (queue[0]));
2137                               qpos -= (j + 1);
2138                             }
2139                           goto next_i;
2140                         }
2141                     }
2142
2143                   /* We have found a square form, which should give a
2144                      factor.  */
2145                   Q1 = r;
2146                   affirm (S >= P); /* What signs are possible?  */
2147                   P += r * ((S - P) / r);
2148
2149                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2150                      for the case D = 2N.  */
2151                   /* Compute Q = (D - P*P) / Q1, but we need double
2152                      precision.  */
2153                   uintmax_t hi, lo;
2154                   umul_ppmm (hi, lo, P, P);
2155                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2156                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2157                   affirm (rem == 0);
2158
2159                   for (;;)
2160                     {
2161                       /* Note: There appears to by a typo in the paper,
2162                          Step 4a in the algorithm description says q <--
2163                          floor([S+P]/\hat Q), but looking at the equations
2164                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2165                          (In this code, \hat Q is Q1).  */
2166                       div_smallq (q, rem, S + P, Q);
2167                       P1 = S - rem;     /* P1 = q*Q - P */
2168
2169 # if STAT_SQUFOF
2170                       q_freq[0]++;
2171                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2172 # endif
2173                       if (P == P1)
2174                         break;
2175                       t = Q1 + q * (P - P1);
2176                       Q1 = Q;
2177                       Q = t;
2178                       P = P1;
2179                     }
2180
2181                   if ((Q & 1) == 0)
2182                     Q /= 2;
2183                   Q /= gcd_odd (Q, mu);
2184
2185                   affirm (Q > 1 && (n1 || Q < n0));
2186
2187                   if (prime_p (Q))
2188                     factor_insert (factors, Q);
2189                   else if (!factor_using_squfof (0, Q, factors))
2190                     factor_using_pollard_rho (Q, 2, factors);
2191
2192                   divexact_21 (n1, n0, n1, n0, Q);
2193
2194                   if (prime2_p (n1, n0))
2195                     factor_insert_large (factors, n1, n0);
2196                   else
2197                     {
2198                       if (!factor_using_squfof (n1, n0, factors))
2199                         {
2200                           if (n1 == 0)
2201                             factor_using_pollard_rho (n0, 1, factors);
2202                           else
2203                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2204                         }
2205                     }
2206
2207                   return true;
2208                 }
2209             }
2210         next_i:;
2211         }
2212     next_multiplier:;
2213     }
2214   return false;
2215 }
2216 #endif
2217
2218 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2219    results in FACTORS.  */
2220 static void
2221 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2222 {
2223   factors->nfactors = 0;
2224   factors->plarge[1] = 0;
2225
2226   if (t1 == 0 && t0 < 2)
2227     return;
2228
2229   t0 = factor_using_division (&t1, t1, t0, factors);
2230
2231   if (t1 == 0 && t0 < 2)
2232     return;
2233
2234   if (prime2_p (t1, t0))
2235     factor_insert_large (factors, t1, t0);
2236   else
2237     {
2238 #if USE_SQUFOF
2239       if (factor_using_squfof (t1, t0, factors))
2240         return;
2241 #endif
2242
2243       if (t1 == 0)
2244         factor_using_pollard_rho (t0, 1, factors);
2245       else
2246         factor_using_pollard_rho2 (t1, t0, 1, factors);
2247     }
2248 }
2249
2250 /* Use Pollard-rho to compute the prime factors of
2251    arbitrary-precision T, and put the results in FACTORS.  */
2252 static void
2253 mp_factor (mpz_t t, struct mp_factors *factors)
2254 {
2255   mp_factor_init (factors);
2256
2257   if (mpz_sgn (t) != 0)
2258     {
2259       mp_factor_using_division (t, factors);
2260
2261       if (mpz_cmp_ui (t, 1) != 0)
2262         {
2263           devmsg ("[is number prime?] ");
2264           if (mp_prime_p (t))
2265             mp_factor_insert (factors, t);
2266           else
2267             mp_factor_using_pollard_rho (t, 1, factors);
2268         }
2269     }
2270 }
2271
2272 static strtol_error
2273 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2274 {
2275   int lo_carry;
2276   uintmax_t hi = 0, lo = 0;
2277
2278   strtol_error err = LONGINT_INVALID;
2279
2280   /* Initial scan for invalid digits.  */
2281   char const *p = s;
2282   for (;;)
2283     {
2284       unsigned char c = *p++;
2285       if (c == 0)
2286         break;
2287
2288       if (UNLIKELY (!ISDIGIT (c)))
2289         {
2290           err = LONGINT_INVALID;
2291           break;
2292         }
2293
2294       err = LONGINT_OK;           /* we've seen at least one valid digit */
2295     }
2296
2297   while (err == LONGINT_OK)
2298     {
2299       unsigned char c = *s++;
2300       if (c == 0)
2301         break;
2302
2303       c -= '0';
2304
2305       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2306         {
2307           err = LONGINT_OVERFLOW;
2308           break;
2309         }
2310       hi = 10 * hi;
2311
2312       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2313       lo_carry += 10 * lo < 2 * lo;
2314
2315       lo = 10 * lo;
2316       lo += c;
2317
2318       lo_carry += lo < c;
2319       hi += lo_carry;
2320       if (UNLIKELY (hi < lo_carry))
2321         {
2322           err = LONGINT_OVERFLOW;
2323           break;
2324         }
2325     }
2326
2327   *hip = hi;
2328   *lop = lo;
2329
2330   return err;
2331 }
2332
2333 /* Structure and routines for buffering and outputting full lines,
2334    to support parallel operation efficiently.  */
2335 static struct lbuf_
2336 {
2337   char *buf;
2338   char *end;
2339 } lbuf;
2340
2341 /* 512 is chosen to give good performance,
2342    and also is the max guaranteed size that
2343    consumers can read atomically through pipes.
2344    Also it's big enough to cater for max line length
2345    even with 128 bit uintmax_t.  */
2346 #define FACTOR_PIPE_BUF 512
2347
2348 static void
2349 lbuf_alloc (void)
2350 {
2351   if (lbuf.buf)
2352     return;
2353
2354   /* Double to ensure enough space for
2355      previous numbers + next number.  */
2356   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2357   lbuf.end = lbuf.buf;
2358 }
2359
2360 /* Write complete LBUF to standard output.  */
2361 static void
2362 lbuf_flush (void)
2363 {
2364   size_t size = lbuf.end - lbuf.buf;
2365   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2366     write_error ();
2367   lbuf.end = lbuf.buf;
2368 }
2369
2370 /* Add a character C to LBUF and if it's a newline
2371    and enough bytes are already buffered,
2372    then write atomically to standard output.  */
2373 static void
2374 lbuf_putc (char c)
2375 {
2376   *lbuf.end++ = c;
2377
2378   if (c == '\n')
2379     {
2380       size_t buffered = lbuf.end - lbuf.buf;
2381
2382       /* Provide immediate output for interactive use.  */
2383       static int line_buffered = -1;
2384       if (line_buffered == -1)
2385         line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2386       if (line_buffered)
2387         lbuf_flush ();
2388       else if (buffered >= FACTOR_PIPE_BUF)
2389         {
2390           /* Write output in <= PIPE_BUF chunks
2391              so consumers can read atomically.  */
2392           char const *tend = lbuf.end;
2393
2394           /* Since a umaxint_t's factors must fit in 512
2395              we're guaranteed to find a newline here.  */
2396           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2397           while (*--tlend != '\n');
2398           tlend++;
2399
2400           lbuf.end = tlend;
2401           lbuf_flush ();
2402
2403           /* Buffer the remainder.  */
2404           memcpy (lbuf.buf, tlend, tend - tlend);
2405           lbuf.end = lbuf.buf + (tend - tlend);
2406         }
2407     }
2408 }
2409
2410 /* Buffer an int to the internal LBUF.  */
2411 static void
2412 lbuf_putint (uintmax_t i, size_t min_width)
2413 {
2414   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2415   char const *umaxstr = umaxtostr (i, buf);
2416   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2417   size_t z = width;
2418
2419   for (; z < min_width; z++)
2420     *lbuf.end++ = '0';
2421
2422   memcpy (lbuf.end, umaxstr, width);
2423   lbuf.end += width;
2424 }
2425
2426 static void
2427 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2428 {
2429   uintmax_t q, r;
2430
2431   if (t1 == 0)
2432     lbuf_putint (t0, 0);
2433   else
2434     {
2435       /* Use very plain code here since it seems hard to write fast code
2436          without assuming a specific word size.  */
2437       q = t1 / 1000000000;
2438       r = t1 % 1000000000;
2439       udiv_qrnnd (t0, r, r, t0, 1000000000);
2440       print_uintmaxes (q, t0);
2441       lbuf_putint (r, 9);
2442     }
2443 }
2444
2445 /* Single-precision factoring */
2446 static void
2447 print_factors_single (uintmax_t t1, uintmax_t t0)
2448 {
2449   struct factors factors;
2450
2451   print_uintmaxes (t1, t0);
2452   lbuf_putc (':');
2453
2454   factor (t1, t0, &factors);
2455
2456   for (int j = 0; j < factors.nfactors; j++)
2457     for (int k = 0; k < factors.e[j]; k++)
2458       {
2459         lbuf_putc (' ');
2460         print_uintmaxes (0, factors.p[j]);
2461         if (print_exponents && factors.e[j] > 1)
2462           {
2463             lbuf_putc ('^');
2464             lbuf_putint (factors.e[j], 0);
2465             break;
2466           }
2467       }
2468
2469   if (factors.plarge[1])
2470     {
2471       lbuf_putc (' ');
2472       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2473     }
2474
2475   lbuf_putc ('\n');
2476 }
2477
2478 /* Emit the factors of the indicated number.  If we have the option of using
2479    either algorithm, we select on the basis of the length of the number.
2480    For longer numbers, we prefer the MP algorithm even if the native algorithm
2481    has enough digits, because the algorithm is better.  The turnover point
2482    depends on the value.  */
2483 static bool
2484 print_factors (char const *input)
2485 {
2486   /* Skip initial spaces and '+'.  */
2487   char const *str = input;
2488   while (*str == ' ')
2489     str++;
2490   str += *str == '+';
2491
2492   uintmax_t t1, t0;
2493
2494   /* Try converting the number to one or two words.  If it fails, use GMP or
2495      print an error message.  The 2nd condition checks that the most
2496      significant bit of the two-word number is clear, in a typesize neutral
2497      way.  */
2498   strtol_error err = strto2uintmax (&t1, &t0, str);
2499
2500   switch (err)
2501     {
2502     case LONGINT_OK:
2503       if (((t1 << 1) >> 1) == t1)
2504         {
2505           devmsg ("[using single-precision arithmetic] ");
2506           print_factors_single (t1, t0);
2507           return true;
2508         }
2509       break;
2510
2511     case LONGINT_OVERFLOW:
2512       /* Try GMP.  */
2513       break;
2514
2515     default:
2516       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2517       return false;
2518     }
2519
2520   devmsg ("[using arbitrary-precision arithmetic] ");
2521   mpz_t t;
2522   struct mp_factors factors;
2523
2524   mpz_init_set_str (t, str, 10);
2525
2526   mpz_out_str (stdout, 10, t);
2527   putchar (':');
2528   mp_factor (t, &factors);
2529
2530   for (idx_t j = 0; j < factors.nfactors; j++)
2531     for (unsigned long int k = 0; k < factors.e[j]; k++)
2532       {
2533         putchar (' ');
2534         mpz_out_str (stdout, 10, factors.p[j]);
2535         if (print_exponents && factors.e[j] > 1)
2536           {
2537             printf ("^%lu", factors.e[j]);
2538             break;
2539           }
2540       }
2541
2542   mp_factor_clear (&factors);
2543   mpz_clear (t);
2544   putchar ('\n');
2545   fflush (stdout);
2546   return true;
2547 }
2548
2549 void
2550 usage (int status)
2551 {
2552   if (status != EXIT_SUCCESS)
2553     emit_try_help ();
2554   else
2555     {
2556       printf (_("\
2557 Usage: %s [OPTION] [NUMBER]...\n\
2558 "),
2559               program_name);
2560       fputs (_("\
2561 Print the prime factors of each specified integer NUMBER.  If none\n\
2562 are specified on the command line, read them from standard input.\n\
2563 \n\
2564 "), stdout);
2565       fputs ("\
2566   -h, --exponents   print repeated factors in form p^e unless e is 1\n\
2567 ", stdout);
2568       fputs (HELP_OPTION_DESCRIPTION, stdout);
2569       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2570       emit_ancillary_info (PROGRAM_NAME);
2571     }
2572   exit (status);
2573 }
2574
2575 static bool
2576 do_stdin (void)
2577 {
2578   bool ok = true;
2579   token_buffer tokenbuffer;
2580
2581   init_tokenbuffer (&tokenbuffer);
2582
2583   while (true)
2584     {
2585       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2586                                        &tokenbuffer);
2587       if (token_length == (size_t) -1)
2588         {
2589           if (ferror (stdin))
2590             error (EXIT_FAILURE, errno, _("error reading input"));
2591           break;
2592         }
2593
2594       ok &= print_factors (tokenbuffer.buffer);
2595     }
2596   free (tokenbuffer.buffer);
2597
2598   return ok;
2599 }
2600
2601 int
2602 main (int argc, char **argv)
2603 {
2604   initialize_main (&argc, &argv);
2605   set_program_name (argv[0]);
2606   setlocale (LC_ALL, "");
2607   bindtextdomain (PACKAGE, LOCALEDIR);
2608   textdomain (PACKAGE);
2609
2610   lbuf_alloc ();
2611   atexit (close_stdout);
2612   atexit (lbuf_flush);
2613
2614   int c;
2615   while ((c = getopt_long (argc, argv, "h", long_options, nullptr)) != -1)
2616     {
2617       switch (c)
2618         {
2619         case 'h':  /* NetBSD used -h for this functionality first.  */
2620           print_exponents = true;
2621           break;
2622
2623         case DEV_DEBUG_OPTION:
2624           dev_debug = true;
2625           break;
2626
2627         case_GETOPT_HELP_CHAR;
2628
2629         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2630
2631         default:
2632           usage (EXIT_FAILURE);
2633         }
2634     }
2635
2636 #if STAT_SQUFOF
2637   memset (q_freq, 0, sizeof (q_freq));
2638 #endif
2639
2640   bool ok;
2641   if (argc <= optind)
2642     ok = do_stdin ();
2643   else
2644     {
2645       ok = true;
2646       for (int i = optind; i < argc; i++)
2647         if (! print_factors (argv[i]))
2648           ok = false;
2649     }
2650
2651 #if STAT_SQUFOF
2652   if (q_freq[0] > 0)
2653     {
2654       double acc_f;
2655       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2656       for (int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2657         {
2658           double f = (double) q_freq[i] / q_freq[0];
2659           acc_f += f;
2660           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2661                   100.0 * f, 100.0 * acc_f);
2662         }
2663     }
2664 #endif
2665
2666   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2667 }