src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2024 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organization:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38   Algorithm:
  39
  40     (1) Perform trial division using a small primes table, but without hardware
  41         division since the primes table store inverses modulo the word base.
  42         (The GMP variant of this code doesn't make use of the precomputed
  43         inverses, but instead relies on GMP for fast divisibility testing.)
  44     (2) Check the nature of any non-factored part using Miller-Rabin for
  45         detecting composites, and Lucas for detecting primes.
  46     (3) Factor any remaining composite part using the Pollard-Brent rho
  47         algorithm or if USE_SQUFOF is defined to 1, try that first.
  48         Status of found factors are checked again using Miller-Rabin and Lucas.
  49
  50     We prefer using Hensel norm in the divisions, not the more familiar
  51     Euclidean norm, since the former leads to much faster code.  In the
  52     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  53     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  54     reductions mod n.
  55
  56     The GMP code uses an algorithm that can be considerably slower;
  57     for example, on a circa-2017 Intel Xeon Silver 4116, factoring
  58     2^{127}-3 takes about 50 ms with the two-word algorithm but would
  59     take about 750 ms with the GMP code.
  60
  61   Improvements:
  62
  63     * Use modular inverses also for exact division in the Lucas code, and
  64       elsewhere.  A problem is to locate the inverses not from an index, but
  65       from a prime.  We might instead compute the inverse on-the-fly.
  66
  67     * Tune trial division table size (not forgetting that this is a standalone
  68       program where the table will be read from secondary storage for
  69       each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidean)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdbit.h>
 107 #include <stdio.h>
 108 #include <gmp.h>
 109
 110 #include "system.h"
 111 #include "assure.h"
 112 #include "full-write.h"
 113 #include "quote.h"
 114 #include "readtokens.h"
 115 #include "xstrtol.h"
 116
 117 /* The official name of this program (e.g., no 'g' prefix).  */
 118 #define PROGRAM_NAME "factor"
 119
 120 #define AUTHORS \
 121   proper_name ("Paul Rubin"),                                           \
 122   proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 123   proper_name_lite ("Niels Moller", "Niels M\303\266ller")
 124
 125 /* Token delimiters when reading from a file.  */
 126 #define DELIM "\n\t "
 127
 128 #ifndef USE_LONGLONG_H
 129 /* With the way we use longlong.h, it's only safe to use
 130    when UWtype = UHWtype, as there were various cases
 131    (as can be seen in the history for longlong.h) where
 132    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 133    to avoid compile time or run time issues.  */
 134 # if LONG_MAX == INTMAX_MAX
 135 #  define USE_LONGLONG_H 1
 136 # endif
 137 #endif
 138
 139 #if USE_LONGLONG_H
 140
 141 /* Make definitions for longlong.h to make it do what it can do for us */
 142
 143 /* bitcount for uintmax_t */
 144 # if UINTMAX_MAX == UINT32_MAX
 145 #  define W_TYPE_SIZE 32
 146 # elif UINTMAX_MAX == UINT64_MAX
 147 #  define W_TYPE_SIZE 64
 148 # elif UINTMAX_MAX == UINT128_MAX
 149 #  define W_TYPE_SIZE 128
 150 # endif
 151
 152 # define UWtype  uintmax_t
 153 # define UHWtype unsigned long int
 154 # undef UDWtype
 155 # if HAVE_ATTRIBUTE_MODE
 156 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 157 typedef          int SItype     __attribute__ ((mode (SI)));
 158 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 159 typedef          int DItype     __attribute__ ((mode (DI)));
 160 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 161 # else
 162 typedef unsigned char UQItype;
 163 typedef          long SItype;
 164 typedef unsigned long int USItype;
 165 #  if HAVE_LONG_LONG_INT
 166 typedef long long int DItype;
 167 typedef unsigned long long int UDItype;
 168 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 169 typedef long int DItype;
 170 typedef unsigned long int UDItype;
 171 #  endif
 172 # endif
 173 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 174 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 175 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 176 # ifndef __GMP_GNUC_PREREQ
 177 #  define __GMP_GNUC_PREREQ(a,b) 1
 178 # endif
 179
 180 /* longlong.h uses these macros only in certain system compiler combinations.
 181    Ensure usage to pacify -Wunused-macros.  */
 182 #if defined ASSERT || defined UHWtype || defined __GMP_DECLSPEC
 183 #endif
 184
 185 # if _ARCH_PPC
 186 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 187 # endif
 188 # include "longlong.h"
 189
 190 #else /* not USE_LONGLONG_H */
 191
 192 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 193 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 194 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 195 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 196
 197 #endif
 198
 199 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 200 #define MAX_NFACTS 26
 201
 202 enum
 203 {
 204   DEV_DEBUG_OPTION = CHAR_MAX + 1
 205 };
 206
 207 static struct option const long_options[] =
 208 {
 209   {"exponents", no_argument, nullptr, 'h'},
 210   {"-debug", no_argument, nullptr, DEV_DEBUG_OPTION},
 211   {GETOPT_HELP_OPTION_DECL},
 212   {GETOPT_VERSION_OPTION_DECL},
 213   {nullptr, 0, nullptr, 0}
 214 };
 215
 216 /* If true, use p^e output format.  */
 217 static bool print_exponents;
 218
 219 /* This represents an unsigned integer twice as wide as uintmax_t.  */
 220 typedef struct { uintmax_t uu[2]; } uuint;
 221
 222 /* Accessors and constructors for the type.  Pprograms should not
 223    access the type's internals directly, in case some future version
 224    replaces the type with unsigned __int128 or whatever.  */
 225 static uintmax_t lo (uuint u) { return u.uu[0]; }
 226 static uintmax_t hi (uuint u) { return u.uu[1]; }
 227 static void hiset (uuint *u, uintmax_t hi) { u->uu[1] = hi; }
 228 static void
 229 uuset (uintmax_t *phi, uintmax_t *plo, uuint uu)
 230 {
 231   *phi = hi (uu);
 232   *plo = lo (uu);
 233 }
 234 static uuint
 235 make_uuint (uintmax_t hi, uintmax_t lo)
 236 {
 237   return (uuint) {{lo, hi}};
 238 }
 239
 240 struct factors
 241 {
 242   uuint plarge; /* Can have a single large factor */
 243   uintmax_t     p[MAX_NFACTS];
 244   unsigned char e[MAX_NFACTS];
 245   unsigned char nfactors;
 246 };
 247
 248 struct mp_factors
 249 {
 250   mpz_t             *p;
 251   unsigned long int *e;
 252   idx_t nfactors;
 253   idx_t nalloc;
 254 };
 255
 256 static void factor (uintmax_t, uintmax_t, struct factors *);
 257
 258 #ifndef umul_ppmm
 259 # define umul_ppmm(w1, w0, u, v)                                        \
 260   do {                                                                  \
 261     uintmax_t __x0, __x1, __x2, __x3;                                   \
 262     unsigned long int __ul, __vl, __uh, __vh;                           \
 263     uintmax_t __u = (u), __v = (v);                                     \
 264                                                                         \
 265     __ul = __ll_lowpart (__u);                                          \
 266     __uh = __ll_highpart (__u);                                         \
 267     __vl = __ll_lowpart (__v);                                          \
 268     __vh = __ll_highpart (__v);                                         \
 269                                                                         \
 270     __x0 = (uintmax_t) __ul * __vl;                                     \
 271     __x1 = (uintmax_t) __ul * __vh;                                     \
 272     __x2 = (uintmax_t) __uh * __vl;                                     \
 273     __x3 = (uintmax_t) __uh * __vh;                                     \
 274                                                                         \
 275     __x1 += __ll_highpart (__x0);/* This can't give carry.  */          \
 276     __x1 += __x2;               /* But this indeed can.  */             \
 277     if (__x1 < __x2)            /* Did we get it?  */                   \
 278       __x3 += __ll_B;           /* Yes, add it in the proper pos.  */   \
 279                                                                         \
 280     (w1) = __x3 + __ll_highpart (__x1);                                 \
 281     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 282   } while (0)
 283 #endif
 284
 285 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 286 /* Define our own, not needing normalization.  This function is
 287    currently not performance critical, so keep it simple.  Similar to
 288    the mod macro below.  */
 289 # undef udiv_qrnnd
 290 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 291   do {                                                                  \
 292     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 293                                                                         \
 294     __d1 = (d); __d0 = 0;                                               \
 295     __r1 = (n1); __r0 = (n0);                                           \
 296     affirm (__r1 < __d1);                                               \
 297     __q = 0;                                                            \
 298     for (int __i = W_TYPE_SIZE; __i > 0; __i--)                         \
 299       {                                                                 \
 300         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 301         __q <<= 1;                                                      \
 302         if (ge2 (__r1, __r0, __d1, __d0))                               \
 303           {                                                             \
 304             __q++;                                                      \
 305             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 306           }                                                             \
 307       }                                                                 \
 308     (r) = __r0;                                                         \
 309     (q) = __q;                                                          \
 310   } while (0)
 311 #endif
 312
 313 #if !defined add_ssaaaa
 314 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 315   do {                                                                  \
 316     uintmax_t _add_x;                                                   \
 317     _add_x = (al) + (bl);                                               \
 318     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 319     (sl) = _add_x;                                                      \
 320   } while (0)
 321 #endif
 322
 323 /* Set (rh,rl) = (ah,al) >> cnt, where 0 < cnt < W_TYPE_SIZE.  */
 324 #define rsh2(rh, rl, ah, al, cnt)                                       \
 325   do {                                                                  \
 326     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 327     (rh) = (ah) >> (cnt);                                               \
 328   } while (0)
 329
 330 /* Set (rh,rl) = (ah,al) << cnt, where 0 < cnt < W_TYPE_SIZE.  */
 331 #define lsh2(rh, rl, ah, al, cnt)                                       \
 332   do {                                                                  \
 333     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 334     (rl) = (al) << (cnt);                                               \
 335   } while (0)
 336
 337 #define ge2(ah, al, bh, bl)                                             \
 338   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 339
 340 #define gt2(ah, al, bh, bl)                                             \
 341   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 342
 343 #ifndef sub_ddmmss
 344 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 345   do {                                                                  \
 346     uintmax_t _cy;                                                      \
 347     _cy = (al) < (bl);                                                  \
 348     (rl) = (al) - (bl);                                                 \
 349     (rh) = (ah) - (bh) - _cy;                                           \
 350   } while (0)
 351 #endif
 352
 353 /* Requires that a < n and b <= n */
 354 #define submod(r,a,b,n)                                                 \
 355   do {                                                                  \
 356     uintmax_t _t = - (uintmax_t) (a < b);                               \
 357     (r) = ((n) & _t) + (a) - (b);                                       \
 358   } while (0)
 359
 360 #define addmod(r,a,b,n)                                                 \
 361   submod ((r), (a), ((n) - (b)), (n))
 362
 363 /* Modular two-word addition and subtraction.  For performance reasons, the
 364    most significant bit of n1 must be clear.  The destination variables must be
 365    distinct from the mod operand.  */
 366 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 367   do {                                                                  \
 368     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 369     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 370       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 371   } while (0)
 372 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 373   do {                                                                  \
 374     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 375     if ((intmax_t) (r1) < 0)                                            \
 376       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 377   } while (0)
 378
 379 #define HIGHBIT_TO_MASK(x)                                              \
 380   (((intmax_t)-1 >> 1) < 0                                              \
 381    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 382    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 383       ? UINTMAX_MAX : (uintmax_t) 0))
 384
 385 /* Return r = a mod d, where a = <a1,a0>, d = <d1,d0>.
 386    Requires that d1 != 0.  */
 387 ATTRIBUTE_PURE static uuint
 388 mod2 (uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 389 {
 390   affirm (d1 != 0);
 391
 392   if (a1)
 393     {
 394       int cntd = stdc_leading_zeros (d1);
 395       int cnta = stdc_leading_zeros (a1);
 396       int cnt = cntd - cnta;
 397       if (0 < cnt)
 398         {
 399           lsh2 (d1, d0, d1, d0, cnt);
 400           for (int i = 0; i < cnt; i++)
 401             {
 402               if (ge2 (a1, a0, d1, d0))
 403                 sub_ddmmss (a1, a0, a1, a0, d1, d0);
 404               rsh2 (d1, d0, d1, d0, 1);
 405             }
 406         }
 407     }
 408
 409   return make_uuint (a1, a0);
 410 }
 411
 412 ATTRIBUTE_CONST
 413 static uintmax_t
 414 gcd_odd (uintmax_t a, uintmax_t b)
 415 {
 416   if ((b & 1) == 0)
 417     {
 418       uintmax_t t = b;
 419       b = a;
 420       a = t;
 421     }
 422   if (a == 0)
 423     return b;
 424
 425   /* Take out least significant one bit, to make room for sign */
 426   b >>= 1;
 427
 428   for (;;)
 429     {
 430       uintmax_t t;
 431       uintmax_t bgta;
 432
 433       assume (a);
 434       a >>= stdc_trailing_zeros (a);
 435       a >>= 1;
 436
 437       t = a - b;
 438       if (t == 0)
 439         return (a << 1) + 1;
 440
 441       bgta = HIGHBIT_TO_MASK (t);
 442
 443       /* b <-- min (a, b) */
 444       b += (bgta & t);
 445
 446       /* a <-- |a - b| */
 447       a = (t ^ bgta) - bgta;
 448     }
 449 }
 450
 451 static uintmax_t
 452 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 453 {
 454   affirm (b0 & 1);
 455
 456   if ((a0 | a1) == 0)
 457     {
 458       *r1 = b1;
 459       return b0;
 460     }
 461   if (!a0)
 462     a0 = a1, a1 = 0;
 463   assume (a0);
 464   int ctz = stdc_trailing_zeros (a0);
 465   if (ctz)
 466     rsh2 (a1, a0, a1, a0, ctz);
 467
 468   for (;;)
 469     {
 470       if ((b1 | a1) == 0)
 471         {
 472           *r1 = 0;
 473           return gcd_odd (b0, a0);
 474         }
 475
 476       if (gt2 (a1, a0, b1, b0))
 477         {
 478           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 479           if (!a0)
 480             a0 = a1, a1 = 0;
 481           assume (a0);
 482           ctz = stdc_trailing_zeros (a0);
 483           if (ctz)
 484             rsh2 (a1, a0, a1, a0, ctz);
 485         }
 486       else if (gt2 (b1, b0, a1, a0))
 487         {
 488           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 489           if (!b0)
 490             b0 = b1, b1 = 0;
 491           assume (b0);
 492           ctz = stdc_trailing_zeros (b0);
 493           if (ctz)
 494             rsh2 (b1, b0, b1, b0, ctz);
 495         }
 496       else
 497         break;
 498     }
 499
 500   *r1 = a1;
 501   return a0;
 502 }
 503
 504 static void
 505 factor_insert_multiplicity (struct factors *factors,
 506                             uintmax_t prime, int m)
 507 {
 508   int nfactors = factors->nfactors;
 509   uintmax_t *p = factors->p;
 510   unsigned char *e = factors->e;
 511
 512   /* Locate position for insert new or increment e.  */
 513   int i;
 514   for (i = nfactors - 1; i >= 0; i--)
 515     {
 516       if (p[i] <= prime)
 517         break;
 518     }
 519
 520   if (i < 0 || p[i] != prime)
 521     {
 522       for (int j = nfactors - 1; j > i; j--)
 523         {
 524           p[j + 1] = p[j];
 525           e[j + 1] = e[j];
 526         }
 527       p[i + 1] = prime;
 528       e[i + 1] = m;
 529       factors->nfactors = nfactors + 1;
 530     }
 531   else
 532     {
 533       e[i] += m;
 534     }
 535 }
 536
 537 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 538
 539 static void
 540 factor_insert_large (struct factors *factors,
 541                      uintmax_t p1, uintmax_t p0)
 542 {
 543   if (p1 > 0)
 544     {
 545       affirm (hi (factors->plarge) == 0);
 546       factors->plarge = make_uuint (p1, p0);
 547     }
 548   else
 549     factor_insert (factors, p0);
 550 }
 551
 552 #ifndef mpz_inits
 553
 554 # include <stdarg.h>
 555
 556 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 557 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 558
 559 static void
 560 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 561 {
 562   va_list ap;
 563
 564   va_start (ap, mpz_single_init);
 565
 566   mpz_t *mpz;
 567   while ((mpz = va_arg (ap, mpz_t *)))
 568     mpz_single_init (*mpz);
 569
 570   va_end (ap);
 571 }
 572 #endif
 573
 574 static void mp_factor (mpz_t, struct mp_factors *);
 575
 576 static void
 577 mp_factor_init (struct mp_factors *factors)
 578 {
 579   factors->p = nullptr;
 580   factors->e = nullptr;
 581   factors->nfactors = 0;
 582   factors->nalloc = 0;
 583 }
 584
 585 static void
 586 mp_factor_clear (struct mp_factors *factors)
 587 {
 588   for (idx_t i = 0; i < factors->nfactors; i++)
 589     mpz_clear (factors->p[i]);
 590
 591   free (factors->p);
 592   free (factors->e);
 593 }
 594
 595 static void
 596 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 597 {
 598   idx_t nfactors = factors->nfactors;
 599   mpz_t *p = factors->p;
 600   unsigned long int *e = factors->e;
 601   ptrdiff_t i;
 602
 603   /* Locate position for insert new or increment e.  */
 604   for (i = nfactors - 1; i >= 0; i--)
 605     {
 606       if (mpz_cmp (p[i], prime) <= 0)
 607         break;
 608     }
 609
 610   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 611     {
 612       if (factors->nfactors == factors->nalloc)
 613         {
 614           p = xpalloc (p, &factors->nalloc, 1, -1, sizeof *p);
 615           e = xireallocarray (e, factors->nalloc, sizeof *e);
 616         }
 617
 618       mpz_init (p[nfactors]);
 619       for (ptrdiff_t j = nfactors - 1; j > i; j--)
 620         {
 621           mpz_set (p[j + 1], p[j]);
 622           e[j + 1] = e[j];
 623         }
 624       mpz_set (p[i + 1], prime);
 625       e[i + 1] = 1;
 626
 627       factors->p = p;
 628       factors->e = e;
 629       factors->nfactors = nfactors + 1;
 630     }
 631   else
 632     {
 633       e[i] += 1;
 634     }
 635 }
 636
 637 static void
 638 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 639 {
 640   mpz_t pz;
 641
 642   mpz_init_set_ui (pz, prime);
 643   mp_factor_insert (factors, pz);
 644   mpz_clear (pz);
 645 }
 646
 647
 648 /* Number of bits in an uintmax_t.  */
 649 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 650
 651 /* Verify that uintmax_t does not have holes in its representation.  */
 652 static_assert (UINTMAX_MAX >> (W - 1) == 1);
 653
 654 #define P(a,b,c,d) a,
 655 static const unsigned char primes_diff[] = {
 656 #include "primes.h"
 657 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 658 };
 659 #undef P
 660
 661 #define PRIMES_PTAB_ENTRIES \
 662   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 663
 664 #define P(a,b,c,d) b,
 665 static const unsigned char primes_diff8[] = {
 666 #include "primes.h"
 667 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 668 };
 669 #undef P
 670
 671 struct primes_dtab
 672 {
 673   uintmax_t binv, lim;
 674 };
 675
 676 #define P(a,b,c,d) {c,d},
 677 static const struct primes_dtab primes_dtab[] = {
 678 #include "primes.h"
 679 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 680 };
 681 #undef P
 682
 683 /* Verify that uintmax_t is not wider than
 684    the integers used to generate primes.h.  */
 685 static_assert (W <= WIDE_UINT_BITS);
 686
 687 /* debugging for developers.  Enables devmsg().
 688    This flag is used only in the GMP code.  */
 689 static bool dev_debug = false;
 690
 691 /* Prove primality or run probabilistic tests.  */
 692 static bool flag_prove_primality = PROVE_PRIMALITY;
 693
 694 /* Number of Miller-Rabin tests to run when not proving primality.  */
 695 #define MR_REPS 25
 696
 697 static void
 698 factor_insert_refind (struct factors *factors, uintmax_t p, int i, int off)
 699 {
 700   for (int j = 0; j < off; j++)
 701     p += primes_diff[i + j];
 702   factor_insert (factors, p);
 703 }
 704
 705 /* Trial division with odd primes uses the following trick.
 706
 707    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}.  For simplicity,
 708    consider the case t < B (this is the second loop below).
 709
 710    From our tables we get
 711
 712      binv = p^{-1} (mod B)
 713      lim = floor ((B-1) / p).
 714
 715    First assume that t is a multiple of p, t = q * p.  Then 0 <= q <= lim
 716    (and all quotients in this range occur for some t).
 717
 718    Then t = q * p is true also (mod B), and p is invertible we get
 719
 720      q = t * binv (mod B).
 721
 722    Next, assume that t is *not* divisible by p.  Since multiplication
 723    by binv (mod B) is a one-to-one mapping,
 724
 725      t * binv (mod B) > lim,
 726
 727    because all the smaller values are already taken.
 728
 729    This can be summed up by saying that the function
 730
 731      q(t) = binv * t (mod B)
 732
 733    is a permutation of the range 0 <= t < B, with the curious property
 734    that it maps the multiples of p onto the range 0 <= q <= lim, in
 735    order, and the non-multiples of p onto the range lim < q < B.
 736  */
 737
 738 static uintmax_t
 739 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 740                        struct factors *factors)
 741 {
 742   if (t0 % 2 == 0)
 743     {
 744       int cnt;
 745
 746       if (t0 == 0)
 747         {
 748           assume (t1);
 749           cnt = stdc_trailing_zeros (t1);
 750           t0 = t1 >> cnt;
 751           t1 = 0;
 752           cnt += W_TYPE_SIZE;
 753         }
 754       else
 755         {
 756           cnt = stdc_trailing_zeros (t0);
 757           rsh2 (t1, t0, t1, t0, cnt);
 758         }
 759
 760       factor_insert_multiplicity (factors, 2, cnt);
 761     }
 762
 763   uintmax_t p = 3;
 764   idx_t i;
 765   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 766     {
 767       for (;;)
 768         {
 769           uintmax_t q1, q0, hi;
 770           MAYBE_UNUSED uintmax_t lo;
 771
 772           q0 = t0 * primes_dtab[i].binv;
 773           umul_ppmm (hi, lo, q0, p);
 774           if (hi > t1)
 775             break;
 776           hi = t1 - hi;
 777           q1 = hi * primes_dtab[i].binv;
 778           if (LIKELY (q1 > primes_dtab[i].lim))
 779             break;
 780           t1 = q1; t0 = q0;
 781           factor_insert (factors, p);
 782         }
 783       p += primes_diff[i + 1];
 784     }
 785   if (t1p)
 786     *t1p = t1;
 787
 788 #define DIVBLOCK(I)                                                     \
 789   do {                                                                  \
 790     for (;;)                                                            \
 791       {                                                                 \
 792         q = t0 * pd[I].binv;                                            \
 793         if (LIKELY (q > pd[I].lim))                                     \
 794           break;                                                        \
 795         t0 = q;                                                         \
 796         factor_insert_refind (factors, p, i + 1, I);                    \
 797       }                                                                 \
 798   } while (0)
 799
 800   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 801     {
 802       uintmax_t q;
 803       const struct primes_dtab *pd = &primes_dtab[i];
 804       DIVBLOCK (0);
 805       DIVBLOCK (1);
 806       DIVBLOCK (2);
 807       DIVBLOCK (3);
 808       DIVBLOCK (4);
 809       DIVBLOCK (5);
 810       DIVBLOCK (6);
 811       DIVBLOCK (7);
 812
 813       p += primes_diff8[i];
 814       if (p * p > t0)
 815         break;
 816     }
 817
 818   return t0;
 819 }
 820
 821 static void
 822 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 823 {
 824   mpz_t q;
 825   mp_bitcnt_t p;
 826
 827   devmsg ("[trial division] ");
 828
 829   mpz_init (q);
 830
 831   p = mpz_scan1 (t, 0);
 832   mpz_fdiv_q_2exp (t, t, p);
 833   while (p)
 834     {
 835       mp_factor_insert_ui (factors, 2);
 836       --p;
 837     }
 838
 839   unsigned long int d = 3;
 840   for (idx_t i = 1; i <= PRIMES_PTAB_ENTRIES;)
 841     {
 842       if (! mpz_divisible_ui_p (t, d))
 843         {
 844           d += primes_diff[i++];
 845           if (mpz_cmp_ui (t, d * d) < 0)
 846             break;
 847         }
 848       else
 849         {
 850           mpz_tdiv_q_ui (t, t, d);
 851           mp_factor_insert_ui (factors, d);
 852         }
 853     }
 854
 855   mpz_clear (q);
 856 }
 857
 858 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 859 static const unsigned char  binvert_table[128] =
 860 {
 861   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 862   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 863   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 864   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 865   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 866   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 867   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 868   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 869   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 870   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 871   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 872   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 873   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 874   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 875   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 876   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 877 };
 878
 879 /* Compute n^(-1) mod B, using a Newton iteration.  */
 880 #define binv(inv,n)                                                     \
 881   do {                                                                  \
 882     uintmax_t  __n = (n);                                               \
 883     uintmax_t  __inv;                                                   \
 884                                                                         \
 885     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 886     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 887     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 888     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 889                                                                         \
 890     if (W_TYPE_SIZE > 64)                                               \
 891       {                                                                 \
 892         int  __invbits = 64;                                            \
 893         do {                                                            \
 894           __inv = 2 * __inv - __inv * __inv * __n;                      \
 895           __invbits *= 2;                                               \
 896         } while (__invbits < W_TYPE_SIZE);                              \
 897       }                                                                 \
 898                                                                         \
 899     (inv) = __inv;                                                      \
 900   } while (0)
 901
 902 /* q = u / d, assuming d|u.  */
 903 #define divexact_21(q1, q0, u1, u0, d)                                  \
 904   do {                                                                  \
 905     uintmax_t _di, _q0;                                                 \
 906     binv (_di, (d));                                                    \
 907     _q0 = (u0) * _di;                                                   \
 908     if ((u1) >= (d))                                                    \
 909       {                                                                 \
 910         uintmax_t _p1;                                                  \
 911         MAYBE_UNUSED intmax_t _p0;                                      \
 912         umul_ppmm (_p1, _p0, _q0, d);                                   \
 913         (q1) = ((u1) - _p1) * _di;                                      \
 914         (q0) = _q0;                                                     \
 915       }                                                                 \
 916     else                                                                \
 917       {                                                                 \
 918         (q0) = _q0;                                                     \
 919         (q1) = 0;                                                       \
 920       }                                                                 \
 921   } while (0)
 922
 923 /* x B (mod n).  */
 924 #define redcify(r_prim, r, n)                                           \
 925   do {                                                                  \
 926     MAYBE_UNUSED uintmax_t _redcify_q;                                  \
 927     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 928   } while (0)
 929
 930 /* x B^2 (mod n).  Requires x > 0, n1 < B/2.  */
 931 #define redcify2(r1, r0, x, n1, n0)                                     \
 932   do {                                                                  \
 933     uintmax_t _r1, _r0, _i;                                             \
 934     if ((x) < (n1))                                                     \
 935       {                                                                 \
 936         _r1 = (x); _r0 = 0;                                             \
 937         _i = W_TYPE_SIZE;                                               \
 938       }                                                                 \
 939     else                                                                \
 940       {                                                                 \
 941         _r1 = 0; _r0 = (x);                                             \
 942         _i = 2 * W_TYPE_SIZE;                                           \
 943       }                                                                 \
 944     while (_i-- > 0)                                                    \
 945       {                                                                 \
 946         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 947         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 948           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 949       }                                                                 \
 950     (r1) = _r1;                                                         \
 951     (r0) = _r0;                                                         \
 952   } while (0)
 953
 954 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 955    Both a and b must be in redc form, the result will be in redc form too.  */
 956 static inline uintmax_t
 957 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 958 {
 959   uintmax_t rh, rl, q, th, xh;
 960   MAYBE_UNUSED uintmax_t tl;
 961
 962   umul_ppmm (rh, rl, a, b);
 963   q = rl * mi;
 964   umul_ppmm (th, tl, q, m);
 965   xh = rh - th;
 966   if (rh < th)
 967     xh += m;
 968
 969   return xh;
 970 }
 971
 972 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 973    Both a and b must be in redc form, the result will be in redc form too.
 974    For performance reasons, the most significant bit of m must be clear.  */
 975 static uintmax_t
 976 mulredc2 (uintmax_t *r1p,
 977           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 978           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 979 {
 980   uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
 981   MAYBE_UNUSED uintmax_t p0;
 982   mi = -mi;
 983   affirm ((a1 >> (W_TYPE_SIZE - 1)) == 0);
 984   affirm ((b1 >> (W_TYPE_SIZE - 1)) == 0);
 985   affirm ((m1 >> (W_TYPE_SIZE - 1)) == 0);
 986
 987   /* First compute a0 * <b1, b0> B^{-1}
 988         +-----+
 989         |a0 b0|
 990      +--+--+--+
 991      |a0 b1|
 992      +--+--+--+
 993         |q0 m0|
 994      +--+--+--+
 995      |q0 m1|
 996     -+--+--+--+
 997      |r1|r0| 0|
 998      +--+--+--+
 999   */
1000   umul_ppmm (t1, t0, a0, b0);
1001   umul_ppmm (r1, r0, a0, b1);
1002   q = mi * t0;
1003   umul_ppmm (p1, p0, q, m0);
1004   umul_ppmm (s1, s0, q, m1);
1005   r0 += (t0 != 0); /* Carry */
1006   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1007   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1008   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1009
1010   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1011         +-----+
1012         |a1 b0|
1013         +--+--+
1014         |r1|r0|
1015      +--+--+--+
1016      |a1 b1|
1017      +--+--+--+
1018         |q1 m0|
1019      +--+--+--+
1020      |q1 m1|
1021     -+--+--+--+
1022      |r1|r0| 0|
1023      +--+--+--+
1024   */
1025   umul_ppmm (t1, t0, a1, b0);
1026   umul_ppmm (s1, s0, a1, b1);
1027   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1028   q = mi * t0;
1029   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1030   umul_ppmm (p1, p0, q, m0);
1031   umul_ppmm (s1, s0, q, m1);
1032   r0 += (t0 != 0); /* Carry */
1033   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1034   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1035   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1036
1037   if (ge2 (r1, r0, m1, m0))
1038     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1039
1040   *r1p = r1;
1041   return r0;
1042 }
1043
1044 ATTRIBUTE_CONST
1045 static uintmax_t
1046 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1047 {
1048   uintmax_t y = one;
1049
1050   if (e & 1)
1051     y = b;
1052
1053   while (e != 0)
1054     {
1055       b = mulredc (b, b, n, ni);
1056       e >>= 1;
1057
1058       if (e & 1)
1059         y = mulredc (y, b, n, ni);
1060     }
1061
1062   return y;
1063 }
1064
1065 static uintmax_t
1066 powm2 (uintmax_t *r1m,
1067        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1068        uintmax_t ni, const uintmax_t *one)
1069 {
1070   uintmax_t r1, r0, b1, b0, n1, n0;
1071   int i;
1072   uintmax_t e;
1073
1074   b0 = bp[0];
1075   b1 = bp[1];
1076   n0 = np[0];
1077   n1 = np[1];
1078
1079   r0 = one[0];
1080   r1 = one[1];
1081
1082   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1083     {
1084       if (e & 1)
1085         {
1086           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1087           r1 = *r1m;
1088         }
1089       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1090       b1 = *r1m;
1091     }
1092   for (e = ep[1]; e > 0; e >>= 1)
1093     {
1094       if (e & 1)
1095         {
1096           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1097           r1 = *r1m;
1098         }
1099       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1100       b1 = *r1m;
1101     }
1102   *r1m = r1;
1103   return r0;
1104 }
1105
1106 ATTRIBUTE_CONST
1107 static bool
1108 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1109              int k, uintmax_t one)
1110 {
1111   uintmax_t y = powm (b, q, n, ni, one);
1112
1113   uintmax_t nm1 = n - one;      /* -1, but in redc representation.  */
1114
1115   if (y == one || y == nm1)
1116     return true;
1117
1118   for (int i = 1; i < k; i++)
1119     {
1120       y = mulredc (y, y, n, ni);
1121
1122       if (y == nm1)
1123         return true;
1124       if (y == one)
1125         return false;
1126     }
1127   return false;
1128 }
1129
1130 ATTRIBUTE_PURE static bool
1131 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1132               const uintmax_t *qp, int k, const uintmax_t *one)
1133 {
1134   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1135
1136   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1137   y1 = r1m;
1138
1139   if (y0 == one[0] && y1 == one[1])
1140     return true;
1141
1142   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1143
1144   if (y0 == nm1_0 && y1 == nm1_1)
1145     return true;
1146
1147   for (int i = 1; i < k; i++)
1148     {
1149       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1150       y1 = r1m;
1151
1152       if (y0 == nm1_0 && y1 == nm1_1)
1153         return true;
1154       if (y0 == one[0] && y1 == one[1])
1155         return false;
1156     }
1157   return false;
1158 }
1159
1160 static bool
1161 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1162                 mpz_srcptr q, mp_bitcnt_t k)
1163 {
1164   mpz_powm (y, x, q, n);
1165
1166   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1167     return true;
1168
1169   for (mp_bitcnt_t i = 1; i < k; i++)
1170     {
1171       mpz_powm_ui (y, y, 2, n);
1172       if (mpz_cmp (y, nm1) == 0)
1173         return true;
1174       if (mpz_cmp_ui (y, 1) == 0)
1175         return false;
1176     }
1177   return false;
1178 }
1179
1180 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1181    have been observed.  The average seem to be about 2.  */
1182 static bool ATTRIBUTE_PURE
1183 prime_p (uintmax_t n)
1184 {
1185   bool is_prime;
1186   uintmax_t a_prim, one, ni;
1187   struct factors factors;
1188
1189   if (n <= 1)
1190     return false;
1191
1192   /* We have already cast out small primes.  */
1193   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1194     return true;
1195
1196   /* Precomputation for Miller-Rabin.  */
1197   int k = stdc_trailing_zeros (n - 1);
1198   uintmax_t q = (n - 1) >> k;
1199
1200   uintmax_t a = 2;
1201   binv (ni, n);                 /* ni <- 1/n mod B */
1202   redcify (one, 1, n);
1203   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1204
1205   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1206   if (!millerrabin (n, ni, a_prim, q, k, one))
1207     return false;
1208
1209   if (flag_prove_primality)
1210     {
1211       /* Factor n-1 for Lucas.  */
1212       factor (0, n - 1, &factors);
1213     }
1214
1215   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1216      number composite.  */
1217   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1218     {
1219       if (flag_prove_primality)
1220         {
1221           is_prime = true;
1222           for (int i = 0; i < factors.nfactors && is_prime; i++)
1223             {
1224               is_prime
1225                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1226             }
1227         }
1228       else
1229         {
1230           /* After enough Miller-Rabin runs, be content.  */
1231           is_prime = (r == MR_REPS - 1);
1232         }
1233
1234       if (is_prime)
1235         return true;
1236
1237       a += primes_diff[r];      /* Establish new base.  */
1238
1239       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1240          on most processors, since it avoids udiv_qrnnd.  If we go down the
1241          udiv_qrnnd_preinv path, this code should be replaced.  */
1242       {
1243         uintmax_t s1, s0;
1244         umul_ppmm (s1, s0, one, a);
1245         if (LIKELY (s1 == 0))
1246           a_prim = s0 % n;
1247         else
1248           {
1249             MAYBE_UNUSED uintmax_t dummy;
1250             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1251           }
1252       }
1253
1254       if (!millerrabin (n, ni, a_prim, q, k, one))
1255         return false;
1256     }
1257
1258   affirm (!"Lucas prime test failure.  This should not happen");
1259 }
1260
1261 static bool ATTRIBUTE_PURE
1262 prime2_p (uintmax_t n1, uintmax_t n0)
1263 {
1264   uintmax_t q[2], nm1[2];
1265   uintmax_t a_prim[2];
1266   uintmax_t one[2];
1267   uintmax_t na[2];
1268   uintmax_t ni;
1269   int k;
1270   struct factors factors;
1271
1272   if (n1 == 0)
1273     return prime_p (n0);
1274
1275   nm1[1] = n1 - (n0 == 0);
1276   nm1[0] = n0 - 1;
1277   if (nm1[0] == 0)
1278     {
1279       assume (nm1[1]);
1280       k = stdc_trailing_zeros (nm1[1]);
1281
1282       q[0] = nm1[1] >> k;
1283       q[1] = 0;
1284       k += W_TYPE_SIZE;
1285     }
1286   else
1287     {
1288       k = stdc_trailing_zeros (nm1[0]);
1289       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1290     }
1291
1292   uintmax_t a = 2;
1293   binv (ni, n0);
1294   redcify2 (one[1], one[0], 1, n1, n0);
1295   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1296
1297   /* FIXME: Use scalars or pointers in arguments?  Some consistency needed.  */
1298   na[0] = n0;
1299   na[1] = n1;
1300
1301   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1302     return false;
1303
1304   if (flag_prove_primality)
1305     {
1306       /* Factor n-1 for Lucas.  */
1307       factor (nm1[1], nm1[0], &factors);
1308     }
1309
1310   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1311      number composite.  */
1312   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1313     {
1314       bool is_prime;
1315       uintmax_t e[2], y[2];
1316
1317       if (flag_prove_primality)
1318         {
1319           is_prime = true;
1320           if (hi (factors.plarge))
1321             {
1322               uintmax_t pi;
1323               binv (pi, lo (factors.plarge));
1324               e[0] = pi * nm1[0];
1325               e[1] = 0;
1326               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1327               is_prime = (y[0] != one[0] || y[1] != one[1]);
1328             }
1329           for (int i = 0; i < factors.nfactors && is_prime; i++)
1330             {
1331               /* FIXME: We always have the factor 2.  Do we really need to
1332                  handle it here?  We have done the same powering as part
1333                  of millerrabin.  */
1334               if (factors.p[i] == 2)
1335                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1336               else
1337                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1338               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1339               is_prime = (y[0] != one[0] || y[1] != one[1]);
1340             }
1341         }
1342       else
1343         {
1344           /* After enough Miller-Rabin runs, be content.  */
1345           is_prime = (r == MR_REPS - 1);
1346         }
1347
1348       if (is_prime)
1349         return true;
1350
1351       a += primes_diff[r];      /* Establish new base.  */
1352       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1353
1354       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1355         return false;
1356     }
1357
1358   affirm (!"Lucas prime test failure.  This should not happen");
1359 }
1360
1361 static bool
1362 mp_prime_p (mpz_t n)
1363 {
1364   bool is_prime;
1365   mpz_t q, a, nm1, tmp;
1366   struct mp_factors factors;
1367
1368   if (mpz_cmp_ui (n, 1) <= 0)
1369     return false;
1370
1371   /* We have already cast out small primes.  */
1372   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1373     return true;
1374
1375   mpz_inits (q, a, nm1, tmp, nullptr);
1376
1377   /* Precomputation for Miller-Rabin.  */
1378   mpz_sub_ui (nm1, n, 1);
1379
1380   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1381   mp_bitcnt_t k = mpz_scan1 (nm1, 0);
1382   mpz_tdiv_q_2exp (q, nm1, k);
1383
1384   mpz_set_ui (a, 2);
1385
1386   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1387   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1388     {
1389       is_prime = false;
1390       goto ret2;
1391     }
1392
1393   if (flag_prove_primality)
1394     {
1395       /* Factor n-1 for Lucas.  */
1396       mpz_set (tmp, nm1);
1397       mp_factor (tmp, &factors);
1398     }
1399
1400   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1401      number composite.  */
1402   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1403     {
1404       if (flag_prove_primality)
1405         {
1406           is_prime = true;
1407           for (idx_t i = 0; i < factors.nfactors && is_prime; i++)
1408             {
1409               mpz_divexact (tmp, nm1, factors.p[i]);
1410               mpz_powm (tmp, a, tmp, n);
1411               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1412             }
1413         }
1414       else
1415         {
1416           /* After enough Miller-Rabin runs, be content.  */
1417           is_prime = (r == MR_REPS - 1);
1418         }
1419
1420       if (is_prime)
1421         goto ret1;
1422
1423       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1424
1425       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1426         {
1427           is_prime = false;
1428           goto ret1;
1429         }
1430     }
1431
1432   affirm (!"Lucas prime test failure.  This should not happen");
1433
1434  ret1:
1435   if (flag_prove_primality)
1436     mp_factor_clear (&factors);
1437  ret2:
1438   mpz_clears (q, a, nm1, tmp, nullptr);
1439
1440   return is_prime;
1441 }
1442
1443 static void
1444 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1445                           struct factors *factors)
1446 {
1447   uintmax_t x, z, y, P, t, ni, g;
1448
1449   unsigned long int k = 1;
1450   unsigned long int l = 1;
1451
1452   redcify (P, 1, n);
1453   addmod (x, P, P, n);          /* i.e., redcify(2) */
1454   y = z = x;
1455
1456   while (n != 1)
1457     {
1458       affirm (a < n);
1459
1460       binv (ni, n);             /* FIXME: when could we use old 'ni' value?  */
1461
1462       for (;;)
1463         {
1464           do
1465             {
1466               x = mulredc (x, x, n, ni);
1467               addmod (x, x, a, n);
1468
1469               submod (t, z, x, n);
1470               P = mulredc (P, t, n, ni);
1471
1472               if (k % 32 == 1)
1473                 {
1474                   if (gcd_odd (P, n) != 1)
1475                     goto factor_found;
1476                   y = x;
1477                 }
1478             }
1479           while (--k != 0);
1480
1481           z = x;
1482           k = l;
1483           l = 2 * l;
1484           for (unsigned long int i = 0; i < k; i++)
1485             {
1486               x = mulredc (x, x, n, ni);
1487               addmod (x, x, a, n);
1488             }
1489           y = x;
1490         }
1491
1492     factor_found:
1493       do
1494         {
1495           y = mulredc (y, y, n, ni);
1496           addmod (y, y, a, n);
1497
1498           submod (t, z, y, n);
1499           g = gcd_odd (t, n);
1500         }
1501       while (g == 1);
1502
1503       if (n == g)
1504         {
1505           /* Found n itself as factor.  Restart with different params.  */
1506           factor_using_pollard_rho (n, a + 1, factors);
1507           return;
1508         }
1509
1510       n = n / g;
1511
1512       if (!prime_p (g))
1513         factor_using_pollard_rho (g, a + 1, factors);
1514       else
1515         factor_insert (factors, g);
1516
1517       if (prime_p (n))
1518         {
1519           factor_insert (factors, n);
1520           break;
1521         }
1522
1523       x = x % n;
1524       z = z % n;
1525       y = y % n;
1526     }
1527 }
1528
1529 static void
1530 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1531                            struct factors *factors)
1532 {
1533   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1534
1535   unsigned long int k = 1;
1536   unsigned long int l = 1;
1537
1538   redcify2 (P1, P0, 1, n1, n0);
1539   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1540   y1 = z1 = x1;
1541   y0 = z0 = x0;
1542
1543   while (n1 != 0 || n0 != 1)
1544     {
1545       binv (ni, n0);
1546
1547       for (;;)
1548         {
1549           do
1550             {
1551               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1552               x1 = r1m;
1553               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1554
1555               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1556               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1557               P1 = r1m;
1558
1559               if (k % 32 == 1)
1560                 {
1561                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1562                   if (g1 != 0 || g0 != 1)
1563                     goto factor_found;
1564                   y1 = x1; y0 = x0;
1565                 }
1566             }
1567           while (--k != 0);
1568
1569           z1 = x1; z0 = x0;
1570           k = l;
1571           l = 2 * l;
1572           for (unsigned long int i = 0; i < k; i++)
1573             {
1574               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1575               x1 = r1m;
1576               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1577             }
1578           y1 = x1; y0 = x0;
1579         }
1580
1581     factor_found:
1582       do
1583         {
1584           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1585           y1 = r1m;
1586           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1587
1588           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1589           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1590         }
1591       while (g1 == 0 && g0 == 1);
1592
1593       if (g1 == 0)
1594         {
1595           /* The found factor is one word, and > 1.  */
1596           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1597
1598           if (!prime_p (g0))
1599             factor_using_pollard_rho (g0, a + 1, factors);
1600           else
1601             factor_insert (factors, g0);
1602         }
1603       else
1604         {
1605           /* The found factor is two words.  This is highly unlikely, thus hard
1606              to trigger.  Please be careful before you change this code!  */
1607           uintmax_t ginv;
1608
1609           if (n1 == g1 && n0 == g0)
1610             {
1611               /* Found n itself as factor.  Restart with different params.  */
1612               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1613               return;
1614             }
1615
1616           /* Compute n = n / g.  Since the result will fit one word,
1617              we can compute the quotient modulo B, ignoring the high
1618              divisor word.  */
1619           binv (ginv, g0);
1620           n0 = ginv * n0;
1621           n1 = 0;
1622
1623           if (!prime2_p (g1, g0))
1624             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1625           else
1626             factor_insert_large (factors, g1, g0);
1627         }
1628
1629       if (n1 == 0)
1630         {
1631           if (prime_p (n0))
1632             {
1633               factor_insert (factors, n0);
1634               break;
1635             }
1636
1637           factor_using_pollard_rho (n0, a, factors);
1638           return;
1639         }
1640
1641       if (prime2_p (n1, n0))
1642         {
1643           factor_insert_large (factors, n1, n0);
1644           break;
1645         }
1646
1647       uuset (&x1, &x0, mod2 (x1, x0, n1, n0));
1648       uuset (&z1, &z0, mod2 (z1, z0, n1, n0));
1649       uuset (&y1, &y0, mod2 (y1, y0, n1, n0));
1650     }
1651 }
1652
1653 static void
1654 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1655                              struct mp_factors *factors)
1656 {
1657   mpz_t x, z, y, P;
1658   mpz_t t, t2;
1659
1660   devmsg ("[pollard-rho (%lu)] ", a);
1661
1662   mpz_inits (t, t2, nullptr);
1663   mpz_init_set_si (y, 2);
1664   mpz_init_set_si (x, 2);
1665   mpz_init_set_si (z, 2);
1666   mpz_init_set_ui (P, 1);
1667
1668   unsigned long long int k = 1;
1669   unsigned long long int l = 1;
1670
1671   while (mpz_cmp_ui (n, 1) != 0)
1672     {
1673       for (;;)
1674         {
1675           do
1676             {
1677               mpz_mul (t, x, x);
1678               mpz_mod (x, t, n);
1679               mpz_add_ui (x, x, a);
1680
1681               mpz_sub (t, z, x);
1682               mpz_mul (t2, P, t);
1683               mpz_mod (P, t2, n);
1684
1685               if (k % 32 == 1)
1686                 {
1687                   mpz_gcd (t, P, n);
1688                   if (mpz_cmp_ui (t, 1) != 0)
1689                     goto factor_found;
1690                   mpz_set (y, x);
1691                 }
1692             }
1693           while (--k != 0);
1694
1695           mpz_set (z, x);
1696           k = l;
1697           l = 2 * l;
1698           for (unsigned long long int i = 0; i < k; i++)
1699             {
1700               mpz_mul (t, x, x);
1701               mpz_mod (x, t, n);
1702               mpz_add_ui (x, x, a);
1703             }
1704           mpz_set (y, x);
1705         }
1706
1707     factor_found:
1708       do
1709         {
1710           mpz_mul (t, y, y);
1711           mpz_mod (y, t, n);
1712           mpz_add_ui (y, y, a);
1713
1714           mpz_sub (t, z, y);
1715           mpz_gcd (t, t, n);
1716         }
1717       while (mpz_cmp_ui (t, 1) == 0);
1718
1719       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1720
1721       if (!mp_prime_p (t))
1722         {
1723           devmsg ("[composite factor--restarting pollard-rho] ");
1724           mp_factor_using_pollard_rho (t, a + 1, factors);
1725         }
1726       else
1727         {
1728           mp_factor_insert (factors, t);
1729         }
1730
1731       if (mp_prime_p (n))
1732         {
1733           mp_factor_insert (factors, n);
1734           break;
1735         }
1736
1737       mpz_mod (x, x, n);
1738       mpz_mod (z, z, n);
1739       mpz_mod (y, y, n);
1740     }
1741
1742   mpz_clears (P, t2, t, z, x, y, nullptr);
1743 }
1744
1745 #if USE_SQUFOF
1746 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1747    algorithm is replaced, consider also returning the remainder.  */
1748 ATTRIBUTE_CONST
1749 static uintmax_t
1750 isqrt (uintmax_t n)
1751 {
1752   if (n == 0)
1753     return 0;
1754
1755   int c = stdc_leading_zeros (n);
1756
1757   /* Make x > sqrt(n).  This will be invariant through the loop.  */
1758   uintmax_t x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) >> 1);
1759
1760   for (;;)
1761     {
1762       uintmax_t y = (x + n / x) / 2;
1763       if (y >= x)
1764         return x;
1765
1766       x = y;
1767     }
1768 }
1769
1770 ATTRIBUTE_CONST
1771 static uintmax_t
1772 isqrt2 (uintmax_t nh, uintmax_t nl)
1773 {
1774   /* Ensures the remainder fits in an uintmax_t.  */
1775   affirm (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1776
1777   if (nh == 0)
1778     return isqrt (nl);
1779
1780   int shift = stdc_leading_zeros (nh) & ~1;
1781
1782   /* Make x > sqrt (n).  */
1783   uintmax_t x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1784   x <<= (W_TYPE_SIZE - shift) >> 1;
1785
1786   /* Do we need more than one iteration?  */
1787   for (;;)
1788     {
1789       MAYBE_UNUSED uintmax_t r;
1790       uintmax_t q, y;
1791       udiv_qrnnd (q, r, nh, nl, x);
1792       y = (x + q) / 2;
1793
1794       if (y >= x)
1795         {
1796           uintmax_t hi, lo;
1797           umul_ppmm (hi, lo, x + 1, x + 1);
1798           affirm (gt2 (hi, lo, nh, nl));
1799
1800           umul_ppmm (hi, lo, x, x);
1801           affirm (ge2 (nh, nl, hi, lo));
1802           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1803           affirm (hi == 0);
1804
1805           return x;
1806         }
1807
1808       x = y;
1809     }
1810 }
1811
1812 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N.  */
1813 # define MAGIC64 0x0202021202030213ULL
1814 # define MAGIC63 0x0402483012450293ULL
1815 # define MAGIC65 0x218a019866014613ULL
1816 # define MAGIC11 0x23b
1817
1818 /* Return the square root if the input is a square, otherwise 0.  */
1819 ATTRIBUTE_CONST
1820 static uintmax_t
1821 is_square (uintmax_t x)
1822 {
1823   /* Uses the tests suggested by Cohen.  Excludes 99% of the non-squares before
1824      computing the square root.  */
1825   if (((MAGIC64 >> (x & 63)) & 1)
1826       && ((MAGIC63 >> (x % 63)) & 1)
1827       /* Both 0 and 64 are squares mod (65).  */
1828       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1829       && ((MAGIC11 >> (x % 11) & 1)))
1830     {
1831       uintmax_t r = isqrt (x);
1832       if (r * r == x)
1833         return r;
1834     }
1835   return 0;
1836 }
1837
1838 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1839 static short const invtab[0x81] =
1840   {
1841     0x200,
1842     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1843     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1844     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1845     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1846     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1847     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1848     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1849     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1850     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1851     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1852     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1853     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1854     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1855     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1856     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1857     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1858   };
1859
1860 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1861    that q < 0x40; here it instead uses a table of (Euclidean) inverses.  */
1862 # define div_smallq(q, r, u, d)                                          \
1863   do {                                                                  \
1864     if ((u) / 0x40 < (d))                                               \
1865       {                                                                 \
1866         uintmax_t _dinv, _mask, _q, _r;                                 \
1867         int _cnt = stdc_leading_zeros (d);                              \
1868         _r = (u);                                                       \
1869         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1870           {                                                             \
1871             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1872             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1873           }                                                             \
1874         else                                                            \
1875           {                                                             \
1876             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1877             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1878           }                                                             \
1879         _r -= _q * (d);                                                 \
1880                                                                         \
1881         _mask = -(uintmax_t) (_r >= (d));                               \
1882         (r) = _r - (_mask & (d));                                       \
1883         (q) = _q - _mask;                                               \
1884         affirm ((q) * (d) + (r) == u);                                  \
1885       }                                                                 \
1886     else                                                                \
1887       {                                                                 \
1888         uintmax_t _q = (u) / (d);                                       \
1889         (r) = (u) - _q * (d);                                           \
1890         (q) = _q;                                                       \
1891       }                                                                 \
1892   } while (0)
1893
1894 /* Notes: Example N = 22117019.  After first phase we find Q1 = 6314, Q
1895    = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1896    with 3025 = 55^2.
1897
1898    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1899    representing G_0 = (-55, 2 * 4652, 8653).
1900
1901    In the notation of the paper:
1902
1903    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1904
1905    Put
1906
1907      t_0 = floor([q_0 + R_0] / S0) = 1
1908      R_1 = t_0 * S_0 - R_0 = 4001
1909      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1910 */
1911
1912 /* Multipliers, in order of efficiency:
1913    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1914    0.7317  3*5*7    =  105 = 1
1915    0.7820  3*5*11   =  165 = 1
1916    0.7872  3*5      =   15 = 3
1917    0.8101  3*7*11   =  231 = 3
1918    0.8155  3*7      =   21 = 1
1919    0.8284  5*7*11   =  385 = 1
1920    0.8339  5*7      =   35 = 3
1921    0.8716  3*11     =   33 = 1
1922    0.8774  3        =    3 = 3
1923    0.8913  5*11     =   55 = 3
1924    0.8972  5        =    5 = 1
1925    0.9233  7*11     =   77 = 1
1926    0.9295  7        =    7 = 3
1927    0.9934  11       =   11 = 3
1928 */
1929 # define QUEUE_SIZE 50
1930 #endif
1931
1932 #if STAT_SQUFOF
1933 # define Q_FREQ_SIZE 50
1934 /* Element 0 keeps the total */
1935 static int q_freq[Q_FREQ_SIZE + 1];
1936 #endif
1937
1938 #if USE_SQUFOF
1939 /* Return true on success.  Expected to fail only for numbers
1940    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit.  */
1941 static bool
1942 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1943 {
1944   /* Uses algorithm and notation from
1945
1946      SQUARE FORM FACTORIZATION
1947      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1948
1949      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1950    */
1951
1952   static short const multipliers_1[] =
1953     { /* = 1 (mod 4) */
1954       105, 165, 21, 385, 33, 5, 77, 1, 0
1955     };
1956   static short const multipliers_3[] =
1957     { /* = 3 (mod 4) */
1958       1155, 15, 231, 35, 3, 55, 7, 11, 0
1959     };
1960
1961   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1962
1963   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1964     return false;
1965
1966   uintmax_t sqrt_n = isqrt2 (n1, n0);
1967
1968   if (n0 == sqrt_n * sqrt_n)
1969     {
1970       uintmax_t p1, p0;
1971
1972       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1973       affirm (p0 == n0);
1974
1975       if (n1 == p1)
1976         {
1977           if (prime_p (sqrt_n))
1978             factor_insert_multiplicity (factors, sqrt_n, 2);
1979           else
1980             {
1981               struct factors f;
1982
1983               f.nfactors = 0;
1984               if (!factor_using_squfof (0, sqrt_n, &f))
1985                 {
1986                   /* Try pollard rho instead */
1987                   factor_using_pollard_rho (sqrt_n, 1, &f);
1988                 }
1989               /* Duplicate the new factors */
1990               for (unsigned int i = 0; i < f.nfactors; i++)
1991                 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
1992             }
1993           return true;
1994         }
1995     }
1996
1997   /* Select multipliers so we always get n * mu = 3 (mod 4) */
1998   for (short const *m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
1999        *m; m++)
2000     {
2001       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2002       unsigned int i;
2003       unsigned int mu = *m;
2004       int qpos = 0;
2005
2006       affirm (mu * n0 % 4 == 3);
2007
2008       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2009          get \Delta = 4 mu * n, and the paper's \mu is 2 mu.  As far as
2010          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2011          mu^3 < n.
2012
2013          However, this seems insufficient: With n = 37243139 and mu =
2014          105, we get a trivial factor, from the square 38809 = 197^2,
2015          without any corresponding Q earlier in the iteration.
2016
2017          Requiring 64 mu^3 < n seems sufficient.  */
2018       if (n1 == 0)
2019         {
2020           if ((uintmax_t) mu * mu * mu >= n0 / 64)
2021             continue;
2022         }
2023       else
2024         {
2025           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2026             continue;
2027         }
2028       umul_ppmm (Dh, Dl, n0, mu);
2029       Dh += n1 * mu;
2030
2031       affirm (Dl % 4 != 1);
2032       affirm (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2033
2034       S = isqrt2 (Dh, Dl);
2035
2036       Q1 = 1;
2037       P = S;
2038
2039       /* Square root remainder fits in one word, so ignore high part.  */
2040       Q = Dl - P * P;
2041       /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))?  */
2042       L = isqrt (2 * S);
2043       B = 2 * L;
2044       L1 = mu * 2 * L;
2045
2046       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2047          4 D.  */
2048
2049       for (i = 0; i <= B; i++)
2050         {
2051           uintmax_t q, P1, t, rem;
2052
2053           div_smallq (q, rem, S + P, Q);
2054           P1 = S - rem; /* P1 = q*Q - P */
2055
2056           affirm (q > 0 && Q > 0);
2057
2058 # if STAT_SQUFOF
2059           q_freq[0]++;
2060           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2061 # endif
2062
2063           if (Q <= L1)
2064             {
2065               uintmax_t g = Q;
2066
2067               if ((Q & 1) == 0)
2068                 g /= 2;
2069
2070               g /= gcd_odd (g, mu);
2071
2072               if (g <= L)
2073                 {
2074                   if (qpos >= QUEUE_SIZE)
2075                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2076                   queue[qpos].Q = g;
2077                   queue[qpos].P = P % g;
2078                   qpos++;
2079                 }
2080             }
2081
2082           /* I think the difference can be either sign, but mod
2083              2^W_TYPE_SIZE arithmetic should be fine.  */
2084           t = Q1 + q * (P - P1);
2085           Q1 = Q;
2086           Q = t;
2087           P = P1;
2088
2089           if ((i & 1) == 0)
2090             {
2091               uintmax_t r = is_square (Q);
2092               if (r)
2093                 {
2094                   for (int j = 0; j < qpos; j++)
2095                     {
2096                       if (queue[j].Q == r)
2097                         {
2098                           if (r == 1)
2099                             /* Traversed entire cycle.  */
2100                             goto next_multiplier;
2101
2102                           /* Need the absolute value for divisibility test.  */
2103                           if (P >= queue[j].P)
2104                             t = P - queue[j].P;
2105                           else
2106                             t = queue[j].P - P;
2107                           if (t % r == 0)
2108                             {
2109                               /* Delete entries up to and including entry
2110                                  j, which matched.  */
2111                               memmove (queue, queue + j + 1,
2112                                        (qpos - j - 1) * sizeof (queue[0]));
2113                               qpos -= (j + 1);
2114                             }
2115                           goto next_i;
2116                         }
2117                     }
2118
2119                   /* We have found a square form, which should give a
2120                      factor.  */
2121                   Q1 = r;
2122                   affirm (S >= P); /* What signs are possible?  */
2123                   P += r * ((S - P) / r);
2124
2125                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2126                      for the case D = 2N.  */
2127                   /* Compute Q = (D - P*P) / Q1, but we need double
2128                      precision.  */
2129                   uintmax_t hi, lo;
2130                   umul_ppmm (hi, lo, P, P);
2131                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2132                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2133                   affirm (rem == 0);
2134
2135                   for (;;)
2136                     {
2137                       /* Note: There appears to by a typo in the paper,
2138                          Step 4a in the algorithm description says q <--
2139                          floor([S+P]/\hat Q), but looking at the equations
2140                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2141                          (In this code, \hat Q is Q1).  */
2142                       div_smallq (q, rem, S + P, Q);
2143                       P1 = S - rem;     /* P1 = q*Q - P */
2144
2145 # if STAT_SQUFOF
2146                       q_freq[0]++;
2147                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2148 # endif
2149                       if (P == P1)
2150                         break;
2151                       t = Q1 + q * (P - P1);
2152                       Q1 = Q;
2153                       Q = t;
2154                       P = P1;
2155                     }
2156
2157                   if ((Q & 1) == 0)
2158                     Q /= 2;
2159                   Q /= gcd_odd (Q, mu);
2160
2161                   affirm (Q > 1 && (n1 || Q < n0));
2162
2163                   if (prime_p (Q))
2164                     factor_insert (factors, Q);
2165                   else if (!factor_using_squfof (0, Q, factors))
2166                     factor_using_pollard_rho (Q, 2, factors);
2167
2168                   divexact_21 (n1, n0, n1, n0, Q);
2169
2170                   if (prime2_p (n1, n0))
2171                     factor_insert_large (factors, n1, n0);
2172                   else
2173                     {
2174                       if (!factor_using_squfof (n1, n0, factors))
2175                         {
2176                           if (n1 == 0)
2177                             factor_using_pollard_rho (n0, 1, factors);
2178                           else
2179                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2180                         }
2181                     }
2182
2183                   return true;
2184                 }
2185             }
2186         next_i:;
2187         }
2188     next_multiplier:;
2189     }
2190   return false;
2191 }
2192 #endif
2193
2194 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2195    results in FACTORS.  */
2196 static void
2197 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2198 {
2199   factors->nfactors = 0;
2200   hiset (&factors->plarge, 0);
2201
2202   if (t1 == 0 && t0 < 2)
2203     return;
2204
2205   t0 = factor_using_division (&t1, t1, t0, factors);
2206
2207   if (t1 == 0 && t0 < 2)
2208     return;
2209
2210   if (prime2_p (t1, t0))
2211     factor_insert_large (factors, t1, t0);
2212   else
2213     {
2214 #if USE_SQUFOF
2215       if (factor_using_squfof (t1, t0, factors))
2216         return;
2217 #endif
2218
2219       if (t1 == 0)
2220         factor_using_pollard_rho (t0, 1, factors);
2221       else
2222         factor_using_pollard_rho2 (t1, t0, 1, factors);
2223     }
2224 }
2225
2226 /* Use Pollard-rho to compute the prime factors of
2227    arbitrary-precision T, and put the results in FACTORS.  */
2228 static void
2229 mp_factor (mpz_t t, struct mp_factors *factors)
2230 {
2231   mp_factor_init (factors);
2232
2233   if (mpz_sgn (t) != 0)
2234     {
2235       mp_factor_using_division (t, factors);
2236
2237       if (mpz_cmp_ui (t, 1) != 0)
2238         {
2239           devmsg ("[is number prime?] ");
2240           if (mp_prime_p (t))
2241             mp_factor_insert (factors, t);
2242           else
2243             mp_factor_using_pollard_rho (t, 1, factors);
2244         }
2245     }
2246 }
2247
2248 static strtol_error
2249 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2250 {
2251   int lo_carry;
2252   uintmax_t hi = 0, lo = 0;
2253
2254   strtol_error err = LONGINT_INVALID;
2255
2256   /* Initial scan for invalid digits.  */
2257   char const *p = s;
2258   for (;;)
2259     {
2260       unsigned char c = *p++;
2261       if (c == 0)
2262         break;
2263
2264       if (UNLIKELY (!ISDIGIT (c)))
2265         {
2266           err = LONGINT_INVALID;
2267           break;
2268         }
2269
2270       err = LONGINT_OK;           /* we've seen at least one valid digit */
2271     }
2272
2273   while (err == LONGINT_OK)
2274     {
2275       unsigned char c = *s++;
2276       if (c == 0)
2277         break;
2278
2279       c -= '0';
2280
2281       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2282         {
2283           err = LONGINT_OVERFLOW;
2284           break;
2285         }
2286       hi = 10 * hi;
2287
2288       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2289       lo_carry += 10 * lo < 2 * lo;
2290
2291       lo = 10 * lo;
2292       lo += c;
2293
2294       lo_carry += lo < c;
2295       hi += lo_carry;
2296       if (UNLIKELY (hi < lo_carry))
2297         {
2298           err = LONGINT_OVERFLOW;
2299           break;
2300         }
2301     }
2302
2303   *hip = hi;
2304   *lop = lo;
2305
2306   return err;
2307 }
2308
2309 /* Structure and routines for buffering and outputting full lines,
2310    to support parallel operation efficiently.  */
2311 static struct lbuf_
2312 {
2313   char *buf;
2314   char *end;
2315 } lbuf;
2316
2317 /* 512 is chosen to give good performance,
2318    and also is the max guaranteed size that
2319    consumers can read atomically through pipes.
2320    Also it's big enough to cater for max line length
2321    even with 128 bit uintmax_t.  */
2322 #define FACTOR_PIPE_BUF 512
2323
2324 static void
2325 lbuf_alloc (void)
2326 {
2327   if (lbuf.buf)
2328     return;
2329
2330   /* Double to ensure enough space for
2331      previous numbers + next number.  */
2332   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2333   lbuf.end = lbuf.buf;
2334 }
2335
2336 /* Write complete LBUF to standard output.  */
2337 static void
2338 lbuf_flush (void)
2339 {
2340   size_t size = lbuf.end - lbuf.buf;
2341   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2342     write_error ();
2343   lbuf.end = lbuf.buf;
2344 }
2345
2346 /* Add a character C to LBUF and if it's a newline
2347    and enough bytes are already buffered,
2348    then write atomically to standard output.  */
2349 static void
2350 lbuf_putc (char c)
2351 {
2352   *lbuf.end++ = c;
2353
2354   if (c == '\n')
2355     {
2356       size_t buffered = lbuf.end - lbuf.buf;
2357
2358       /* Provide immediate output for interactive use.  */
2359       static int line_buffered = -1;
2360       if (line_buffered == -1)
2361         line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2362       if (line_buffered)
2363         lbuf_flush ();
2364       else if (buffered >= FACTOR_PIPE_BUF)
2365         {
2366           /* Write output in <= PIPE_BUF chunks
2367              so consumers can read atomically.  */
2368           char const *tend = lbuf.end;
2369
2370           /* Since a umaxint_t's factors must fit in 512
2371              we're guaranteed to find a newline here.  */
2372           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2373           while (*--tlend != '\n');
2374           tlend++;
2375
2376           lbuf.end = tlend;
2377           lbuf_flush ();
2378
2379           /* Buffer the remainder.  */
2380           memcpy (lbuf.buf, tlend, tend - tlend);
2381           lbuf.end = lbuf.buf + (tend - tlend);
2382         }
2383     }
2384 }
2385
2386 /* Buffer an int to the internal LBUF.  */
2387 static void
2388 lbuf_putint (uintmax_t i, size_t min_width)
2389 {
2390   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2391   char const *umaxstr = umaxtostr (i, buf);
2392   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2393   size_t z = width;
2394
2395   for (; z < min_width; z++)
2396     *lbuf.end++ = '0';
2397
2398   memcpy (lbuf.end, umaxstr, width);
2399   lbuf.end += width;
2400 }
2401
2402 static void
2403 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2404 {
2405   uintmax_t q, r;
2406
2407   if (t1 == 0)
2408     lbuf_putint (t0, 0);
2409   else
2410     {
2411       /* Use very plain code here since it seems hard to write fast code
2412          without assuming a specific word size.  */
2413       q = t1 / 1000000000;
2414       r = t1 % 1000000000;
2415       udiv_qrnnd (t0, r, r, t0, 1000000000);
2416       print_uintmaxes (q, t0);
2417       lbuf_putint (r, 9);
2418     }
2419 }
2420
2421 /* Single-precision factoring */
2422 static void
2423 print_factors_single (uintmax_t t1, uintmax_t t0)
2424 {
2425   struct factors factors;
2426
2427   print_uintmaxes (t1, t0);
2428   lbuf_putc (':');
2429
2430   factor (t1, t0, &factors);
2431
2432   for (int j = 0; j < factors.nfactors; j++)
2433     for (int k = 0; k < factors.e[j]; k++)
2434       {
2435         lbuf_putc (' ');
2436         print_uintmaxes (0, factors.p[j]);
2437         if (print_exponents && factors.e[j] > 1)
2438           {
2439             lbuf_putc ('^');
2440             lbuf_putint (factors.e[j], 0);
2441             break;
2442           }
2443       }
2444
2445   if (hi (factors.plarge))
2446     {
2447       lbuf_putc (' ');
2448       print_uintmaxes (hi (factors.plarge), lo (factors.plarge));
2449     }
2450
2451   lbuf_putc ('\n');
2452 }
2453
2454 /* Emit the factors of the indicated number.  If we have the option of using
2455    either algorithm, we select on the basis of the length of the number.
2456    For longer numbers, we prefer the MP algorithm even if the native algorithm
2457    has enough digits, because the algorithm is better.  The turnover point
2458    depends on the value.  */
2459 static bool
2460 print_factors (char const *input)
2461 {
2462   /* Skip initial spaces and '+'.  */
2463   char const *str = input;
2464   while (*str == ' ')
2465     str++;
2466   str += *str == '+';
2467
2468   uintmax_t t1, t0;
2469
2470   /* Try converting the number to one or two words.  If it fails, use GMP or
2471      print an error message.  The 2nd condition checks that the most
2472      significant bit of the two-word number is clear, in a typesize neutral
2473      way.  */
2474   strtol_error err = strto2uintmax (&t1, &t0, str);
2475
2476   switch (err)
2477     {
2478     case LONGINT_OK:
2479       if (((t1 << 1) >> 1) == t1)
2480         {
2481           devmsg ("[using single-precision arithmetic] ");
2482           print_factors_single (t1, t0);
2483           return true;
2484         }
2485       break;
2486
2487     case LONGINT_OVERFLOW:
2488       /* Try GMP.  */
2489       break;
2490
2491     default:
2492       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2493       return false;
2494     }
2495
2496   devmsg ("[using arbitrary-precision arithmetic] ");
2497   mpz_t t;
2498   struct mp_factors factors;
2499
2500   mpz_init_set_str (t, str, 10);
2501
2502   mpz_out_str (stdout, 10, t);
2503   putchar (':');
2504   mp_factor (t, &factors);
2505
2506   for (idx_t j = 0; j < factors.nfactors; j++)
2507     for (unsigned long int k = 0; k < factors.e[j]; k++)
2508       {
2509         putchar (' ');
2510         mpz_out_str (stdout, 10, factors.p[j]);
2511         if (print_exponents && factors.e[j] > 1)
2512           {
2513             printf ("^%lu", factors.e[j]);
2514             break;
2515           }
2516       }
2517
2518   mp_factor_clear (&factors);
2519   mpz_clear (t);
2520   putchar ('\n');
2521   fflush (stdout);
2522   return true;
2523 }
2524
2525 void
2526 usage (int status)
2527 {
2528   if (status != EXIT_SUCCESS)
2529     emit_try_help ();
2530   else
2531     {
2532       printf (_("\
2533 Usage: %s [OPTION] [NUMBER]...\n\
2534 "),
2535               program_name);
2536       fputs (_("\
2537 Print the prime factors of each specified integer NUMBER.  If none\n\
2538 are specified on the command line, read them from standard input.\n\
2539 \n\
2540 "), stdout);
2541       fputs ("\
2542   -h, --exponents   print repeated factors in form p^e unless e is 1\n\
2543 ", stdout);
2544       fputs (HELP_OPTION_DESCRIPTION, stdout);
2545       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2546       emit_ancillary_info (PROGRAM_NAME);
2547     }
2548   exit (status);
2549 }
2550
2551 static bool
2552 do_stdin (void)
2553 {
2554   bool ok = true;
2555   token_buffer tokenbuffer;
2556
2557   init_tokenbuffer (&tokenbuffer);
2558
2559   while (true)
2560     {
2561       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2562                                        &tokenbuffer);
2563       if (token_length == (size_t) -1)
2564         {
2565           if (ferror (stdin))
2566             error (EXIT_FAILURE, errno, _("error reading input"));
2567           break;
2568         }
2569
2570       ok &= print_factors (tokenbuffer.buffer);
2571     }
2572   free (tokenbuffer.buffer);
2573
2574   return ok;
2575 }
2576
2577 int
2578 main (int argc, char **argv)
2579 {
2580   initialize_main (&argc, &argv);
2581   set_program_name (argv[0]);
2582   setlocale (LC_ALL, "");
2583   bindtextdomain (PACKAGE, LOCALEDIR);
2584   textdomain (PACKAGE);
2585
2586   lbuf_alloc ();
2587   atexit (close_stdout);
2588   atexit (lbuf_flush);
2589
2590   int c;
2591   while ((c = getopt_long (argc, argv, "h", long_options, nullptr)) != -1)
2592     {
2593       switch (c)
2594         {
2595         case 'h':  /* NetBSD used -h for this functionality first.  */
2596           print_exponents = true;
2597           break;
2598
2599         case DEV_DEBUG_OPTION:
2600           dev_debug = true;
2601           break;
2602
2603         case_GETOPT_HELP_CHAR;
2604
2605         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2606
2607         default:
2608           usage (EXIT_FAILURE);
2609         }
2610     }
2611
2612 #if STAT_SQUFOF
2613   memset (q_freq, 0, sizeof (q_freq));
2614 #endif
2615
2616   bool ok;
2617   if (argc <= optind)
2618     ok = do_stdin ();
2619   else
2620     {
2621       ok = true;
2622       for (int i = optind; i < argc; i++)
2623         if (! print_factors (argv[i]))
2624           ok = false;
2625     }
2626
2627 #if STAT_SQUFOF
2628   if (q_freq[0] > 0)
2629     {
2630       double acc_f;
2631       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2632       for (int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2633         {
2634           double f = (double) q_freq[i] / q_freq[0];
2635           acc_f += f;
2636           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2637                   100.0 * f, 100.0 * acc_f);
2638         }
2639     }
2640 #endif
2641
2642   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2643 }