src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2024 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organization:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38   Algorithm:
  39
  40     (1) Perform trial division using a small primes table, but without hardware
  41         division since the primes table store inverses modulo the word base.
  42         (The GMP variant of this code doesn't make use of the precomputed
  43         inverses, but instead relies on GMP for fast divisibility testing.)
  44     (2) Check the nature of any non-factored part using Miller-Rabin for
  45         detecting composites, and Lucas for detecting primes.
  46     (3) Factor any remaining composite part using the Pollard-Brent rho
  47         algorithm or if USE_SQUFOF is defined to 1, try that first.
  48         Status of found factors are checked again using Miller-Rabin and Lucas.
  49
  50     We prefer using Hensel norm in the divisions, not the more familiar
  51     Euclidean norm, since the former leads to much faster code.  In the
  52     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  53     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  54     reductions mod n.
  55
  56     The GMP code uses an algorithm that can be considerably slower;
  57     for example, on a circa-2017 Intel Xeon Silver 4116, factoring
  58     2^{127}-3 takes about 50 ms with the two-word algorithm but would
  59     take about 750 ms with the GMP code.
  60
  61   Improvements:
  62
  63     * Use modular inverses also for exact division in the Lucas code, and
  64       elsewhere.  A problem is to locate the inverses not from an index, but
  65       from a prime.  We might instead compute the inverse on-the-fly.
  66
  67     * Tune trial division table size (not forgetting that this is a standalone
  68       program where the table will be read from secondary storage for
  69       each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidean)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdbit.h>
 107 #include <stdio.h>
 108 #include <gmp.h>
 109
 110 #include "system.h"
 111 #include "assure.h"
 112 #include "full-write.h"
 113 #include "quote.h"
 114 #include "readtokens.h"
 115 #include "xstrtol.h"
 116
 117 /* The official name of this program (e.g., no 'g' prefix).  */
 118 #define PROGRAM_NAME "factor"
 119
 120 #define AUTHORS \
 121   proper_name ("Paul Rubin"),                                           \
 122   proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 123   proper_name_lite ("Niels Moller", "Niels M\303\266ller")
 124
 125 /* Token delimiters when reading from a file.  */
 126 #define DELIM "\n\t "
 127
 128 #ifndef USE_LONGLONG_H
 129 /* With the way we use longlong.h, it's only safe to use
 130    when UWtype = UHWtype, as there were various cases
 131    (as can be seen in the history for longlong.h) where
 132    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 133    to avoid compile time or run time issues.  */
 134 # if LONG_MAX == INTMAX_MAX
 135 #  define USE_LONGLONG_H 1
 136 # endif
 137 #endif
 138
 139 #define W_TYPE_SIZE UINTMAX_WIDTH
 140
 141 #if USE_LONGLONG_H
 142
 143 /* Make definitions for longlong.h to make it do what it can do for us */
 144
 145 # define UWtype  uintmax_t
 146 # define UHWtype unsigned long int
 147 # undef UDWtype
 148 # if HAVE_ATTRIBUTE_MODE
 149 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 150 typedef          int SItype     __attribute__ ((mode (SI)));
 151 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 152 typedef          int DItype     __attribute__ ((mode (DI)));
 153 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 154 # else
 155 typedef unsigned char UQItype;
 156 typedef          long SItype;
 157 typedef unsigned long int USItype;
 158 #  if HAVE_LONG_LONG_INT
 159 typedef long long int DItype;
 160 typedef unsigned long long int UDItype;
 161 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 162 typedef long int DItype;
 163 typedef unsigned long int UDItype;
 164 #  endif
 165 # endif
 166 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 167 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 168 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 169 # ifndef __GMP_GNUC_PREREQ
 170 #  define __GMP_GNUC_PREREQ(a,b) 1
 171 # endif
 172
 173 /* longlong.h uses these macros only in certain system compiler combinations.
 174    Ensure usage to pacify -Wunused-macros.  */
 175 # if defined ASSERT || defined UHWtype || defined __GMP_DECLSPEC
 176 # endif
 177
 178 # if _ARCH_PPC
 179 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 180 # endif
 181 # include "longlong.h"
 182
 183 #else /* not USE_LONGLONG_H */
 184
 185 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 186 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 187 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 188
 189 #endif
 190
 191 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 192 #define MAX_NFACTS 26
 193
 194 enum
 195 {
 196   DEV_DEBUG_OPTION = CHAR_MAX + 1
 197 };
 198
 199 static struct option const long_options[] =
 200 {
 201   {"exponents", no_argument, nullptr, 'h'},
 202   {"-debug", no_argument, nullptr, DEV_DEBUG_OPTION},
 203   {GETOPT_HELP_OPTION_DECL},
 204   {GETOPT_VERSION_OPTION_DECL},
 205   {nullptr, 0, nullptr, 0}
 206 };
 207
 208 /* If true, use p^e output format.  */
 209 static bool print_exponents;
 210
 211 /* This represents an unsigned integer twice as wide as uintmax_t.  */
 212 typedef struct { uintmax_t uu[2]; } uuint;
 213
 214 /* Accessors and constructors for the type.  Pprograms should not
 215    access the type's internals directly, in case some future version
 216    replaces the type with unsigned __int128 or whatever.  */
 217 static uintmax_t lo (uuint u) { return u.uu[0]; }
 218 static uintmax_t hi (uuint u) { return u.uu[1]; }
 219 static void hiset (uuint *u, uintmax_t hi) { u->uu[1] = hi; }
 220 static void
 221 uuset (uintmax_t *phi, uintmax_t *plo, uuint uu)
 222 {
 223   *phi = hi (uu);
 224   *plo = lo (uu);
 225 }
 226 static uuint
 227 make_uuint (uintmax_t hi, uintmax_t lo)
 228 {
 229   return (uuint) {{lo, hi}};
 230 }
 231
 232 /* BIG_POWER_OF_10 is a positive power of 10 that does not exceed UINTMAX_MAX.
 233    The larger it is, the more efficient the code will likely be.
 234    LOG_BIG_POWER_OF_10 = log (BIG_POWER_OF_10).  */
 235 #if UINTMAX_WIDTH < 64
 236 # error "platform does not support 64-bit integers"
 237 #elif UINTMAX_WIDTH < 128 || !defined UINTMAX_C
 238 /* Mainstream platforms as of 2024, with at-least-64-bit uintmax_t.  */
 239 static uintmax_t const BIG_POWER_OF_10 = 10000000000000000000llu;
 240 enum { LOG_BIG_POWER_OF_10 = 19 };
 241 #else
 242 /* For so-far-only-theoretical platforms with at-least-128-bit uintmax_t.
 243    This is for performance; the 64-bit mainstream code will still work.  */
 244 static uintmax_t const BIG_POWER_OF_10 =
 245   UINTMAX_C (100000000000000000000000000000000000000);
 246 enum { LOG_BIG_POWER_OF_10 = 38 };
 247 #endif
 248
 249 struct factors
 250 {
 251   uuint plarge; /* Can have a single large factor */
 252   uintmax_t     p[MAX_NFACTS];
 253   unsigned char e[MAX_NFACTS];
 254   unsigned char nfactors;
 255 };
 256
 257 struct mp_factors
 258 {
 259   mpz_t             *p;
 260   unsigned long int *e;
 261   idx_t nfactors;
 262   idx_t nalloc;
 263 };
 264
 265 static void factor (uintmax_t, uintmax_t, struct factors *);
 266
 267 #ifndef umul_ppmm
 268 # define umul_ppmm(w1, w0, u, v)                                        \
 269   do {                                                                  \
 270     uintmax_t __x0, __x1, __x2, __x3;                                   \
 271     unsigned long int __ul, __vl, __uh, __vh;                           \
 272     uintmax_t __u = (u), __v = (v);                                     \
 273                                                                         \
 274     __ul = __ll_lowpart (__u);                                          \
 275     __uh = __ll_highpart (__u);                                         \
 276     __vl = __ll_lowpart (__v);                                          \
 277     __vh = __ll_highpart (__v);                                         \
 278                                                                         \
 279     __x0 = (uintmax_t) __ul * __vl;                                     \
 280     __x1 = (uintmax_t) __ul * __vh;                                     \
 281     __x2 = (uintmax_t) __uh * __vl;                                     \
 282     __x3 = (uintmax_t) __uh * __vh;                                     \
 283                                                                         \
 284     __x1 += __ll_highpart (__x0);/* This can't give carry.  */          \
 285     __x1 += __x2;               /* But this indeed can.  */             \
 286     if (__x1 < __x2)            /* Did we get it?  */                   \
 287       __x3 += __ll_B;           /* Yes, add it in the proper pos.  */   \
 288                                                                         \
 289     (w1) = __x3 + __ll_highpart (__x1);                                 \
 290     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 291   } while (0)
 292 #endif
 293
 294 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 295 /* Define our own, not needing normalization.  This function is
 296    currently not performance critical, so keep it simple.  Similar to
 297    the mod macro below.  */
 298 # undef udiv_qrnnd
 299 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 300   do {                                                                  \
 301     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 302                                                                         \
 303     __d1 = (d); __d0 = 0;                                               \
 304     __r1 = (n1); __r0 = (n0);                                           \
 305     affirm (__r1 < __d1);                                               \
 306     __q = 0;                                                            \
 307     for (int __i = W_TYPE_SIZE; __i > 0; __i--)                         \
 308       {                                                                 \
 309         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 310         __q <<= 1;                                                      \
 311         if (ge2 (__r1, __r0, __d1, __d0))                               \
 312           {                                                             \
 313             __q++;                                                      \
 314             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 315           }                                                             \
 316       }                                                                 \
 317     (r) = __r0;                                                         \
 318     (q) = __q;                                                          \
 319   } while (0)
 320 #endif
 321
 322 #if !defined add_ssaaaa
 323 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 324   do {                                                                  \
 325     uintmax_t _add_x;                                                   \
 326     _add_x = (al) + (bl);                                               \
 327     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 328     (sl) = _add_x;                                                      \
 329   } while (0)
 330 #endif
 331
 332 /* Set (rh,rl) = (ah,al) >> cnt, where 0 < cnt < W_TYPE_SIZE.  */
 333 #define rsh2(rh, rl, ah, al, cnt)                                       \
 334   do {                                                                  \
 335     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 336     (rh) = (ah) >> (cnt);                                               \
 337   } while (0)
 338
 339 /* Set (rh,rl) = (ah,al) << cnt, where 0 < cnt < W_TYPE_SIZE.  */
 340 #define lsh2(rh, rl, ah, al, cnt)                                       \
 341   do {                                                                  \
 342     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 343     (rl) = (al) << (cnt);                                               \
 344   } while (0)
 345
 346 #define ge2(ah, al, bh, bl)                                             \
 347   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 348
 349 #define gt2(ah, al, bh, bl)                                             \
 350   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 351
 352 #ifndef sub_ddmmss
 353 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 354   do {                                                                  \
 355     uintmax_t _cy;                                                      \
 356     _cy = (al) < (bl);                                                  \
 357     (rl) = (al) - (bl);                                                 \
 358     (rh) = (ah) - (bh) - _cy;                                           \
 359   } while (0)
 360 #endif
 361
 362 /* Requires that a < n and b <= n */
 363 #define submod(r,a,b,n)                                                 \
 364   do {                                                                  \
 365     uintmax_t _t = - (uintmax_t) (a < b);                               \
 366     (r) = ((n) & _t) + (a) - (b);                                       \
 367   } while (0)
 368
 369 #define addmod(r,a,b,n)                                                 \
 370   submod ((r), (a), ((n) - (b)), (n))
 371
 372 /* Modular two-word addition and subtraction.  For performance reasons, the
 373    most significant bit of n1 must be clear.  The destination variables must be
 374    distinct from the mod operand.  */
 375 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 376   do {                                                                  \
 377     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 378     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 379       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 380   } while (0)
 381 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 382   do {                                                                  \
 383     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 384     if ((intmax_t) (r1) < 0)                                            \
 385       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 386   } while (0)
 387
 388 #define HIGHBIT_TO_MASK(x)                                              \
 389   (((intmax_t)-1 >> 1) < 0                                              \
 390    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 391    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 392       ? UINTMAX_MAX : (uintmax_t) 0))
 393
 394 /* Return r = a mod d, where a = <a1,a0>, d = <d1,d0>.
 395    Requires that d1 != 0.  */
 396 ATTRIBUTE_PURE static uuint
 397 mod2 (uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 398 {
 399   affirm (d1 != 0);
 400
 401   if (a1)
 402     {
 403       int cntd = stdc_leading_zeros (d1);
 404       int cnta = stdc_leading_zeros (a1);
 405       int cnt = cntd - cnta;
 406       if (0 < cnt)
 407         {
 408           lsh2 (d1, d0, d1, d0, cnt);
 409           for (int i = 0; i < cnt; i++)
 410             {
 411               if (ge2 (a1, a0, d1, d0))
 412                 sub_ddmmss (a1, a0, a1, a0, d1, d0);
 413               rsh2 (d1, d0, d1, d0, 1);
 414             }
 415         }
 416     }
 417
 418   return make_uuint (a1, a0);
 419 }
 420
 421 ATTRIBUTE_CONST
 422 static uintmax_t
 423 gcd_odd (uintmax_t a, uintmax_t b)
 424 {
 425   if ((b & 1) == 0)
 426     {
 427       uintmax_t t = b;
 428       b = a;
 429       a = t;
 430     }
 431   if (a == 0)
 432     return b;
 433
 434   /* Take out least significant one bit, to make room for sign */
 435   b >>= 1;
 436
 437   for (;;)
 438     {
 439       uintmax_t t;
 440       uintmax_t bgta;
 441
 442       assume (a);
 443       a >>= stdc_trailing_zeros (a);
 444       a >>= 1;
 445
 446       t = a - b;
 447       if (t == 0)
 448         return (a << 1) + 1;
 449
 450       bgta = HIGHBIT_TO_MASK (t);
 451
 452       /* b <-- min (a, b) */
 453       b += (bgta & t);
 454
 455       /* a <-- |a - b| */
 456       a = (t ^ bgta) - bgta;
 457     }
 458 }
 459
 460 ATTRIBUTE_PURE static uuint
 461 gcd2_odd (uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 462 {
 463   affirm (b0 & 1);
 464
 465   if ((a0 | a1) == 0)
 466     return make_uuint (b1, b0);
 467   if (!a0)
 468     a0 = a1, a1 = 0;
 469   assume (a0);
 470   int ctz = stdc_trailing_zeros (a0);
 471   if (ctz)
 472     rsh2 (a1, a0, a1, a0, ctz);
 473
 474   for (;;)
 475     {
 476       if ((b1 | a1) == 0)
 477         return make_uuint (0, gcd_odd (b0, a0));
 478
 479       if (gt2 (a1, a0, b1, b0))
 480         {
 481           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 482           if (!a0)
 483             a0 = a1, a1 = 0;
 484           assume (a0);
 485           ctz = stdc_trailing_zeros (a0);
 486           if (ctz)
 487             rsh2 (a1, a0, a1, a0, ctz);
 488         }
 489       else if (gt2 (b1, b0, a1, a0))
 490         {
 491           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 492           if (!b0)
 493             b0 = b1, b1 = 0;
 494           assume (b0);
 495           ctz = stdc_trailing_zeros (b0);
 496           if (ctz)
 497             rsh2 (b1, b0, b1, b0, ctz);
 498         }
 499       else
 500         break;
 501     }
 502
 503   return make_uuint (a1, a0);
 504 }
 505
 506 static void
 507 factor_insert_multiplicity (struct factors *factors,
 508                             uintmax_t prime, int m)
 509 {
 510   int nfactors = factors->nfactors;
 511   uintmax_t *p = factors->p;
 512   unsigned char *e = factors->e;
 513
 514   /* Locate position for insert new or increment e.  */
 515   int i;
 516   for (i = nfactors - 1; i >= 0; i--)
 517     {
 518       if (p[i] <= prime)
 519         break;
 520     }
 521
 522   if (i < 0 || p[i] != prime)
 523     {
 524       for (int j = nfactors - 1; j > i; j--)
 525         {
 526           p[j + 1] = p[j];
 527           e[j + 1] = e[j];
 528         }
 529       p[i + 1] = prime;
 530       e[i + 1] = m;
 531       factors->nfactors = nfactors + 1;
 532     }
 533   else
 534     {
 535       e[i] += m;
 536     }
 537 }
 538
 539 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 540
 541 static void
 542 factor_insert_large (struct factors *factors,
 543                      uintmax_t p1, uintmax_t p0)
 544 {
 545   if (p1 > 0)
 546     {
 547       affirm (hi (factors->plarge) == 0);
 548       factors->plarge = make_uuint (p1, p0);
 549     }
 550   else
 551     factor_insert (factors, p0);
 552 }
 553
 554 #ifndef mpz_inits
 555
 556 # include <stdarg.h>
 557
 558 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 559 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 560
 561 static void
 562 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 563 {
 564   va_list ap;
 565
 566   va_start (ap, mpz_single_init);
 567
 568   mpz_t *mpz;
 569   while ((mpz = va_arg (ap, mpz_t *)))
 570     mpz_single_init (*mpz);
 571
 572   va_end (ap);
 573 }
 574 #endif
 575
 576 static void mp_factor (mpz_t, struct mp_factors *);
 577
 578 static void
 579 mp_factor_init (struct mp_factors *factors)
 580 {
 581   factors->p = nullptr;
 582   factors->e = nullptr;
 583   factors->nfactors = 0;
 584   factors->nalloc = 0;
 585 }
 586
 587 static void
 588 mp_factor_clear (struct mp_factors *factors)
 589 {
 590   for (idx_t i = 0; i < factors->nfactors; i++)
 591     mpz_clear (factors->p[i]);
 592
 593   free (factors->p);
 594   free (factors->e);
 595 }
 596
 597 static void
 598 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 599 {
 600   idx_t nfactors = factors->nfactors;
 601   mpz_t *p = factors->p;
 602   unsigned long int *e = factors->e;
 603   ptrdiff_t i;
 604
 605   /* Locate position for insert new or increment e.  */
 606   for (i = nfactors - 1; i >= 0; i--)
 607     {
 608       if (mpz_cmp (p[i], prime) <= 0)
 609         break;
 610     }
 611
 612   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 613     {
 614       if (factors->nfactors == factors->nalloc)
 615         {
 616           p = xpalloc (p, &factors->nalloc, 1, -1, sizeof *p);
 617           e = xireallocarray (e, factors->nalloc, sizeof *e);
 618         }
 619
 620       mpz_init (p[nfactors]);
 621       for (ptrdiff_t j = nfactors - 1; j > i; j--)
 622         {
 623           mpz_set (p[j + 1], p[j]);
 624           e[j + 1] = e[j];
 625         }
 626       mpz_set (p[i + 1], prime);
 627       e[i + 1] = 1;
 628
 629       factors->p = p;
 630       factors->e = e;
 631       factors->nfactors = nfactors + 1;
 632     }
 633   else
 634     {
 635       e[i] += 1;
 636     }
 637 }
 638
 639 static void
 640 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 641 {
 642   mpz_t pz;
 643
 644   mpz_init_set_ui (pz, prime);
 645   mp_factor_insert (factors, pz);
 646   mpz_clear (pz);
 647 }
 648
 649
 650 #define P(a,b,c,d) a,
 651 static const unsigned char primes_diff[] = {
 652 #include "primes.h"
 653 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 654 };
 655 #undef P
 656
 657 #define PRIMES_PTAB_ENTRIES (ARRAY_CARDINALITY (primes_diff) - 8 + 1)
 658
 659 #define P(a,b,c,d) b,
 660 static const unsigned char primes_diff8[] = {
 661 #include "primes.h"
 662 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 663 };
 664 #undef P
 665
 666 struct primes_dtab
 667 {
 668   uintmax_t binv, lim;
 669 };
 670
 671 #define P(a,b,c,d) {c,d},
 672 static const struct primes_dtab primes_dtab[] = {
 673 #include "primes.h"
 674 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 675 };
 676 #undef P
 677
 678 /* Verify that uintmax_t is not wider than
 679    the integers used to generate primes.h.  */
 680 static_assert (UINTMAX_WIDTH <= WIDE_UINT_BITS);
 681
 682 /* debugging for developers.  Enables devmsg().
 683    This flag is used only in the GMP code.  */
 684 static bool dev_debug = false;
 685
 686 /* Prove primality or run probabilistic tests.  */
 687 static bool flag_prove_primality = PROVE_PRIMALITY;
 688
 689 /* Number of Miller-Rabin tests to run when not proving primality.  */
 690 #define MR_REPS 25
 691
 692 static void
 693 factor_insert_refind (struct factors *factors, uintmax_t p, int i, int off)
 694 {
 695   for (int j = 0; j < off; j++)
 696     p += primes_diff[i + j];
 697   factor_insert (factors, p);
 698 }
 699
 700 /* Trial division with odd primes uses the following trick.
 701
 702    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}.  For simplicity,
 703    consider the case t < B (this is the second loop below).
 704
 705    From our tables we get
 706
 707      binv = p^{-1} (mod B)
 708      lim = floor ((B-1) / p).
 709
 710    First assume that t is a multiple of p, t = q * p.  Then 0 <= q <= lim
 711    (and all quotients in this range occur for some t).
 712
 713    Then t = q * p is true also (mod B), and p is invertible we get
 714
 715      q = t * binv (mod B).
 716
 717    Next, assume that t is *not* divisible by p.  Since multiplication
 718    by binv (mod B) is a one-to-one mapping,
 719
 720      t * binv (mod B) > lim,
 721
 722    because all the smaller values are already taken.
 723
 724    This can be summed up by saying that the function
 725
 726      q(t) = binv * t (mod B)
 727
 728    is a permutation of the range 0 <= t < B, with the curious property
 729    that it maps the multiples of p onto the range 0 <= q <= lim, in
 730    order, and the non-multiples of p onto the range lim < q < B.
 731  */
 732
 733 static uuint
 734 factor_using_division (uintmax_t t1, uintmax_t t0,
 735                        struct factors *factors)
 736 {
 737   if (t0 % 2 == 0)
 738     {
 739       int cnt;
 740
 741       if (t0 == 0)
 742         {
 743           assume (t1);
 744           cnt = stdc_trailing_zeros (t1);
 745           t0 = t1 >> cnt;
 746           t1 = 0;
 747           cnt += W_TYPE_SIZE;
 748         }
 749       else
 750         {
 751           cnt = stdc_trailing_zeros (t0);
 752           rsh2 (t1, t0, t1, t0, cnt);
 753         }
 754
 755       factor_insert_multiplicity (factors, 2, cnt);
 756     }
 757
 758   uintmax_t p = 3;
 759   idx_t i;
 760   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 761     {
 762       for (;;)
 763         {
 764           uintmax_t q1, q0, hi;
 765           MAYBE_UNUSED uintmax_t lo;
 766
 767           q0 = t0 * primes_dtab[i].binv;
 768           umul_ppmm (hi, lo, q0, p);
 769           if (hi > t1)
 770             break;
 771           hi = t1 - hi;
 772           q1 = hi * primes_dtab[i].binv;
 773           if (LIKELY (q1 > primes_dtab[i].lim))
 774             break;
 775           t1 = q1; t0 = q0;
 776           factor_insert (factors, p);
 777         }
 778       p += primes_diff[i + 1];
 779     }
 780
 781 #define DIVBLOCK(I)                                                     \
 782   do {                                                                  \
 783     for (;;)                                                            \
 784       {                                                                 \
 785         q = t0 * pd[I].binv;                                            \
 786         if (LIKELY (q > pd[I].lim))                                     \
 787           break;                                                        \
 788         t0 = q;                                                         \
 789         factor_insert_refind (factors, p, i + 1, I);                    \
 790       }                                                                 \
 791   } while (0)
 792
 793   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 794     {
 795       uintmax_t q;
 796       const struct primes_dtab *pd = &primes_dtab[i];
 797       DIVBLOCK (0);
 798       DIVBLOCK (1);
 799       DIVBLOCK (2);
 800       DIVBLOCK (3);
 801       DIVBLOCK (4);
 802       DIVBLOCK (5);
 803       DIVBLOCK (6);
 804       DIVBLOCK (7);
 805
 806       p += primes_diff8[i];
 807       if (p * p > t0)
 808         break;
 809     }
 810
 811   return make_uuint (t1, t0);
 812 }
 813
 814 static void
 815 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 816 {
 817   mpz_t q;
 818   mp_bitcnt_t p;
 819
 820   devmsg ("[trial division] ");
 821
 822   mpz_init (q);
 823
 824   p = mpz_scan1 (t, 0);
 825   mpz_fdiv_q_2exp (t, t, p);
 826   while (p)
 827     {
 828       mp_factor_insert_ui (factors, 2);
 829       --p;
 830     }
 831
 832   unsigned long int d = 3;
 833   for (idx_t i = 1; i <= PRIMES_PTAB_ENTRIES;)
 834     {
 835       if (! mpz_divisible_ui_p (t, d))
 836         {
 837           d += primes_diff[i++];
 838           if (mpz_cmp_ui (t, d * d) < 0)
 839             break;
 840         }
 841       else
 842         {
 843           mpz_tdiv_q_ui (t, t, d);
 844           mp_factor_insert_ui (factors, d);
 845         }
 846     }
 847
 848   mpz_clear (q);
 849 }
 850
 851 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 852 static const unsigned char  binvert_table[128] =
 853 {
 854   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 855   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 856   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 857   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 858   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 859   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 860   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 861   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 862   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 863   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 864   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 865   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 866   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 867   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 868   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 869   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 870 };
 871
 872 /* Compute n^(-1) mod B, using a Newton iteration.  */
 873 #define binv(inv,n)                                                     \
 874   do {                                                                  \
 875     uintmax_t  __n = (n);                                               \
 876     uintmax_t  __inv;                                                   \
 877                                                                         \
 878     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 879     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 880     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 881     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 882                                                                         \
 883     if (W_TYPE_SIZE > 64)                                               \
 884       {                                                                 \
 885         int  __invbits = 64;                                            \
 886         do {                                                            \
 887           __inv = 2 * __inv - __inv * __inv * __n;                      \
 888           __invbits *= 2;                                               \
 889         } while (__invbits < W_TYPE_SIZE);                              \
 890       }                                                                 \
 891                                                                         \
 892     (inv) = __inv;                                                      \
 893   } while (0)
 894
 895 /* q = u / d, assuming d|u.  */
 896 #define divexact_21(q1, q0, u1, u0, d)                                  \
 897   do {                                                                  \
 898     uintmax_t _di, _q0;                                                 \
 899     binv (_di, (d));                                                    \
 900     _q0 = (u0) * _di;                                                   \
 901     if ((u1) >= (d))                                                    \
 902       {                                                                 \
 903         uintmax_t _p1;                                                  \
 904         MAYBE_UNUSED intmax_t _p0;                                      \
 905         umul_ppmm (_p1, _p0, _q0, d);                                   \
 906         (q1) = ((u1) - _p1) * _di;                                      \
 907         (q0) = _q0;                                                     \
 908       }                                                                 \
 909     else                                                                \
 910       {                                                                 \
 911         (q0) = _q0;                                                     \
 912         (q1) = 0;                                                       \
 913       }                                                                 \
 914   } while (0)
 915
 916 /* x B (mod n).  */
 917 #define redcify(r_prim, r, n)                                           \
 918   do {                                                                  \
 919     MAYBE_UNUSED uintmax_t _redcify_q;                                  \
 920     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 921   } while (0)
 922
 923 /* x B^2 (mod n).  Requires x > 0, n1 < B/2.  */
 924 #define redcify2(r1, r0, x, n1, n0)                                     \
 925   do {                                                                  \
 926     uintmax_t _r1, _r0, _i;                                             \
 927     if ((x) < (n1))                                                     \
 928       {                                                                 \
 929         _r1 = (x); _r0 = 0;                                             \
 930         _i = W_TYPE_SIZE;                                               \
 931       }                                                                 \
 932     else                                                                \
 933       {                                                                 \
 934         _r1 = 0; _r0 = (x);                                             \
 935         _i = 2 * W_TYPE_SIZE;                                           \
 936       }                                                                 \
 937     while (_i-- > 0)                                                    \
 938       {                                                                 \
 939         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 940         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 941           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 942       }                                                                 \
 943     (r1) = _r1;                                                         \
 944     (r0) = _r0;                                                         \
 945   } while (0)
 946
 947 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 948    Both a and b must be in redc form, the result will be in redc form too.  */
 949 static inline uintmax_t
 950 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 951 {
 952   uintmax_t rh, rl, q, th, xh;
 953   MAYBE_UNUSED uintmax_t tl;
 954
 955   umul_ppmm (rh, rl, a, b);
 956   q = rl * mi;
 957   umul_ppmm (th, tl, q, m);
 958   xh = rh - th;
 959   if (rh < th)
 960     xh += m;
 961
 962   return xh;
 963 }
 964
 965 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 966    Both a and b must be in redc form, the result will be in redc form too.
 967    For performance reasons, the most significant bit of m must be clear.  */
 968 static uintmax_t
 969 mulredc2 (uintmax_t *r1p,
 970           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 971           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 972 {
 973   uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
 974   MAYBE_UNUSED uintmax_t p0;
 975   mi = -mi;
 976   affirm ((a1 >> (W_TYPE_SIZE - 1)) == 0);
 977   affirm ((b1 >> (W_TYPE_SIZE - 1)) == 0);
 978   affirm ((m1 >> (W_TYPE_SIZE - 1)) == 0);
 979
 980   /* First compute a0 * <b1, b0> B^{-1}
 981         +-----+
 982         |a0 b0|
 983      +--+--+--+
 984      |a0 b1|
 985      +--+--+--+
 986         |q0 m0|
 987      +--+--+--+
 988      |q0 m1|
 989     -+--+--+--+
 990      |r1|r0| 0|
 991      +--+--+--+
 992   */
 993   umul_ppmm (t1, t0, a0, b0);
 994   umul_ppmm (r1, r0, a0, b1);
 995   q = mi * t0;
 996   umul_ppmm (p1, p0, q, m0);
 997   umul_ppmm (s1, s0, q, m1);
 998   r0 += (t0 != 0); /* Carry */
 999   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1000   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1001   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1002
1003   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1004         +-----+
1005         |a1 b0|
1006         +--+--+
1007         |r1|r0|
1008      +--+--+--+
1009      |a1 b1|
1010      +--+--+--+
1011         |q1 m0|
1012      +--+--+--+
1013      |q1 m1|
1014     -+--+--+--+
1015      |r1|r0| 0|
1016      +--+--+--+
1017   */
1018   umul_ppmm (t1, t0, a1, b0);
1019   umul_ppmm (s1, s0, a1, b1);
1020   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1021   q = mi * t0;
1022   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1023   umul_ppmm (p1, p0, q, m0);
1024   umul_ppmm (s1, s0, q, m1);
1025   r0 += (t0 != 0); /* Carry */
1026   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1027   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1028   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1029
1030   if (ge2 (r1, r0, m1, m0))
1031     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1032
1033   *r1p = r1;
1034   return r0;
1035 }
1036
1037 ATTRIBUTE_CONST
1038 static uintmax_t
1039 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1040 {
1041   uintmax_t y = one;
1042
1043   if (e & 1)
1044     y = b;
1045
1046   while (e != 0)
1047     {
1048       b = mulredc (b, b, n, ni);
1049       e >>= 1;
1050
1051       if (e & 1)
1052         y = mulredc (y, b, n, ni);
1053     }
1054
1055   return y;
1056 }
1057
1058 ATTRIBUTE_PURE static uuint
1059 powm2 (const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1060        uintmax_t ni, const uintmax_t *one)
1061 {
1062   uintmax_t r1, r0, b1, b0, n1, n0;
1063   int i;
1064   uintmax_t e;
1065
1066   b0 = bp[0];
1067   b1 = bp[1];
1068   n0 = np[0];
1069   n1 = np[1];
1070
1071   r0 = one[0];
1072   r1 = one[1];
1073
1074   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1075     {
1076       if (e & 1)
1077         {
1078           uintmax_t r1m1;
1079           r0 = mulredc2 (&r1m1, r1, r0, b1, b0, n1, n0, ni);
1080           r1 = r1m1;
1081         }
1082       uintmax_t r1m;
1083       b0 = mulredc2 (&r1m, b1, b0, b1, b0, n1, n0, ni);
1084       b1 = r1m;
1085     }
1086   for (e = ep[1]; e > 0; e >>= 1)
1087     {
1088       if (e & 1)
1089         {
1090           uintmax_t r1m1;
1091           r0 = mulredc2 (&r1m1, r1, r0, b1, b0, n1, n0, ni);
1092           r1 = r1m1;
1093         }
1094       uintmax_t r1m;
1095       b0 = mulredc2 (&r1m, b1, b0, b1, b0, n1, n0, ni);
1096       b1 = r1m;
1097     }
1098   return make_uuint (r1, r0);
1099 }
1100
1101 ATTRIBUTE_CONST
1102 static bool
1103 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1104              int k, uintmax_t one)
1105 {
1106   uintmax_t y = powm (b, q, n, ni, one);
1107
1108   uintmax_t nm1 = n - one;      /* -1, but in redc representation.  */
1109
1110   if (y == one || y == nm1)
1111     return true;
1112
1113   for (int i = 1; i < k; i++)
1114     {
1115       y = mulredc (y, y, n, ni);
1116
1117       if (y == nm1)
1118         return true;
1119       if (y == one)
1120         return false;
1121     }
1122   return false;
1123 }
1124
1125 ATTRIBUTE_PURE static bool
1126 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1127               const uintmax_t *qp, int k, const uintmax_t *one)
1128 {
1129   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1130
1131   uuset (&y1, &y0, powm2 (bp, qp, np, ni, one));
1132
1133   if (y0 == one[0] && y1 == one[1])
1134     return true;
1135
1136   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1137
1138   if (y0 == nm1_0 && y1 == nm1_1)
1139     return true;
1140
1141   for (int i = 1; i < k; i++)
1142     {
1143       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1144       y1 = r1m;
1145
1146       if (y0 == nm1_0 && y1 == nm1_1)
1147         return true;
1148       if (y0 == one[0] && y1 == one[1])
1149         return false;
1150     }
1151   return false;
1152 }
1153
1154 static bool
1155 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1156                 mpz_srcptr q, mp_bitcnt_t k)
1157 {
1158   mpz_powm (y, x, q, n);
1159
1160   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1161     return true;
1162
1163   for (mp_bitcnt_t i = 1; i < k; i++)
1164     {
1165       mpz_powm_ui (y, y, 2, n);
1166       if (mpz_cmp (y, nm1) == 0)
1167         return true;
1168       if (mpz_cmp_ui (y, 1) == 0)
1169         return false;
1170     }
1171   return false;
1172 }
1173
1174 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1175    have been observed.  The average seem to be about 2.  */
1176 static bool ATTRIBUTE_PURE
1177 prime_p (uintmax_t n)
1178 {
1179   bool is_prime;
1180   uintmax_t a_prim, one, ni;
1181   struct factors factors;
1182
1183   if (n <= 1)
1184     return false;
1185
1186   /* We have already cast out small primes.  */
1187   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1188     return true;
1189
1190   /* Precomputation for Miller-Rabin.  */
1191   int k = stdc_trailing_zeros (n - 1);
1192   uintmax_t q = (n - 1) >> k;
1193
1194   uintmax_t a = 2;
1195   binv (ni, n);                 /* ni <- 1/n mod B */
1196   redcify (one, 1, n);
1197   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1198
1199   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1200   if (!millerrabin (n, ni, a_prim, q, k, one))
1201     return false;
1202
1203   if (flag_prove_primality)
1204     {
1205       /* Factor n-1 for Lucas.  */
1206       factor (0, n - 1, &factors);
1207     }
1208
1209   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1210      number composite.  */
1211   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1212     {
1213       if (flag_prove_primality)
1214         {
1215           is_prime = true;
1216           for (int i = 0; i < factors.nfactors && is_prime; i++)
1217             {
1218               is_prime
1219                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1220             }
1221         }
1222       else
1223         {
1224           /* After enough Miller-Rabin runs, be content.  */
1225           is_prime = (r == MR_REPS - 1);
1226         }
1227
1228       if (is_prime)
1229         return true;
1230
1231       a += primes_diff[r];      /* Establish new base.  */
1232
1233       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1234          on most processors, since it avoids udiv_qrnnd.  If we go down the
1235          udiv_qrnnd_preinv path, this code should be replaced.  */
1236       {
1237         uintmax_t s1, s0;
1238         umul_ppmm (s1, s0, one, a);
1239         if (LIKELY (s1 == 0))
1240           a_prim = s0 % n;
1241         else
1242           {
1243             MAYBE_UNUSED uintmax_t dummy;
1244             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1245           }
1246       }
1247
1248       if (!millerrabin (n, ni, a_prim, q, k, one))
1249         return false;
1250     }
1251
1252   affirm (!"Lucas prime test failure.  This should not happen");
1253 }
1254
1255 static bool ATTRIBUTE_PURE
1256 prime2_p (uintmax_t n1, uintmax_t n0)
1257 {
1258   uintmax_t q[2], nm1[2];
1259   uintmax_t a_prim[2];
1260   uintmax_t one[2];
1261   uintmax_t na[2];
1262   uintmax_t ni;
1263   int k;
1264   struct factors factors;
1265
1266   if (n1 == 0)
1267     return prime_p (n0);
1268
1269   nm1[1] = n1 - (n0 == 0);
1270   nm1[0] = n0 - 1;
1271   if (nm1[0] == 0)
1272     {
1273       assume (nm1[1]);
1274       k = stdc_trailing_zeros (nm1[1]);
1275
1276       q[0] = nm1[1] >> k;
1277       q[1] = 0;
1278       k += W_TYPE_SIZE;
1279     }
1280   else
1281     {
1282       k = stdc_trailing_zeros (nm1[0]);
1283       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1284     }
1285
1286   uintmax_t a = 2;
1287   binv (ni, n0);
1288   redcify2 (one[1], one[0], 1, n1, n0);
1289   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1290
1291   /* FIXME: Use scalars or pointers in arguments?  Some consistency needed.  */
1292   na[0] = n0;
1293   na[1] = n1;
1294
1295   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1296     return false;
1297
1298   if (flag_prove_primality)
1299     {
1300       /* Factor n-1 for Lucas.  */
1301       factor (nm1[1], nm1[0], &factors);
1302     }
1303
1304   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1305      number composite.  */
1306   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1307     {
1308       bool is_prime;
1309       uintmax_t e[2];
1310       uuint y;
1311
1312       if (flag_prove_primality)
1313         {
1314           is_prime = true;
1315           if (hi (factors.plarge))
1316             {
1317               uintmax_t pi;
1318               binv (pi, lo (factors.plarge));
1319               e[0] = pi * nm1[0];
1320               e[1] = 0;
1321               y = powm2 (a_prim, e, na, ni, one);
1322               is_prime = (lo (y) != one[0] || hi (y) != one[1]);
1323             }
1324           for (int i = 0; i < factors.nfactors && is_prime; i++)
1325             {
1326               /* FIXME: We always have the factor 2.  Do we really need to
1327                  handle it here?  We have done the same powering as part
1328                  of millerrabin.  */
1329               if (factors.p[i] == 2)
1330                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1331               else
1332                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1333               y = powm2 (a_prim, e, na, ni, one);
1334               is_prime = (lo (y) != one[0] || hi (y) != one[1]);
1335             }
1336         }
1337       else
1338         {
1339           /* After enough Miller-Rabin runs, be content.  */
1340           is_prime = (r == MR_REPS - 1);
1341         }
1342
1343       if (is_prime)
1344         return true;
1345
1346       a += primes_diff[r];      /* Establish new base.  */
1347       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1348
1349       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1350         return false;
1351     }
1352
1353   affirm (!"Lucas prime test failure.  This should not happen");
1354 }
1355
1356 static bool
1357 mp_prime_p (mpz_t n)
1358 {
1359   bool is_prime;
1360   mpz_t q, a, nm1, tmp;
1361   struct mp_factors factors;
1362
1363   if (mpz_cmp_ui (n, 1) <= 0)
1364     return false;
1365
1366   /* We have already cast out small primes.  */
1367   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1368     return true;
1369
1370   mpz_inits (q, a, nm1, tmp, nullptr);
1371
1372   /* Precomputation for Miller-Rabin.  */
1373   mpz_sub_ui (nm1, n, 1);
1374
1375   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1376   mp_bitcnt_t k = mpz_scan1 (nm1, 0);
1377   mpz_tdiv_q_2exp (q, nm1, k);
1378
1379   mpz_set_ui (a, 2);
1380
1381   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1382   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1383     {
1384       is_prime = false;
1385       goto ret2;
1386     }
1387
1388   if (flag_prove_primality)
1389     {
1390       /* Factor n-1 for Lucas.  */
1391       mpz_set (tmp, nm1);
1392       mp_factor (tmp, &factors);
1393     }
1394
1395   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1396      number composite.  */
1397   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1398     {
1399       if (flag_prove_primality)
1400         {
1401           is_prime = true;
1402           for (idx_t i = 0; i < factors.nfactors && is_prime; i++)
1403             {
1404               mpz_divexact (tmp, nm1, factors.p[i]);
1405               mpz_powm (tmp, a, tmp, n);
1406               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1407             }
1408         }
1409       else
1410         {
1411           /* After enough Miller-Rabin runs, be content.  */
1412           is_prime = (r == MR_REPS - 1);
1413         }
1414
1415       if (is_prime)
1416         goto ret1;
1417
1418       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1419
1420       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1421         {
1422           is_prime = false;
1423           goto ret1;
1424         }
1425     }
1426
1427   affirm (!"Lucas prime test failure.  This should not happen");
1428
1429  ret1:
1430   if (flag_prove_primality)
1431     mp_factor_clear (&factors);
1432  ret2:
1433   mpz_clears (q, a, nm1, tmp, nullptr);
1434
1435   return is_prime;
1436 }
1437
1438 static void
1439 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1440                           struct factors *factors)
1441 {
1442   uintmax_t x, z, y, P, t, ni, g;
1443
1444   unsigned long int k = 1;
1445   unsigned long int l = 1;
1446
1447   redcify (P, 1, n);
1448   addmod (x, P, P, n);          /* i.e., redcify(2) */
1449   y = z = x;
1450
1451   while (n != 1)
1452     {
1453       affirm (a < n);
1454
1455       binv (ni, n);             /* FIXME: when could we use old 'ni' value?  */
1456
1457       for (;;)
1458         {
1459           do
1460             {
1461               x = mulredc (x, x, n, ni);
1462               addmod (x, x, a, n);
1463
1464               submod (t, z, x, n);
1465               P = mulredc (P, t, n, ni);
1466
1467               if (k % 32 == 1)
1468                 {
1469                   if (gcd_odd (P, n) != 1)
1470                     goto factor_found;
1471                   y = x;
1472                 }
1473             }
1474           while (--k != 0);
1475
1476           z = x;
1477           k = l;
1478           l = 2 * l;
1479           for (unsigned long int i = 0; i < k; i++)
1480             {
1481               x = mulredc (x, x, n, ni);
1482               addmod (x, x, a, n);
1483             }
1484           y = x;
1485         }
1486
1487     factor_found:
1488       do
1489         {
1490           y = mulredc (y, y, n, ni);
1491           addmod (y, y, a, n);
1492
1493           submod (t, z, y, n);
1494           g = gcd_odd (t, n);
1495         }
1496       while (g == 1);
1497
1498       if (n == g)
1499         {
1500           /* Found n itself as factor.  Restart with different params.  */
1501           factor_using_pollard_rho (n, a + 1, factors);
1502           return;
1503         }
1504
1505       n = n / g;
1506
1507       if (!prime_p (g))
1508         factor_using_pollard_rho (g, a + 1, factors);
1509       else
1510         factor_insert (factors, g);
1511
1512       if (prime_p (n))
1513         {
1514           factor_insert (factors, n);
1515           break;
1516         }
1517
1518       x = x % n;
1519       z = z % n;
1520       y = y % n;
1521     }
1522 }
1523
1524 static void
1525 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1526                            struct factors *factors)
1527 {
1528   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1529
1530   unsigned long int k = 1;
1531   unsigned long int l = 1;
1532
1533   redcify2 (P1, P0, 1, n1, n0);
1534   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1535   y1 = z1 = x1;
1536   y0 = z0 = x0;
1537
1538   while (n1 != 0 || n0 != 1)
1539     {
1540       binv (ni, n0);
1541
1542       for (;;)
1543         {
1544           do
1545             {
1546               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1547               x1 = r1m;
1548               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1549
1550               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1551               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1552               P1 = r1m;
1553
1554               if (k % 32 == 1)
1555                 {
1556                   uuset (&g1, &g0, gcd2_odd (P1, P0, n1, n0));
1557                   if (g1 != 0 || g0 != 1)
1558                     goto factor_found;
1559                   y1 = x1; y0 = x0;
1560                 }
1561             }
1562           while (--k != 0);
1563
1564           z1 = x1; z0 = x0;
1565           k = l;
1566           l = 2 * l;
1567           for (unsigned long int i = 0; i < k; i++)
1568             {
1569               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1570               x1 = r1m;
1571               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1572             }
1573           y1 = x1; y0 = x0;
1574         }
1575
1576     factor_found:
1577       do
1578         {
1579           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1580           y1 = r1m;
1581           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1582
1583           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1584           uuset (&g1, &g0, gcd2_odd (t1, t0, n1, n0));
1585         }
1586       while (g1 == 0 && g0 == 1);
1587
1588       if (g1 == 0)
1589         {
1590           /* The found factor is one word, and > 1.  */
1591           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1592
1593           if (!prime_p (g0))
1594             factor_using_pollard_rho (g0, a + 1, factors);
1595           else
1596             factor_insert (factors, g0);
1597         }
1598       else
1599         {
1600           /* The found factor is two words.  This is highly unlikely, thus hard
1601              to trigger.  Please be careful before you change this code!  */
1602           uintmax_t ginv;
1603
1604           if (n1 == g1 && n0 == g0)
1605             {
1606               /* Found n itself as factor.  Restart with different params.  */
1607               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1608               return;
1609             }
1610
1611           /* Compute n = n / g.  Since the result will fit one word,
1612              we can compute the quotient modulo B, ignoring the high
1613              divisor word.  */
1614           binv (ginv, g0);
1615           n0 = ginv * n0;
1616           n1 = 0;
1617
1618           if (!prime2_p (g1, g0))
1619             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1620           else
1621             factor_insert_large (factors, g1, g0);
1622         }
1623
1624       if (n1 == 0)
1625         {
1626           if (prime_p (n0))
1627             {
1628               factor_insert (factors, n0);
1629               break;
1630             }
1631
1632           factor_using_pollard_rho (n0, a, factors);
1633           return;
1634         }
1635
1636       if (prime2_p (n1, n0))
1637         {
1638           factor_insert_large (factors, n1, n0);
1639           break;
1640         }
1641
1642       uuset (&x1, &x0, mod2 (x1, x0, n1, n0));
1643       uuset (&z1, &z0, mod2 (z1, z0, n1, n0));
1644       uuset (&y1, &y0, mod2 (y1, y0, n1, n0));
1645     }
1646 }
1647
1648 static void
1649 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1650                              struct mp_factors *factors)
1651 {
1652   mpz_t x, z, y, P;
1653   mpz_t t, t2;
1654
1655   devmsg ("[pollard-rho (%lu)] ", a);
1656
1657   mpz_inits (t, t2, nullptr);
1658   mpz_init_set_si (y, 2);
1659   mpz_init_set_si (x, 2);
1660   mpz_init_set_si (z, 2);
1661   mpz_init_set_ui (P, 1);
1662
1663   unsigned long long int k = 1;
1664   unsigned long long int l = 1;
1665
1666   while (mpz_cmp_ui (n, 1) != 0)
1667     {
1668       for (;;)
1669         {
1670           do
1671             {
1672               mpz_mul (t, x, x);
1673               mpz_mod (x, t, n);
1674               mpz_add_ui (x, x, a);
1675
1676               mpz_sub (t, z, x);
1677               mpz_mul (t2, P, t);
1678               mpz_mod (P, t2, n);
1679
1680               if (k % 32 == 1)
1681                 {
1682                   mpz_gcd (t, P, n);
1683                   if (mpz_cmp_ui (t, 1) != 0)
1684                     goto factor_found;
1685                   mpz_set (y, x);
1686                 }
1687             }
1688           while (--k != 0);
1689
1690           mpz_set (z, x);
1691           k = l;
1692           l = 2 * l;
1693           for (unsigned long long int i = 0; i < k; i++)
1694             {
1695               mpz_mul (t, x, x);
1696               mpz_mod (x, t, n);
1697               mpz_add_ui (x, x, a);
1698             }
1699           mpz_set (y, x);
1700         }
1701
1702     factor_found:
1703       do
1704         {
1705           mpz_mul (t, y, y);
1706           mpz_mod (y, t, n);
1707           mpz_add_ui (y, y, a);
1708
1709           mpz_sub (t, z, y);
1710           mpz_gcd (t, t, n);
1711         }
1712       while (mpz_cmp_ui (t, 1) == 0);
1713
1714       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1715
1716       if (!mp_prime_p (t))
1717         {
1718           devmsg ("[composite factor--restarting pollard-rho] ");
1719           mp_factor_using_pollard_rho (t, a + 1, factors);
1720         }
1721       else
1722         {
1723           mp_factor_insert (factors, t);
1724         }
1725
1726       if (mp_prime_p (n))
1727         {
1728           mp_factor_insert (factors, n);
1729           break;
1730         }
1731
1732       mpz_mod (x, x, n);
1733       mpz_mod (z, z, n);
1734       mpz_mod (y, y, n);
1735     }
1736
1737   mpz_clears (P, t2, t, z, x, y, nullptr);
1738 }
1739
1740 #if USE_SQUFOF
1741 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1742    algorithm is replaced, consider also returning the remainder.  */
1743 ATTRIBUTE_CONST
1744 static uintmax_t
1745 isqrt (uintmax_t n)
1746 {
1747   if (n == 0)
1748     return 0;
1749
1750   int c = stdc_leading_zeros (n);
1751
1752   /* Make x > sqrt(n).  This will be invariant through the loop.  */
1753   uintmax_t x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) >> 1);
1754
1755   for (;;)
1756     {
1757       uintmax_t y = (x + n / x) / 2;
1758       if (y >= x)
1759         return x;
1760
1761       x = y;
1762     }
1763 }
1764
1765 ATTRIBUTE_CONST
1766 static uintmax_t
1767 isqrt2 (uintmax_t nh, uintmax_t nl)
1768 {
1769   /* Ensures the remainder fits in an uintmax_t.  */
1770   affirm (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1771
1772   if (nh == 0)
1773     return isqrt (nl);
1774
1775   int shift = stdc_leading_zeros (nh) & ~1;
1776
1777   /* Make x > sqrt (n).  */
1778   uintmax_t x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1779   x <<= (W_TYPE_SIZE - shift) >> 1;
1780
1781   /* Do we need more than one iteration?  */
1782   for (;;)
1783     {
1784       MAYBE_UNUSED uintmax_t r;
1785       uintmax_t q, y;
1786       udiv_qrnnd (q, r, nh, nl, x);
1787       y = (x + q) / 2;
1788
1789       if (y >= x)
1790         {
1791           uintmax_t hi, lo;
1792           umul_ppmm (hi, lo, x + 1, x + 1);
1793           affirm (gt2 (hi, lo, nh, nl));
1794
1795           umul_ppmm (hi, lo, x, x);
1796           affirm (ge2 (nh, nl, hi, lo));
1797           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1798           affirm (hi == 0);
1799
1800           return x;
1801         }
1802
1803       x = y;
1804     }
1805 }
1806
1807 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N.  */
1808 # define MAGIC64 0x0202021202030213ULL
1809 # define MAGIC63 0x0402483012450293ULL
1810 # define MAGIC65 0x218a019866014613ULL
1811 # define MAGIC11 0x23b
1812
1813 /* Return the square root if the input is a square, otherwise 0.  */
1814 ATTRIBUTE_CONST
1815 static uintmax_t
1816 is_square (uintmax_t x)
1817 {
1818   /* Uses the tests suggested by Cohen.  Excludes 99% of the non-squares before
1819      computing the square root.  */
1820   if (((MAGIC64 >> (x & 63)) & 1)
1821       && ((MAGIC63 >> (x % 63)) & 1)
1822       /* Both 0 and 64 are squares mod (65).  */
1823       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1824       && ((MAGIC11 >> (x % 11) & 1)))
1825     {
1826       uintmax_t r = isqrt (x);
1827       if (r * r == x)
1828         return r;
1829     }
1830   return 0;
1831 }
1832
1833 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1834 static short const invtab[0x81] =
1835   {
1836     0x200,
1837     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1838     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1839     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1840     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1841     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1842     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1843     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1844     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1845     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1846     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1847     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1848     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1849     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1850     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1851     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1852     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1853   };
1854
1855 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1856    that q < 0x40; here it instead uses a table of (Euclidean) inverses.  */
1857 # define div_smallq(q, r, u, d)                                          \
1858   do {                                                                  \
1859     if ((u) / 0x40 < (d))                                               \
1860       {                                                                 \
1861         uintmax_t _dinv, _mask, _q, _r;                                 \
1862         int _cnt = stdc_leading_zeros (d);                              \
1863         _r = (u);                                                       \
1864         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1865           {                                                             \
1866             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1867             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1868           }                                                             \
1869         else                                                            \
1870           {                                                             \
1871             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1872             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1873           }                                                             \
1874         _r -= _q * (d);                                                 \
1875                                                                         \
1876         _mask = -(uintmax_t) (_r >= (d));                               \
1877         (r) = _r - (_mask & (d));                                       \
1878         (q) = _q - _mask;                                               \
1879         affirm ((q) * (d) + (r) == u);                                  \
1880       }                                                                 \
1881     else                                                                \
1882       {                                                                 \
1883         uintmax_t _q = (u) / (d);                                       \
1884         (r) = (u) - _q * (d);                                           \
1885         (q) = _q;                                                       \
1886       }                                                                 \
1887   } while (0)
1888
1889 /* Notes: Example N = 22117019.  After first phase we find Q1 = 6314, Q
1890    = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1891    with 3025 = 55^2.
1892
1893    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1894    representing G_0 = (-55, 2 * 4652, 8653).
1895
1896    In the notation of the paper:
1897
1898    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1899
1900    Put
1901
1902      t_0 = floor([q_0 + R_0] / S0) = 1
1903      R_1 = t_0 * S_0 - R_0 = 4001
1904      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1905 */
1906
1907 /* Multipliers, in order of efficiency:
1908    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1909    0.7317  3*5*7    =  105 = 1
1910    0.7820  3*5*11   =  165 = 1
1911    0.7872  3*5      =   15 = 3
1912    0.8101  3*7*11   =  231 = 3
1913    0.8155  3*7      =   21 = 1
1914    0.8284  5*7*11   =  385 = 1
1915    0.8339  5*7      =   35 = 3
1916    0.8716  3*11     =   33 = 1
1917    0.8774  3        =    3 = 3
1918    0.8913  5*11     =   55 = 3
1919    0.8972  5        =    5 = 1
1920    0.9233  7*11     =   77 = 1
1921    0.9295  7        =    7 = 3
1922    0.9934  11       =   11 = 3
1923 */
1924 # define QUEUE_SIZE 50
1925 #endif
1926
1927 #if STAT_SQUFOF
1928 # define Q_FREQ_SIZE 50
1929 /* Element 0 keeps the total */
1930 static int q_freq[Q_FREQ_SIZE + 1];
1931 #endif
1932
1933 #if USE_SQUFOF
1934 /* Return true on success.  Expected to fail only for numbers
1935    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit.  */
1936 static bool
1937 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1938 {
1939   /* Uses algorithm and notation from
1940
1941      SQUARE FORM FACTORIZATION
1942      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1943
1944      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1945    */
1946
1947   static short const multipliers_1[] =
1948     { /* = 1 (mod 4) */
1949       105, 165, 21, 385, 33, 5, 77, 1, 0
1950     };
1951   static short const multipliers_3[] =
1952     { /* = 3 (mod 4) */
1953       1155, 15, 231, 35, 3, 55, 7, 11, 0
1954     };
1955
1956   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1957
1958   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1959     return false;
1960
1961   uintmax_t sqrt_n = isqrt2 (n1, n0);
1962
1963   if (n0 == sqrt_n * sqrt_n)
1964     {
1965       uintmax_t p1, p0;
1966
1967       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1968       affirm (p0 == n0);
1969
1970       if (n1 == p1)
1971         {
1972           if (prime_p (sqrt_n))
1973             factor_insert_multiplicity (factors, sqrt_n, 2);
1974           else
1975             {
1976               struct factors f;
1977
1978               f.nfactors = 0;
1979               if (!factor_using_squfof (0, sqrt_n, &f))
1980                 {
1981                   /* Try pollard rho instead */
1982                   factor_using_pollard_rho (sqrt_n, 1, &f);
1983                 }
1984               /* Duplicate the new factors */
1985               for (unsigned int i = 0; i < f.nfactors; i++)
1986                 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
1987             }
1988           return true;
1989         }
1990     }
1991
1992   /* Select multipliers so we always get n * mu = 3 (mod 4) */
1993   for (short const *m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
1994        *m; m++)
1995     {
1996       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
1997       unsigned int i;
1998       unsigned int mu = *m;
1999       int qpos = 0;
2000
2001       affirm (mu * n0 % 4 == 3);
2002
2003       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2004          get \Delta = 4 mu * n, and the paper's \mu is 2 mu.  As far as
2005          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2006          mu^3 < n.
2007
2008          However, this seems insufficient: With n = 37243139 and mu =
2009          105, we get a trivial factor, from the square 38809 = 197^2,
2010          without any corresponding Q earlier in the iteration.
2011
2012          Requiring 64 mu^3 < n seems sufficient.  */
2013       if (n1 == 0)
2014         {
2015           if ((uintmax_t) mu * mu * mu >= n0 / 64)
2016             continue;
2017         }
2018       else
2019         {
2020           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2021             continue;
2022         }
2023       umul_ppmm (Dh, Dl, n0, mu);
2024       Dh += n1 * mu;
2025
2026       affirm (Dl % 4 != 1);
2027       affirm (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2028
2029       S = isqrt2 (Dh, Dl);
2030
2031       Q1 = 1;
2032       P = S;
2033
2034       /* Square root remainder fits in one word, so ignore high part.  */
2035       Q = Dl - P * P;
2036       /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))?  */
2037       L = isqrt (2 * S);
2038       B = 2 * L;
2039       L1 = mu * 2 * L;
2040
2041       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2042          4 D.  */
2043
2044       for (i = 0; i <= B; i++)
2045         {
2046           uintmax_t q, P1, t, rem;
2047
2048           div_smallq (q, rem, S + P, Q);
2049           P1 = S - rem; /* P1 = q*Q - P */
2050
2051           affirm (q > 0 && Q > 0);
2052
2053 # if STAT_SQUFOF
2054           q_freq[0]++;
2055           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2056 # endif
2057
2058           if (Q <= L1)
2059             {
2060               uintmax_t g = Q;
2061
2062               if ((Q & 1) == 0)
2063                 g /= 2;
2064
2065               g /= gcd_odd (g, mu);
2066
2067               if (g <= L)
2068                 {
2069                   if (qpos >= QUEUE_SIZE)
2070                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2071                   queue[qpos].Q = g;
2072                   queue[qpos].P = P % g;
2073                   qpos++;
2074                 }
2075             }
2076
2077           /* I think the difference can be either sign, but mod
2078              2^W_TYPE_SIZE arithmetic should be fine.  */
2079           t = Q1 + q * (P - P1);
2080           Q1 = Q;
2081           Q = t;
2082           P = P1;
2083
2084           if ((i & 1) == 0)
2085             {
2086               uintmax_t r = is_square (Q);
2087               if (r)
2088                 {
2089                   for (int j = 0; j < qpos; j++)
2090                     {
2091                       if (queue[j].Q == r)
2092                         {
2093                           if (r == 1)
2094                             /* Traversed entire cycle.  */
2095                             goto next_multiplier;
2096
2097                           /* Need the absolute value for divisibility test.  */
2098                           if (P >= queue[j].P)
2099                             t = P - queue[j].P;
2100                           else
2101                             t = queue[j].P - P;
2102                           if (t % r == 0)
2103                             {
2104                               /* Delete entries up to and including entry
2105                                  j, which matched.  */
2106                               memmove (queue, queue + j + 1,
2107                                        (qpos - j - 1) * sizeof (queue[0]));
2108                               qpos -= (j + 1);
2109                             }
2110                           goto next_i;
2111                         }
2112                     }
2113
2114                   /* We have found a square form, which should give a
2115                      factor.  */
2116                   Q1 = r;
2117                   affirm (S >= P); /* What signs are possible?  */
2118                   P += r * ((S - P) / r);
2119
2120                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2121                      for the case D = 2N.  */
2122                   /* Compute Q = (D - P*P) / Q1, but we need double
2123                      precision.  */
2124                   uintmax_t hi, lo;
2125                   umul_ppmm (hi, lo, P, P);
2126                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2127                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2128                   affirm (rem == 0);
2129
2130                   for (;;)
2131                     {
2132                       /* Note: There appears to by a typo in the paper,
2133                          Step 4a in the algorithm description says q <--
2134                          floor([S+P]/\hat Q), but looking at the equations
2135                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2136                          (In this code, \hat Q is Q1).  */
2137                       div_smallq (q, rem, S + P, Q);
2138                       P1 = S - rem;     /* P1 = q*Q - P */
2139
2140 # if STAT_SQUFOF
2141                       q_freq[0]++;
2142                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2143 # endif
2144                       if (P == P1)
2145                         break;
2146                       t = Q1 + q * (P - P1);
2147                       Q1 = Q;
2148                       Q = t;
2149                       P = P1;
2150                     }
2151
2152                   if ((Q & 1) == 0)
2153                     Q /= 2;
2154                   Q /= gcd_odd (Q, mu);
2155
2156                   affirm (Q > 1 && (n1 || Q < n0));
2157
2158                   if (prime_p (Q))
2159                     factor_insert (factors, Q);
2160                   else if (!factor_using_squfof (0, Q, factors))
2161                     factor_using_pollard_rho (Q, 2, factors);
2162
2163                   divexact_21 (n1, n0, n1, n0, Q);
2164
2165                   if (prime2_p (n1, n0))
2166                     factor_insert_large (factors, n1, n0);
2167                   else
2168                     {
2169                       if (!factor_using_squfof (n1, n0, factors))
2170                         {
2171                           if (n1 == 0)
2172                             factor_using_pollard_rho (n0, 1, factors);
2173                           else
2174                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2175                         }
2176                     }
2177
2178                   return true;
2179                 }
2180             }
2181         next_i:;
2182         }
2183     next_multiplier:;
2184     }
2185   return false;
2186 }
2187 #endif
2188
2189 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2190    results in FACTORS.  */
2191 static void
2192 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2193 {
2194   factors->nfactors = 0;
2195   hiset (&factors->plarge, 0);
2196
2197   if (t1 == 0 && t0 < 2)
2198     return;
2199
2200   uuset (&t1, &t0, factor_using_division (t1, t0, factors));
2201
2202   if (t1 == 0 && t0 < 2)
2203     return;
2204
2205   if (prime2_p (t1, t0))
2206     factor_insert_large (factors, t1, t0);
2207   else
2208     {
2209 #if USE_SQUFOF
2210       if (factor_using_squfof (t1, t0, factors))
2211         return;
2212 #endif
2213
2214       if (t1 == 0)
2215         factor_using_pollard_rho (t0, 1, factors);
2216       else
2217         factor_using_pollard_rho2 (t1, t0, 1, factors);
2218     }
2219 }
2220
2221 /* Use Pollard-rho to compute the prime factors of
2222    arbitrary-precision T, and put the results in FACTORS.  */
2223 static void
2224 mp_factor (mpz_t t, struct mp_factors *factors)
2225 {
2226   mp_factor_init (factors);
2227
2228   if (mpz_sgn (t) != 0)
2229     {
2230       mp_factor_using_division (t, factors);
2231
2232       if (mpz_cmp_ui (t, 1) != 0)
2233         {
2234           devmsg ("[is number prime?] ");
2235           if (mp_prime_p (t))
2236             mp_factor_insert (factors, t);
2237           else
2238             mp_factor_using_pollard_rho (t, 1, factors);
2239         }
2240     }
2241 }
2242
2243 static strtol_error
2244 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2245 {
2246   int lo_carry;
2247   uintmax_t hi = 0, lo = 0;
2248
2249   strtol_error err = LONGINT_INVALID;
2250
2251   /* Initial scan for invalid digits.  */
2252   char const *p = s;
2253   for (;;)
2254     {
2255       unsigned char c = *p++;
2256       if (c == 0)
2257         break;
2258
2259       if (UNLIKELY (!ISDIGIT (c)))
2260         {
2261           err = LONGINT_INVALID;
2262           break;
2263         }
2264
2265       err = LONGINT_OK;           /* we've seen at least one valid digit */
2266     }
2267
2268   while (err == LONGINT_OK)
2269     {
2270       unsigned char c = *s++;
2271       if (c == 0)
2272         break;
2273
2274       c -= '0';
2275
2276       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2277         {
2278           err = LONGINT_OVERFLOW;
2279           break;
2280         }
2281       hi = 10 * hi;
2282
2283       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2284       lo_carry += 10 * lo < 2 * lo;
2285
2286       lo = 10 * lo;
2287       lo += c;
2288
2289       lo_carry += lo < c;
2290       hi += lo_carry;
2291       if (UNLIKELY (hi < lo_carry))
2292         {
2293           err = LONGINT_OVERFLOW;
2294           break;
2295         }
2296     }
2297
2298   *hip = hi;
2299   *lop = lo;
2300
2301   return err;
2302 }
2303
2304 /* FACTOR_PIPE_BUF is chosen to give good performance,
2305    and also is the max guaranteed size that
2306    consumers can read atomically through pipes.
2307    Also it's big enough to cater for max line length
2308    even with 128 bit uintmax_t.  */
2309 #ifndef _POSIX_PIPE_BUF
2310 # define _POSIX_PIPE_BUF 512
2311 #endif
2312 #ifdef PIPE_BUF
2313 enum { FACTOR_PIPE_BUF = PIPE_BUF };
2314 #else
2315 enum { FACTOR_PIPE_BUF = _POSIX_PIPE_BUF };
2316 #endif
2317
2318 /* Structure and routines for buffering and outputting full lines, to
2319    support parallel operation efficiently.
2320
2321    The buffer is twice FACTOR_PIPE_BUF so that its second half can
2322    hold the remainder of data that is somewhat too large.  Also, the
2323    very end of the second half is used to hold temporary data when
2324    stringifying integers, which is most conveniently done
2325    right-to-left.
2326
2327    Although the buffer's second half doesn't need to be quite so large
2328    - its necessary size is bounded above by roughly the maximum output
2329    line for a uuint plus the string length of a uuint - it'd be a bit
2330    of a pain to figure out exactly how small it can be without causing
2331    trouble.  */
2332 static char lbuf_buf[2 * FACTOR_PIPE_BUF];
2333 static idx_t lbuffered;
2334
2335 /* Write complete LBUF to standard output.  */
2336 static void
2337 lbuf_flush (void)
2338 {
2339   idx_t size = lbuffered;
2340
2341   /* Update lbuffered now, to avoid infinite recursion on write error.  */
2342   lbuffered = 0;
2343
2344   if (full_write (STDOUT_FILENO, lbuf_buf, size) != size)
2345     write_error ();
2346 }
2347
2348 /* Write LBUF to standard output.
2349    LBUF should contain at least FACTOR_PIPE_BUF bytes.
2350    If possible, write a prefix of LBUF that is newline terminated
2351    and contains <= FACTOR_PIPE_BUF bytes, so consumers can read atomically.
2352    But if the first FACTOR_PIPE_BUF bytes contain no newlines,
2353    give up on atomicity and just write the first FACTOR_PIPE_BUF bytes.  */
2354 static void
2355 lbuf_half_flush (void)
2356 {
2357   char *nl = memrchr (lbuf_buf, '\n', FACTOR_PIPE_BUF);
2358   char *suffix = nl ? nl + 1 : lbuf_buf + FACTOR_PIPE_BUF;
2359   idx_t prefix_size = suffix - lbuf_buf;
2360   idx_t suffix_size = lbuffered - prefix_size;
2361   lbuffered = prefix_size;
2362   lbuf_flush ();
2363   lbuffered = suffix_size;
2364   memmove (lbuf_buf, suffix, suffix_size);
2365 }
2366
2367 /* Add a character C to lbuf_buf.  */
2368 static void
2369 lbuf_putc (char c)
2370 {
2371   lbuf_buf[lbuffered++] = c;
2372 }
2373
2374 /* Add a newline to lbuf_buf.  Then, if enough bytes are already
2375    buffered, write the buffer atomically to standard output.  */
2376 static void
2377 lbuf_putnl (void)
2378 {
2379   lbuf_putc ('\n');
2380
2381   /* Provide immediate output for interactive use.  */
2382   static int line_buffered = -1;
2383   if (line_buffered < 0)
2384     line_buffered = isatty (STDOUT_FILENO);
2385
2386   if (line_buffered)
2387     lbuf_flush ();
2388   else if (FACTOR_PIPE_BUF <= lbuffered)
2389     lbuf_half_flush ();
2390 }
2391
2392 /* Append the string representation of I to lbuf_buf, followed by
2393    everything from BUFEND to lbuf_buf's end.  Use the area just before
2394    BUFEND temporarily.  */
2395 static void
2396 lbuf_putint_append (uintmax_t i, char *bufend)
2397 {
2398   char *istr = bufend;
2399   do
2400     {
2401       *--istr = '0' + i % 10;
2402       i /= 10;
2403     }
2404   while (i);
2405
2406   char *p = lbuf_buf + lbuffered;
2407   do
2408     *p++ = *istr++;
2409   while (istr < lbuf_buf + sizeof lbuf_buf);
2410
2411   lbuffered = p - lbuf_buf;
2412 }
2413
2414 /* Append the string representation of I to lbuf_buf.  */
2415 static void
2416 lbuf_putint (uintmax_t i)
2417 {
2418   return lbuf_putint_append (i, lbuf_buf + sizeof lbuf_buf);
2419 }
2420
2421 /* Append the string representation of T to lbuf_buf.  */
2422 static void
2423 print_uuint (uuint t)
2424 {
2425   uintmax_t t1 = hi (t), t0 = lo (t);
2426   char *bufend = lbuf_buf + sizeof lbuf_buf;
2427
2428   while (t1)
2429     {
2430       uintmax_t r = t1 % BIG_POWER_OF_10;
2431       t1 /= BIG_POWER_OF_10;
2432       udiv_qrnnd (t0, r, r, t0, BIG_POWER_OF_10);
2433       for (int i = 0; i < LOG_BIG_POWER_OF_10; i++)
2434         {
2435           *--bufend = '0' + r % 10;
2436           r /= 10;
2437         }
2438     }
2439
2440   lbuf_putint_append (t0, bufend);
2441 }
2442
2443 /* Buffer an mpz to the internal LBUF, possibly writing if it is long.  */
2444 static void
2445 lbuf_putmpz (mpz_t const i)
2446 {
2447   idx_t sizeinbase = mpz_sizeinbase (i, 10);
2448   char *lbuf_bufend = lbuf_buf + sizeof lbuf_buf;
2449   char *p = lbuf_buf + lbuffered;
2450   if (sizeinbase < lbuf_bufend - p)
2451     {
2452       mpz_get_str (p, 10, i);
2453       p += sizeinbase;
2454       lbuffered = p - !p[-1] - lbuf_buf;
2455       while (FACTOR_PIPE_BUF <= lbuffered)
2456         lbuf_half_flush ();
2457     }
2458   else
2459     {
2460       lbuf_flush ();
2461       char *istr = ximalloc (sizeinbase + 1);
2462       mpz_get_str (istr, 10, i);
2463       idx_t istrlen = sizeinbase - !istr[sizeinbase - 1];
2464       if (full_write (STDOUT_FILENO, istr, istrlen) != istrlen)
2465         write_error ();
2466       free (istr);
2467     }
2468 }
2469
2470 /* Single-precision factoring */
2471 static void
2472 print_factors_single (uintmax_t t1, uintmax_t t0)
2473 {
2474   struct factors factors;
2475
2476   print_uuint (make_uuint (t1, t0));
2477   lbuf_putc (':');
2478
2479   factor (t1, t0, &factors);
2480
2481   for (int j = 0; j < factors.nfactors; j++)
2482     for (int k = 0; k < factors.e[j]; k++)
2483       {
2484         lbuf_putc (' ');
2485         print_uuint (make_uuint (0, factors.p[j]));
2486         if (print_exponents && factors.e[j] > 1)
2487           {
2488             lbuf_putc ('^');
2489             lbuf_putint (factors.e[j]);
2490             break;
2491           }
2492       }
2493
2494   if (hi (factors.plarge))
2495     {
2496       lbuf_putc (' ');
2497       print_uuint (factors.plarge);
2498     }
2499
2500   lbuf_putnl ();
2501 }
2502
2503 /* Emit the factors of the indicated number.  If we have the option of using
2504    either algorithm, we select on the basis of the length of the number.
2505    For longer numbers, we prefer the MP algorithm even if the native algorithm
2506    has enough digits, because the algorithm is better.  The turnover point
2507    depends on the value.  */
2508 static bool
2509 print_factors (char const *input)
2510 {
2511   /* Skip initial spaces and '+'.  */
2512   char const *str = input;
2513   while (*str == ' ')
2514     str++;
2515   str += *str == '+';
2516
2517   uintmax_t t1, t0;
2518
2519   /* Try converting the number to one or two words.  If it fails, use GMP or
2520      print an error message.  The 2nd condition checks that the most
2521      significant bit of the two-word number is clear, in a typesize neutral
2522      way.  */
2523   strtol_error err = strto2uintmax (&t1, &t0, str);
2524
2525   switch (err)
2526     {
2527     case LONGINT_OK:
2528       if (((t1 << 1) >> 1) == t1)
2529         {
2530           devmsg ("[using single-precision arithmetic] ");
2531           print_factors_single (t1, t0);
2532           return true;
2533         }
2534       break;
2535
2536     case LONGINT_OVERFLOW:
2537       /* Try GMP.  */
2538       break;
2539
2540     default:
2541       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2542       return false;
2543     }
2544
2545   devmsg ("[using arbitrary-precision arithmetic] ");
2546   mpz_t t;
2547   struct mp_factors factors;
2548
2549   mpz_init_set_str (t, str, 10);
2550
2551   lbuf_putmpz (t);
2552   lbuf_putc (':');
2553   mp_factor (t, &factors);
2554
2555   for (idx_t j = 0; j < factors.nfactors; j++)
2556     for (unsigned long int k = 0; k < factors.e[j]; k++)
2557       {
2558         lbuf_putc (' ');
2559         lbuf_putmpz (factors.p[j]);
2560         if (print_exponents && factors.e[j] > 1)
2561           {
2562             lbuf_putc ('^');
2563             lbuf_putint (factors.e[j]);
2564             break;
2565           }
2566       }
2567
2568   mp_factor_clear (&factors);
2569   mpz_clear (t);
2570   lbuf_putnl ();
2571   return true;
2572 }
2573
2574 void
2575 usage (int status)
2576 {
2577   if (status != EXIT_SUCCESS)
2578     emit_try_help ();
2579   else
2580     {
2581       printf (_("\
2582 Usage: %s [OPTION] [NUMBER]...\n\
2583 "),
2584               program_name);
2585       fputs (_("\
2586 Print the prime factors of each specified integer NUMBER.  If none\n\
2587 are specified on the command line, read them from standard input.\n\
2588 \n\
2589 "), stdout);
2590       fputs ("\
2591   -h, --exponents   print repeated factors in form p^e unless e is 1\n\
2592 ", stdout);
2593       fputs (HELP_OPTION_DESCRIPTION, stdout);
2594       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2595       emit_ancillary_info (PROGRAM_NAME);
2596     }
2597   exit (status);
2598 }
2599
2600 static bool
2601 do_stdin (void)
2602 {
2603   bool ok = true;
2604   token_buffer tokenbuffer;
2605
2606   init_tokenbuffer (&tokenbuffer);
2607
2608   while (true)
2609     {
2610       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2611                                        &tokenbuffer);
2612       if (token_length == (size_t) -1)
2613         {
2614           if (ferror (stdin))
2615             error (EXIT_FAILURE, errno, _("error reading input"));
2616           break;
2617         }
2618
2619       ok &= print_factors (tokenbuffer.buffer);
2620     }
2621   free (tokenbuffer.buffer);
2622
2623   return ok;
2624 }
2625
2626 int
2627 main (int argc, char **argv)
2628 {
2629   initialize_main (&argc, &argv);
2630   set_program_name (argv[0]);
2631   setlocale (LC_ALL, "");
2632   bindtextdomain (PACKAGE, LOCALEDIR);
2633   textdomain (PACKAGE);
2634
2635   atexit (close_stdout);
2636
2637   int c;
2638   while ((c = getopt_long (argc, argv, "h", long_options, nullptr)) != -1)
2639     {
2640       switch (c)
2641         {
2642         case 'h':  /* NetBSD used -h for this functionality first.  */
2643           print_exponents = true;
2644           break;
2645
2646         case DEV_DEBUG_OPTION:
2647           dev_debug = true;
2648           break;
2649
2650         case_GETOPT_HELP_CHAR;
2651
2652         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2653
2654         default:
2655           usage (EXIT_FAILURE);
2656         }
2657     }
2658
2659   atexit (lbuf_flush);
2660
2661 #if STAT_SQUFOF
2662   memset (q_freq, 0, sizeof (q_freq));
2663 #endif
2664
2665   bool ok;
2666   if (argc <= optind)
2667     ok = do_stdin ();
2668   else
2669     {
2670       ok = true;
2671       for (int i = optind; i < argc; i++)
2672         if (! print_factors (argv[i]))
2673           ok = false;
2674     }
2675
2676 #if STAT_SQUFOF
2677   if (q_freq[0] > 0)
2678     {
2679       double acc_f;
2680       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2681       for (int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2682         {
2683           double f = (double) q_freq[i] / q_freq[0];
2684           acc_f += f;
2685           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2686                   100.0 * f, 100.0 * acc_f);
2687         }
2688     }
2689 #endif
2690
2691   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2692 }