src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2017 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synced 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or if USE_SQUFOF is defined to 1, try that first.
  54         Status of found factors are checked again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #if HAVE_GMP
 108 # include <gmp.h>
 109 # if !HAVE_DECL_MPZ_INITS
 110 #  include <stdarg.h>
 111 # endif
 112 #endif
 113
 114 #include <assert.h>
 115
 116 #include "system.h"
 117 #include "die.h"
 118 #include "error.h"
 119 #include "full-write.h"
 120 #include "quote.h"
 121 #include "readtokens.h"
 122 #include "xstrtol.h"
 123
 124 /* The official name of this program (e.g., no 'g' prefix).  */
 125 #define PROGRAM_NAME "factor"
 126
 127 #define AUTHORS \
 128   proper_name ("Paul Rubin"),                                           \
 129   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 130   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 131
 132 /* Token delimiters when reading from a file.  */
 133 #define DELIM "\n\t "
 134
 135 #ifndef USE_LONGLONG_H
 136 /* With the way we use longlong.h, it's only safe to use
 137    when UWtype = UHWtype, as there were various cases
 138    (as can be seen in the history for longlong.h) where
 139    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 140    to avoid compile time or run time issues.  */
 141 # if LONG_MAX == INTMAX_MAX
 142 #  define USE_LONGLONG_H 1
 143 # endif
 144 #endif
 145
 146 #if USE_LONGLONG_H
 147
 148 /* Make definitions for longlong.h to make it do what it can do for us */
 149
 150 /* bitcount for uintmax_t */
 151 # if UINTMAX_MAX == UINT32_MAX
 152 #  define W_TYPE_SIZE 32
 153 # elif UINTMAX_MAX == UINT64_MAX
 154 #  define W_TYPE_SIZE 64
 155 # elif UINTMAX_MAX == UINT128_MAX
 156 #  define W_TYPE_SIZE 128
 157 # endif
 158
 159 # define UWtype  uintmax_t
 160 # define UHWtype unsigned long int
 161 # undef UDWtype
 162 # if HAVE_ATTRIBUTE_MODE
 163 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 164 typedef          int SItype     __attribute__ ((mode (SI)));
 165 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 166 typedef          int DItype     __attribute__ ((mode (DI)));
 167 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 168 # else
 169 typedef unsigned char UQItype;
 170 typedef          long SItype;
 171 typedef unsigned long int USItype;
 172 #  if HAVE_LONG_LONG_INT
 173 typedef long long int DItype;
 174 typedef unsigned long long int UDItype;
 175 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 176 typedef long int DItype;
 177 typedef unsigned long int UDItype;
 178 #  endif
 179 # endif
 180 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 181 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 182 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 183 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 184 # ifndef __GMP_GNUC_PREREQ
 185 #  define __GMP_GNUC_PREREQ(a,b) 1
 186 # endif
 187
 188 /* These stub macros are only used in longlong.h in certain system compiler
 189    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 190 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 191 ASSERT (1)
 192 __GMP_DECLSPEC
 193 # endif
 194
 195 # if _ARCH_PPC
 196 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 197 # endif
 198 # include "longlong.h"
 199 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 200 const unsigned char factor_clz_tab[129] =
 201 {
 202   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 203   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 204   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 205   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 206   9
 207 };
 208 # endif
 209
 210 #else /* not USE_LONGLONG_H */
 211
 212 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 213 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 214 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 215 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 216
 217 #endif
 218
 219 #if !defined __clz_tab && !defined UHWtype
 220 /* Without this seemingly useless conditional, gcc -Wunused-macros
 221    warns that each of the two tested macros is unused on Fedora 18.
 222    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 223 #endif
 224
 225 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 226 #define MAX_NFACTS 26
 227
 228 enum
 229 {
 230   DEV_DEBUG_OPTION = CHAR_MAX + 1
 231 };
 232
 233 static struct option const long_options[] =
 234 {
 235   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 236   {GETOPT_HELP_OPTION_DECL},
 237   {GETOPT_VERSION_OPTION_DECL},
 238   {NULL, 0, NULL, 0}
 239 };
 240
 241 struct factors
 242 {
 243   uintmax_t     plarge[2]; /* Can have a single large factor */
 244   uintmax_t     p[MAX_NFACTS];
 245   unsigned char e[MAX_NFACTS];
 246   unsigned char nfactors;
 247 };
 248
 249 #if HAVE_GMP
 250 struct mp_factors
 251 {
 252   mpz_t             *p;
 253   unsigned long int *e;
 254   unsigned long int nfactors;
 255 };
 256 #endif
 257
 258 static void factor (uintmax_t, uintmax_t, struct factors *);
 259
 260 #ifndef umul_ppmm
 261 # define umul_ppmm(w1, w0, u, v)                                        \
 262   do {                                                                  \
 263     uintmax_t __x0, __x1, __x2, __x3;                                   \
 264     unsigned long int __ul, __vl, __uh, __vh;                           \
 265     uintmax_t __u = (u), __v = (v);                                     \
 266                                                                         \
 267     __ul = __ll_lowpart (__u);                                          \
 268     __uh = __ll_highpart (__u);                                         \
 269     __vl = __ll_lowpart (__v);                                          \
 270     __vh = __ll_highpart (__v);                                         \
 271                                                                         \
 272     __x0 = (uintmax_t) __ul * __vl;                                     \
 273     __x1 = (uintmax_t) __ul * __vh;                                     \
 274     __x2 = (uintmax_t) __uh * __vl;                                     \
 275     __x3 = (uintmax_t) __uh * __vh;                                     \
 276                                                                         \
 277     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 278     __x1 += __x2;               /* but this indeed can */               \
 279     if (__x1 < __x2)            /* did we get it? */                    \
 280       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 281                                                                         \
 282     (w1) = __x3 + __ll_highpart (__x1);                                 \
 283     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 284   } while (0)
 285 #endif
 286
 287 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 288 /* Define our own, not needing normalization. This function is
 289    currently not performance critical, so keep it simple. Similar to
 290    the mod macro below. */
 291 # undef udiv_qrnnd
 292 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 293   do {                                                                  \
 294     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 295                                                                         \
 296     assert ((n1) < (d));                                                \
 297     __d1 = (d); __d0 = 0;                                               \
 298     __r1 = (n1); __r0 = (n0);                                           \
 299     __q = 0;                                                            \
 300     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 301       {                                                                 \
 302         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 303         __q <<= 1;                                                      \
 304         if (ge2 (__r1, __r0, __d1, __d0))                               \
 305           {                                                             \
 306             __q++;                                                      \
 307             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 308           }                                                             \
 309       }                                                                 \
 310     (r) = __r0;                                                         \
 311     (q) = __q;                                                          \
 312   } while (0)
 313 #endif
 314
 315 #if !defined add_ssaaaa
 316 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 317   do {                                                                  \
 318     uintmax_t _add_x;                                                   \
 319     _add_x = (al) + (bl);                                               \
 320     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 321     (sl) = _add_x;                                                      \
 322   } while (0)
 323 #endif
 324
 325 #define rsh2(rh, rl, ah, al, cnt)                                       \
 326   do {                                                                  \
 327     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 328     (rh) = (ah) >> (cnt);                                               \
 329   } while (0)
 330
 331 #define lsh2(rh, rl, ah, al, cnt)                                       \
 332   do {                                                                  \
 333     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 334     (rl) = (al) << (cnt);                                               \
 335   } while (0)
 336
 337 #define ge2(ah, al, bh, bl)                                             \
 338   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 339
 340 #define gt2(ah, al, bh, bl)                                             \
 341   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 342
 343 #ifndef sub_ddmmss
 344 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 345   do {                                                                  \
 346     uintmax_t _cy;                                                      \
 347     _cy = (al) < (bl);                                                  \
 348     (rl) = (al) - (bl);                                                 \
 349     (rh) = (ah) - (bh) - _cy;                                           \
 350   } while (0)
 351 #endif
 352
 353 #ifndef count_leading_zeros
 354 # define count_leading_zeros(count, x) do {                             \
 355     uintmax_t __clz_x = (x);                                            \
 356     unsigned int __clz_c;                                               \
 357     for (__clz_c = 0;                                                   \
 358          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 359          __clz_c += 8)                                                  \
 360       __clz_x <<= 8;                                                    \
 361     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 362       __clz_x <<= 1;                                                    \
 363     (count) = __clz_c;                                                  \
 364   } while (0)
 365 #endif
 366
 367 #ifndef count_trailing_zeros
 368 # define count_trailing_zeros(count, x) do {                            \
 369     uintmax_t __ctz_x = (x);                                            \
 370     unsigned int __ctz_c = 0;                                           \
 371     while ((__ctz_x & 1) == 0)                                          \
 372       {                                                                 \
 373         __ctz_x >>= 1;                                                  \
 374         __ctz_c++;                                                      \
 375       }                                                                 \
 376     (count) = __ctz_c;                                                  \
 377   } while (0)
 378 #endif
 379
 380 /* Requires that a < n and b <= n */
 381 #define submod(r,a,b,n)                                                 \
 382   do {                                                                  \
 383     uintmax_t _t = - (uintmax_t) (a < b);                               \
 384     (r) = ((n) & _t) + (a) - (b);                                       \
 385   } while (0)
 386
 387 #define addmod(r,a,b,n)                                                 \
 388   submod ((r), (a), ((n) - (b)), (n))
 389
 390 /* Modular two-word addition and subtraction.  For performance reasons, the
 391    most significant bit of n1 must be clear.  The destination variables must be
 392    distinct from the mod operand.  */
 393 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 394   do {                                                                  \
 395     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 396     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 397       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 398   } while (0)
 399 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 400   do {                                                                  \
 401     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 402     if ((intmax_t) (r1) < 0)                                            \
 403       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 404   } while (0)
 405
 406 #define HIGHBIT_TO_MASK(x)                                              \
 407   (((intmax_t)-1 >> 1) < 0                                              \
 408    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 409    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 410       ? UINTMAX_MAX : (uintmax_t) 0))
 411
 412 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 413    Requires that d1 != 0.  */
 414 static uintmax_t
 415 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 416 {
 417   int cntd, cnta;
 418
 419   assert (d1 != 0);
 420
 421   if (a1 == 0)
 422     {
 423       *r1 = 0;
 424       return a0;
 425     }
 426
 427   count_leading_zeros (cntd, d1);
 428   count_leading_zeros (cnta, a1);
 429   int cnt = cntd - cnta;
 430   lsh2 (d1, d0, d1, d0, cnt);
 431   for (int i = 0; i < cnt; i++)
 432     {
 433       if (ge2 (a1, a0, d1, d0))
 434         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 435       rsh2 (d1, d0, d1, d0, 1);
 436     }
 437
 438   *r1 = a1;
 439   return a0;
 440 }
 441
 442 static uintmax_t _GL_ATTRIBUTE_CONST
 443 gcd_odd (uintmax_t a, uintmax_t b)
 444 {
 445   if ( (b & 1) == 0)
 446     {
 447       uintmax_t t = b;
 448       b = a;
 449       a = t;
 450     }
 451   if (a == 0)
 452     return b;
 453
 454   /* Take out least significant one bit, to make room for sign */
 455   b >>= 1;
 456
 457   for (;;)
 458     {
 459       uintmax_t t;
 460       uintmax_t bgta;
 461
 462       while ((a & 1) == 0)
 463         a >>= 1;
 464       a >>= 1;
 465
 466       t = a - b;
 467       if (t == 0)
 468         return (a << 1) + 1;
 469
 470       bgta = HIGHBIT_TO_MASK (t);
 471
 472       /* b <-- min (a, b) */
 473       b += (bgta & t);
 474
 475       /* a <-- |a - b| */
 476       a = (t ^ bgta) - bgta;
 477     }
 478 }
 479
 480 static uintmax_t
 481 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 482 {
 483   assert (b0 & 1);
 484
 485   if ( (a0 | a1) == 0)
 486     {
 487       *r1 = b1;
 488       return b0;
 489     }
 490
 491   while ((a0 & 1) == 0)
 492     rsh2 (a1, a0, a1, a0, 1);
 493
 494   for (;;)
 495     {
 496       if ((b1 | a1) == 0)
 497         {
 498           *r1 = 0;
 499           return gcd_odd (b0, a0);
 500         }
 501
 502       if (gt2 (a1, a0, b1, b0))
 503         {
 504           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 505           do
 506             rsh2 (a1, a0, a1, a0, 1);
 507           while ((a0 & 1) == 0);
 508         }
 509       else if (gt2 (b1, b0, a1, a0))
 510         {
 511           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 512           do
 513             rsh2 (b1, b0, b1, b0, 1);
 514           while ((b0 & 1) == 0);
 515         }
 516       else
 517         break;
 518     }
 519
 520   *r1 = a1;
 521   return a0;
 522 }
 523
 524 static void
 525 factor_insert_multiplicity (struct factors *factors,
 526                             uintmax_t prime, unsigned int m)
 527 {
 528   unsigned int nfactors = factors->nfactors;
 529   uintmax_t *p = factors->p;
 530   unsigned char *e = factors->e;
 531
 532   /* Locate position for insert new or increment e.  */
 533   int i;
 534   for (i = nfactors - 1; i >= 0; i--)
 535     {
 536       if (p[i] <= prime)
 537         break;
 538     }
 539
 540   if (i < 0 || p[i] != prime)
 541     {
 542       for (int j = nfactors - 1; j > i; j--)
 543         {
 544           p[j + 1] = p[j];
 545           e[j + 1] = e[j];
 546         }
 547       p[i + 1] = prime;
 548       e[i + 1] = m;
 549       factors->nfactors = nfactors + 1;
 550     }
 551   else
 552     {
 553       e[i] += m;
 554     }
 555 }
 556
 557 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 558
 559 static void
 560 factor_insert_large (struct factors *factors,
 561                      uintmax_t p1, uintmax_t p0)
 562 {
 563   if (p1 > 0)
 564     {
 565       assert (factors->plarge[1] == 0);
 566       factors->plarge[0] = p0;
 567       factors->plarge[1] = p1;
 568     }
 569   else
 570     factor_insert (factors, p0);
 571 }
 572
 573 #if HAVE_GMP
 574
 575 # if !HAVE_DECL_MPZ_INITS
 576
 577 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 578 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 579
 580 static void
 581 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 582 {
 583   va_list ap;
 584
 585   va_start (ap, mpz_single_init);
 586
 587   mpz_t *mpz;
 588   while ((mpz = va_arg (ap, mpz_t *)))
 589     mpz_single_init (*mpz);
 590
 591   va_end (ap);
 592 }
 593 # endif
 594
 595 static void mp_factor (mpz_t, struct mp_factors *);
 596
 597 static void
 598 mp_factor_init (struct mp_factors *factors)
 599 {
 600   factors->p = NULL;
 601   factors->e = NULL;
 602   factors->nfactors = 0;
 603 }
 604
 605 static void
 606 mp_factor_clear (struct mp_factors *factors)
 607 {
 608   for (unsigned int i = 0; i < factors->nfactors; i++)
 609     mpz_clear (factors->p[i]);
 610
 611   free (factors->p);
 612   free (factors->e);
 613 }
 614
 615 static void
 616 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 617 {
 618   unsigned long int nfactors = factors->nfactors;
 619   mpz_t         *p  = factors->p;
 620   unsigned long int *e  = factors->e;
 621   long i;
 622
 623   /* Locate position for insert new or increment e.  */
 624   for (i = nfactors - 1; i >= 0; i--)
 625     {
 626       if (mpz_cmp (p[i], prime) <= 0)
 627         break;
 628     }
 629
 630   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 631     {
 632       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 633       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 634
 635       mpz_init (p[nfactors]);
 636       for (long j = nfactors - 1; j > i; j--)
 637         {
 638           mpz_set (p[j + 1], p[j]);
 639           e[j + 1] = e[j];
 640         }
 641       mpz_set (p[i + 1], prime);
 642       e[i + 1] = 1;
 643
 644       factors->p = p;
 645       factors->e = e;
 646       factors->nfactors = nfactors + 1;
 647     }
 648   else
 649     {
 650       e[i] += 1;
 651     }
 652 }
 653
 654 static void
 655 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 656 {
 657   mpz_t pz;
 658
 659   mpz_init_set_ui (pz, prime);
 660   mp_factor_insert (factors, pz);
 661   mpz_clear (pz);
 662 }
 663 #endif /* HAVE_GMP */
 664
 665
 666 /* Number of bits in an uintmax_t.  */
 667 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 668
 669 /* Verify that uintmax_t does not have holes in its representation.  */
 670 verify (UINTMAX_MAX >> (W - 1) == 1);
 671
 672 #define P(a,b,c,d) a,
 673 static const unsigned char primes_diff[] = {
 674 #include "primes.h"
 675 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 676 };
 677 #undef P
 678
 679 #define PRIMES_PTAB_ENTRIES \
 680   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 681
 682 #define P(a,b,c,d) b,
 683 static const unsigned char primes_diff8[] = {
 684 #include "primes.h"
 685 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 686 };
 687 #undef P
 688
 689 struct primes_dtab
 690 {
 691   uintmax_t binv, lim;
 692 };
 693
 694 #define P(a,b,c,d) {c,d},
 695 static const struct primes_dtab primes_dtab[] = {
 696 #include "primes.h"
 697 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 698 };
 699 #undef P
 700
 701 /* Verify that uintmax_t is not wider than
 702    the integers used to generate primes.h.  */
 703 verify (W <= WIDE_UINT_BITS);
 704
 705 /* debugging for developers.  Enables devmsg().
 706    This flag is used only in the GMP code.  */
 707 static bool dev_debug = false;
 708
 709 /* Prove primality or run probabilistic tests.  */
 710 static bool flag_prove_primality = PROVE_PRIMALITY;
 711
 712 /* Number of Miller-Rabin tests to run when not proving primality. */
 713 #define MR_REPS 25
 714
 715 static void
 716 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 717                       unsigned int off)
 718 {
 719   for (unsigned int j = 0; j < off; j++)
 720     p += primes_diff[i + j];
 721   factor_insert (factors, p);
 722 }
 723
 724 /* Trial division with odd primes uses the following trick.
 725
 726    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 727    consider the case t < B (this is the second loop below).
 728
 729    From our tables we get
 730
 731      binv = p^{-1} (mod B)
 732      lim = floor ( (B-1) / p ).
 733
 734    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 735    (and all quotients in this range occur for some t).
 736
 737    Then t = q * p is true also (mod B), and p is invertible we get
 738
 739      q = t * binv (mod B).
 740
 741    Next, assume that t is *not* divisible by p. Since multiplication
 742    by binv (mod B) is a one-to-one mapping,
 743
 744      t * binv (mod B) > lim,
 745
 746    because all the smaller values are already taken.
 747
 748    This can be summed up by saying that the function
 749
 750      q(t) = binv * t (mod B)
 751
 752    is a permutation of the range 0 <= t < B, with the curious property
 753    that it maps the multiples of p onto the range 0 <= q <= lim, in
 754    order, and the non-multiples of p onto the range lim < q < B.
 755  */
 756
 757 static uintmax_t
 758 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 759                        struct factors *factors)
 760 {
 761   if (t0 % 2 == 0)
 762     {
 763       unsigned int cnt;
 764
 765       if (t0 == 0)
 766         {
 767           count_trailing_zeros (cnt, t1);
 768           t0 = t1 >> cnt;
 769           t1 = 0;
 770           cnt += W_TYPE_SIZE;
 771         }
 772       else
 773         {
 774           count_trailing_zeros (cnt, t0);
 775           rsh2 (t1, t0, t1, t0, cnt);
 776         }
 777
 778       factor_insert_multiplicity (factors, 2, cnt);
 779     }
 780
 781   uintmax_t p = 3;
 782   unsigned int i;
 783   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 784     {
 785       for (;;)
 786         {
 787           uintmax_t q1, q0, hi, lo _GL_UNUSED;
 788
 789           q0 = t0 * primes_dtab[i].binv;
 790           umul_ppmm (hi, lo, q0, p);
 791           if (hi > t1)
 792             break;
 793           hi = t1 - hi;
 794           q1 = hi * primes_dtab[i].binv;
 795           if (LIKELY (q1 > primes_dtab[i].lim))
 796             break;
 797           t1 = q1; t0 = q0;
 798           factor_insert (factors, p);
 799         }
 800       p += primes_diff[i + 1];
 801     }
 802   if (t1p)
 803     *t1p = t1;
 804
 805 #define DIVBLOCK(I)                                                     \
 806   do {                                                                  \
 807     for (;;)                                                            \
 808       {                                                                 \
 809         q = t0 * pd[I].binv;                                            \
 810         if (LIKELY (q > pd[I].lim))                                     \
 811           break;                                                        \
 812         t0 = q;                                                         \
 813         factor_insert_refind (factors, p, i + 1, I);                    \
 814       }                                                                 \
 815   } while (0)
 816
 817   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 818     {
 819       uintmax_t q;
 820       const struct primes_dtab *pd = &primes_dtab[i];
 821       DIVBLOCK (0);
 822       DIVBLOCK (1);
 823       DIVBLOCK (2);
 824       DIVBLOCK (3);
 825       DIVBLOCK (4);
 826       DIVBLOCK (5);
 827       DIVBLOCK (6);
 828       DIVBLOCK (7);
 829
 830       p += primes_diff8[i];
 831       if (p * p > t0)
 832         break;
 833     }
 834
 835   return t0;
 836 }
 837
 838 #if HAVE_GMP
 839 static void
 840 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 841 {
 842   mpz_t q;
 843   unsigned long int p;
 844
 845   devmsg ("[trial division] ");
 846
 847   mpz_init (q);
 848
 849   p = mpz_scan1 (t, 0);
 850   mpz_div_2exp (t, t, p);
 851   while (p)
 852     {
 853       mp_factor_insert_ui (factors, 2);
 854       --p;
 855     }
 856
 857   p = 3;
 858   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 859     {
 860       if (! mpz_divisible_ui_p (t, p))
 861         {
 862           p += primes_diff[i++];
 863           if (mpz_cmp_ui (t, p * p) < 0)
 864             break;
 865         }
 866       else
 867         {
 868           mpz_tdiv_q_ui (t, t, p);
 869           mp_factor_insert_ui (factors, p);
 870         }
 871     }
 872
 873   mpz_clear (q);
 874 }
 875 #endif
 876
 877 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 878 static const unsigned char  binvert_table[128] =
 879 {
 880   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 881   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 882   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 883   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 884   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 885   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 886   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 887   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 888   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 889   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 890   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 891   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 892   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 893   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 894   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 895   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 896 };
 897
 898 /* Compute n^(-1) mod B, using a Newton iteration.  */
 899 #define binv(inv,n)                                                     \
 900   do {                                                                  \
 901     uintmax_t  __n = (n);                                               \
 902     uintmax_t  __inv;                                                   \
 903                                                                         \
 904     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 905     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 906     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 907     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 908                                                                         \
 909     if (W_TYPE_SIZE > 64)                                               \
 910       {                                                                 \
 911         int  __invbits = 64;                                            \
 912         do {                                                            \
 913           __inv = 2 * __inv - __inv * __inv * __n;                      \
 914           __invbits *= 2;                                               \
 915         } while (__invbits < W_TYPE_SIZE);                              \
 916       }                                                                 \
 917                                                                         \
 918     (inv) = __inv;                                                      \
 919   } while (0)
 920
 921 /* q = u / d, assuming d|u.  */
 922 #define divexact_21(q1, q0, u1, u0, d)                                  \
 923   do {                                                                  \
 924     uintmax_t _di, _q0;                                                 \
 925     binv (_di, (d));                                                    \
 926     _q0 = (u0) * _di;                                                   \
 927     if ((u1) >= (d))                                                    \
 928       {                                                                 \
 929         uintmax_t _p1, _p0 _GL_UNUSED;                            \
 930         umul_ppmm (_p1, _p0, _q0, d);                                   \
 931         (q1) = ((u1) - _p1) * _di;                                      \
 932         (q0) = _q0;                                                     \
 933       }                                                                 \
 934     else                                                                \
 935       {                                                                 \
 936         (q0) = _q0;                                                     \
 937         (q1) = 0;                                                       \
 938       }                                                                 \
 939   } while (0)
 940
 941 /* x B (mod n). */
 942 #define redcify(r_prim, r, n)                                           \
 943   do {                                                                  \
 944     uintmax_t _redcify_q _GL_UNUSED;                              \
 945     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 946   } while (0)
 947
 948 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 949 #define redcify2(r1, r0, x, n1, n0)                                     \
 950   do {                                                                  \
 951     uintmax_t _r1, _r0, _i;                                             \
 952     if ((x) < (n1))                                                     \
 953       {                                                                 \
 954         _r1 = (x); _r0 = 0;                                             \
 955         _i = W_TYPE_SIZE;                                               \
 956       }                                                                 \
 957     else                                                                \
 958       {                                                                 \
 959         _r1 = 0; _r0 = (x);                                             \
 960         _i = 2*W_TYPE_SIZE;                                             \
 961       }                                                                 \
 962     while (_i-- > 0)                                                    \
 963       {                                                                 \
 964         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 965         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 966           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 967       }                                                                 \
 968     (r1) = _r1;                                                         \
 969     (r0) = _r0;                                                         \
 970   } while (0)
 971
 972 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 973    Both a and b must be in redc form, the result will be in redc form too. */
 974 static inline uintmax_t
 975 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 976 {
 977   uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
 978
 979   umul_ppmm (rh, rl, a, b);
 980   q = rl * mi;
 981   umul_ppmm (th, tl, q, m);
 982   xh = rh - th;
 983   if (rh < th)
 984     xh += m;
 985
 986   return xh;
 987 }
 988
 989 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 990    Both a and b must be in redc form, the result will be in redc form too.
 991    For performance reasons, the most significant bit of m must be clear. */
 992 static uintmax_t
 993 mulredc2 (uintmax_t *r1p,
 994           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 995           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 996 {
 997   uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
 998   mi = -mi;
 999   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
1000   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
1001   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
1002
1003   /* First compute a0 * <b1, b0> B^{-1}
1004         +-----+
1005         |a0 b0|
1006      +--+--+--+
1007      |a0 b1|
1008      +--+--+--+
1009         |q0 m0|
1010      +--+--+--+
1011      |q0 m1|
1012     -+--+--+--+
1013      |r1|r0| 0|
1014      +--+--+--+
1015   */
1016   umul_ppmm (t1, t0, a0, b0);
1017   umul_ppmm (r1, r0, a0, b1);
1018   q = mi * t0;
1019   umul_ppmm (p1, p0, q, m0);
1020   umul_ppmm (s1, s0, q, m1);
1021   r0 += (t0 != 0); /* Carry */
1022   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1023   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1024   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1025
1026   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1027         +-----+
1028         |a1 b0|
1029         +--+--+
1030         |r1|r0|
1031      +--+--+--+
1032      |a1 b1|
1033      +--+--+--+
1034         |q1 m0|
1035      +--+--+--+
1036      |q1 m1|
1037     -+--+--+--+
1038      |r1|r0| 0|
1039      +--+--+--+
1040   */
1041   umul_ppmm (t1, t0, a1, b0);
1042   umul_ppmm (s1, s0, a1, b1);
1043   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1044   q = mi * t0;
1045   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1046   umul_ppmm (p1, p0, q, m0);
1047   umul_ppmm (s1, s0, q, m1);
1048   r0 += (t0 != 0); /* Carry */
1049   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1050   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1051   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1052
1053   if (ge2 (r1, r0, m1, m0))
1054     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1055
1056   *r1p = r1;
1057   return r0;
1058 }
1059
1060 static uintmax_t _GL_ATTRIBUTE_CONST
1061 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1062 {
1063   uintmax_t y = one;
1064
1065   if (e & 1)
1066     y = b;
1067
1068   while (e != 0)
1069     {
1070       b = mulredc (b, b, n, ni);
1071       e >>= 1;
1072
1073       if (e & 1)
1074         y = mulredc (y, b, n, ni);
1075     }
1076
1077   return y;
1078 }
1079
1080 static uintmax_t
1081 powm2 (uintmax_t *r1m,
1082        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1083        uintmax_t ni, const uintmax_t *one)
1084 {
1085   uintmax_t r1, r0, b1, b0, n1, n0;
1086   unsigned int i;
1087   uintmax_t e;
1088
1089   b0 = bp[0];
1090   b1 = bp[1];
1091   n0 = np[0];
1092   n1 = np[1];
1093
1094   r0 = one[0];
1095   r1 = one[1];
1096
1097   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1098     {
1099       if (e & 1)
1100         {
1101           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1102           r1 = *r1m;
1103         }
1104       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1105       b1 = *r1m;
1106     }
1107   for (e = ep[1]; e > 0; e >>= 1)
1108     {
1109       if (e & 1)
1110         {
1111           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1112           r1 = *r1m;
1113         }
1114       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1115       b1 = *r1m;
1116     }
1117   *r1m = r1;
1118   return r0;
1119 }
1120
1121 static bool _GL_ATTRIBUTE_CONST
1122 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1123              unsigned int k, uintmax_t one)
1124 {
1125   uintmax_t y = powm (b, q, n, ni, one);
1126
1127   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1128
1129   if (y == one || y == nm1)
1130     return true;
1131
1132   for (unsigned int i = 1; i < k; i++)
1133     {
1134       y = mulredc (y, y, n, ni);
1135
1136       if (y == nm1)
1137         return true;
1138       if (y == one)
1139         return false;
1140     }
1141   return false;
1142 }
1143
1144 static bool
1145 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1146               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1147 {
1148   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1149
1150   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1151   y1 = r1m;
1152
1153   if (y0 == one[0] && y1 == one[1])
1154     return true;
1155
1156   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1157
1158   if (y0 == nm1_0 && y1 == nm1_1)
1159     return true;
1160
1161   for (unsigned int i = 1; i < k; i++)
1162     {
1163       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1164       y1 = r1m;
1165
1166       if (y0 == nm1_0 && y1 == nm1_1)
1167         return true;
1168       if (y0 == one[0] && y1 == one[1])
1169         return false;
1170     }
1171   return false;
1172 }
1173
1174 #if HAVE_GMP
1175 static bool
1176 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1177                 mpz_srcptr q, unsigned long int k)
1178 {
1179   mpz_powm (y, x, q, n);
1180
1181   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1182     return true;
1183
1184   for (unsigned long int i = 1; i < k; i++)
1185     {
1186       mpz_powm_ui (y, y, 2, n);
1187       if (mpz_cmp (y, nm1) == 0)
1188         return true;
1189       if (mpz_cmp_ui (y, 1) == 0)
1190         return false;
1191     }
1192   return false;
1193 }
1194 #endif
1195
1196 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1197    have been observed.  The average seem to be about 2.  */
1198 static bool
1199 prime_p (uintmax_t n)
1200 {
1201   int k;
1202   bool is_prime;
1203   uintmax_t a_prim, one, ni;
1204   struct factors factors;
1205
1206   if (n <= 1)
1207     return false;
1208
1209   /* We have already casted out small primes. */
1210   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1211     return true;
1212
1213   /* Precomputation for Miller-Rabin.  */
1214   uintmax_t q = n - 1;
1215   for (k = 0; (q & 1) == 0; k++)
1216     q >>= 1;
1217
1218   uintmax_t a = 2;
1219   binv (ni, n);                 /* ni <- 1/n mod B */
1220   redcify (one, 1, n);
1221   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1222
1223   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1224   if (!millerrabin (n, ni, a_prim, q, k, one))
1225     return false;
1226
1227   if (flag_prove_primality)
1228     {
1229       /* Factor n-1 for Lucas.  */
1230       factor (0, n - 1, &factors);
1231     }
1232
1233   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234      number composite.  */
1235   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1236     {
1237       if (flag_prove_primality)
1238         {
1239           is_prime = true;
1240           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1241             {
1242               is_prime
1243                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1244             }
1245         }
1246       else
1247         {
1248           /* After enough Miller-Rabin runs, be content. */
1249           is_prime = (r == MR_REPS - 1);
1250         }
1251
1252       if (is_prime)
1253         return true;
1254
1255       a += primes_diff[r];      /* Establish new base.  */
1256
1257       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1258          on most processors, since it avoids udiv_qrnnd.  If we go down the
1259          udiv_qrnnd_preinv path, this code should be replaced.  */
1260       {
1261         uintmax_t s1, s0;
1262         umul_ppmm (s1, s0, one, a);
1263         if (LIKELY (s1 == 0))
1264           a_prim = s0 % n;
1265         else
1266           {
1267             uintmax_t dummy _GL_UNUSED;
1268             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1269           }
1270       }
1271
1272       if (!millerrabin (n, ni, a_prim, q, k, one))
1273         return false;
1274     }
1275
1276   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1277   abort ();
1278 }
1279
1280 static bool
1281 prime2_p (uintmax_t n1, uintmax_t n0)
1282 {
1283   uintmax_t q[2], nm1[2];
1284   uintmax_t a_prim[2];
1285   uintmax_t one[2];
1286   uintmax_t na[2];
1287   uintmax_t ni;
1288   unsigned int k;
1289   struct factors factors;
1290
1291   if (n1 == 0)
1292     return prime_p (n0);
1293
1294   nm1[1] = n1 - (n0 == 0);
1295   nm1[0] = n0 - 1;
1296   if (nm1[0] == 0)
1297     {
1298       count_trailing_zeros (k, nm1[1]);
1299
1300       q[0] = nm1[1] >> k;
1301       q[1] = 0;
1302       k += W_TYPE_SIZE;
1303     }
1304   else
1305     {
1306       count_trailing_zeros (k, nm1[0]);
1307       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1308     }
1309
1310   uintmax_t a = 2;
1311   binv (ni, n0);
1312   redcify2 (one[1], one[0], 1, n1, n0);
1313   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1314
1315   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1316   na[0] = n0;
1317   na[1] = n1;
1318
1319   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1320     return false;
1321
1322   if (flag_prove_primality)
1323     {
1324       /* Factor n-1 for Lucas.  */
1325       factor (nm1[1], nm1[0], &factors);
1326     }
1327
1328   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1329      number composite.  */
1330   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1331     {
1332       bool is_prime;
1333       uintmax_t e[2], y[2];
1334
1335       if (flag_prove_primality)
1336         {
1337           is_prime = true;
1338           if (factors.plarge[1])
1339             {
1340               uintmax_t pi;
1341               binv (pi, factors.plarge[0]);
1342               e[0] = pi * nm1[0];
1343               e[1] = 0;
1344               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1345               is_prime = (y[0] != one[0] || y[1] != one[1]);
1346             }
1347           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1348             {
1349               /* FIXME: We always have the factor 2. Do we really need to
1350                  handle it here? We have done the same powering as part
1351                  of millerrabin. */
1352               if (factors.p[i] == 2)
1353                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1354               else
1355                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1356               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1357               is_prime = (y[0] != one[0] || y[1] != one[1]);
1358             }
1359         }
1360       else
1361         {
1362           /* After enough Miller-Rabin runs, be content. */
1363           is_prime = (r == MR_REPS - 1);
1364         }
1365
1366       if (is_prime)
1367         return true;
1368
1369       a += primes_diff[r];      /* Establish new base.  */
1370       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1371
1372       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1373         return false;
1374     }
1375
1376   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1377   abort ();
1378 }
1379
1380 #if HAVE_GMP
1381 static bool
1382 mp_prime_p (mpz_t n)
1383 {
1384   bool is_prime;
1385   mpz_t q, a, nm1, tmp;
1386   struct mp_factors factors;
1387
1388   if (mpz_cmp_ui (n, 1) <= 0)
1389     return false;
1390
1391   /* We have already casted out small primes. */
1392   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1393     return true;
1394
1395   mpz_inits (q, a, nm1, tmp, NULL);
1396
1397   /* Precomputation for Miller-Rabin.  */
1398   mpz_sub_ui (nm1, n, 1);
1399
1400   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1401   unsigned long int k = mpz_scan1 (nm1, 0);
1402   mpz_tdiv_q_2exp (q, nm1, k);
1403
1404   mpz_set_ui (a, 2);
1405
1406   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1407   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1408     {
1409       is_prime = false;
1410       goto ret2;
1411     }
1412
1413   if (flag_prove_primality)
1414     {
1415       /* Factor n-1 for Lucas.  */
1416       mpz_set (tmp, nm1);
1417       mp_factor (tmp, &factors);
1418     }
1419
1420   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1421      number composite.  */
1422   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1423     {
1424       if (flag_prove_primality)
1425         {
1426           is_prime = true;
1427           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1428             {
1429               mpz_divexact (tmp, nm1, factors.p[i]);
1430               mpz_powm (tmp, a, tmp, n);
1431               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1432             }
1433         }
1434       else
1435         {
1436           /* After enough Miller-Rabin runs, be content. */
1437           is_prime = (r == MR_REPS - 1);
1438         }
1439
1440       if (is_prime)
1441         goto ret1;
1442
1443       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1444
1445       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1446         {
1447           is_prime = false;
1448           goto ret1;
1449         }
1450     }
1451
1452   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1453   abort ();
1454
1455  ret1:
1456   if (flag_prove_primality)
1457     mp_factor_clear (&factors);
1458  ret2:
1459   mpz_clears (q, a, nm1, tmp, NULL);
1460
1461   return is_prime;
1462 }
1463 #endif
1464
1465 static void
1466 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1467                           struct factors *factors)
1468 {
1469   uintmax_t x, z, y, P, t, ni, g;
1470
1471   unsigned long int k = 1;
1472   unsigned long int l = 1;
1473
1474   redcify (P, 1, n);
1475   addmod (x, P, P, n);          /* i.e., redcify(2) */
1476   y = z = x;
1477
1478   while (n != 1)
1479     {
1480       assert (a < n);
1481
1482       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1483
1484       for (;;)
1485         {
1486           do
1487             {
1488               x = mulredc (x, x, n, ni);
1489               addmod (x, x, a, n);
1490
1491               submod (t, z, x, n);
1492               P = mulredc (P, t, n, ni);
1493
1494               if (k % 32 == 1)
1495                 {
1496                   if (gcd_odd (P, n) != 1)
1497                     goto factor_found;
1498                   y = x;
1499                 }
1500             }
1501           while (--k != 0);
1502
1503           z = x;
1504           k = l;
1505           l = 2 * l;
1506           for (unsigned long int i = 0; i < k; i++)
1507             {
1508               x = mulredc (x, x, n, ni);
1509               addmod (x, x, a, n);
1510             }
1511           y = x;
1512         }
1513
1514     factor_found:
1515       do
1516         {
1517           y = mulredc (y, y, n, ni);
1518           addmod (y, y, a, n);
1519
1520           submod (t, z, y, n);
1521           g = gcd_odd (t, n);
1522         }
1523       while (g == 1);
1524
1525       if (n == g)
1526         {
1527           /* Found n itself as factor.  Restart with different params.  */
1528           factor_using_pollard_rho (n, a + 1, factors);
1529           return;
1530         }
1531
1532       n = n / g;
1533
1534       if (!prime_p (g))
1535         factor_using_pollard_rho (g, a + 1, factors);
1536       else
1537         factor_insert (factors, g);
1538
1539       if (prime_p (n))
1540         {
1541           factor_insert (factors, n);
1542           break;
1543         }
1544
1545       x = x % n;
1546       z = z % n;
1547       y = y % n;
1548     }
1549 }
1550
1551 static void
1552 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1553                            struct factors *factors)
1554 {
1555   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1556
1557   unsigned long int k = 1;
1558   unsigned long int l = 1;
1559
1560   redcify2 (P1, P0, 1, n1, n0);
1561   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1562   y1 = z1 = x1;
1563   y0 = z0 = x0;
1564
1565   while (n1 != 0 || n0 != 1)
1566     {
1567       binv (ni, n0);
1568
1569       for (;;)
1570         {
1571           do
1572             {
1573               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1574               x1 = r1m;
1575               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1576
1577               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1578               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1579               P1 = r1m;
1580
1581               if (k % 32 == 1)
1582                 {
1583                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1584                   if (g1 != 0 || g0 != 1)
1585                     goto factor_found;
1586                   y1 = x1; y0 = x0;
1587                 }
1588             }
1589           while (--k != 0);
1590
1591           z1 = x1; z0 = x0;
1592           k = l;
1593           l = 2 * l;
1594           for (unsigned long int i = 0; i < k; i++)
1595             {
1596               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1597               x1 = r1m;
1598               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1599             }
1600           y1 = x1; y0 = x0;
1601         }
1602
1603     factor_found:
1604       do
1605         {
1606           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1607           y1 = r1m;
1608           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1609
1610           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1611           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1612         }
1613       while (g1 == 0 && g0 == 1);
1614
1615       if (g1 == 0)
1616         {
1617           /* The found factor is one word, and > 1. */
1618           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1619
1620           if (!prime_p (g0))
1621             factor_using_pollard_rho (g0, a + 1, factors);
1622           else
1623             factor_insert (factors, g0);
1624         }
1625       else
1626         {
1627           /* The found factor is two words.  This is highly unlikely, thus hard
1628              to trigger.  Please be careful before you change this code!  */
1629           uintmax_t ginv;
1630
1631           if (n1 == g1 && n0 == g0)
1632             {
1633               /* Found n itself as factor.  Restart with different params.  */
1634               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1635               return;
1636             }
1637
1638           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1639           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1640           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1641
1642           if (!prime2_p (g1, g0))
1643             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1644           else
1645             factor_insert_large (factors, g1, g0);
1646         }
1647
1648       if (n1 == 0)
1649         {
1650           if (prime_p (n0))
1651             {
1652               factor_insert (factors, n0);
1653               break;
1654             }
1655
1656           factor_using_pollard_rho (n0, a, factors);
1657           return;
1658         }
1659
1660       if (prime2_p (n1, n0))
1661         {
1662           factor_insert_large (factors, n1, n0);
1663           break;
1664         }
1665
1666       x0 = mod2 (&x1, x1, x0, n1, n0);
1667       z0 = mod2 (&z1, z1, z0, n1, n0);
1668       y0 = mod2 (&y1, y1, y0, n1, n0);
1669     }
1670 }
1671
1672 #if HAVE_GMP
1673 static void
1674 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1675                              struct mp_factors *factors)
1676 {
1677   mpz_t x, z, y, P;
1678   mpz_t t, t2;
1679
1680   devmsg ("[pollard-rho (%lu)] ", a);
1681
1682   mpz_inits (t, t2, NULL);
1683   mpz_init_set_si (y, 2);
1684   mpz_init_set_si (x, 2);
1685   mpz_init_set_si (z, 2);
1686   mpz_init_set_ui (P, 1);
1687
1688   unsigned long long int k = 1;
1689   unsigned long long int l = 1;
1690
1691   while (mpz_cmp_ui (n, 1) != 0)
1692     {
1693       for (;;)
1694         {
1695           do
1696             {
1697               mpz_mul (t, x, x);
1698               mpz_mod (x, t, n);
1699               mpz_add_ui (x, x, a);
1700
1701               mpz_sub (t, z, x);
1702               mpz_mul (t2, P, t);
1703               mpz_mod (P, t2, n);
1704
1705               if (k % 32 == 1)
1706                 {
1707                   mpz_gcd (t, P, n);
1708                   if (mpz_cmp_ui (t, 1) != 0)
1709                     goto factor_found;
1710                   mpz_set (y, x);
1711                 }
1712             }
1713           while (--k != 0);
1714
1715           mpz_set (z, x);
1716           k = l;
1717           l = 2 * l;
1718           for (unsigned long long int i = 0; i < k; i++)
1719             {
1720               mpz_mul (t, x, x);
1721               mpz_mod (x, t, n);
1722               mpz_add_ui (x, x, a);
1723             }
1724           mpz_set (y, x);
1725         }
1726
1727     factor_found:
1728       do
1729         {
1730           mpz_mul (t, y, y);
1731           mpz_mod (y, t, n);
1732           mpz_add_ui (y, y, a);
1733
1734           mpz_sub (t, z, y);
1735           mpz_gcd (t, t, n);
1736         }
1737       while (mpz_cmp_ui (t, 1) == 0);
1738
1739       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1740
1741       if (!mp_prime_p (t))
1742         {
1743           devmsg ("[composite factor--restarting pollard-rho] ");
1744           mp_factor_using_pollard_rho (t, a + 1, factors);
1745         }
1746       else
1747         {
1748           mp_factor_insert (factors, t);
1749         }
1750
1751       if (mp_prime_p (n))
1752         {
1753           mp_factor_insert (factors, n);
1754           break;
1755         }
1756
1757       mpz_mod (x, x, n);
1758       mpz_mod (z, z, n);
1759       mpz_mod (y, y, n);
1760     }
1761
1762   mpz_clears (P, t2, t, z, x, y, NULL);
1763 }
1764 #endif
1765
1766 #if USE_SQUFOF
1767 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1768    algorithm is replaced, consider also returning the remainder. */
1769 static uintmax_t _GL_ATTRIBUTE_CONST
1770 isqrt (uintmax_t n)
1771 {
1772   uintmax_t x;
1773   unsigned c;
1774   if (n == 0)
1775     return 0;
1776
1777   count_leading_zeros (c, n);
1778
1779   /* Make x > sqrt(n). This will be invariant through the loop. */
1780   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1781
1782   for (;;)
1783     {
1784       uintmax_t y = (x + n/x) / 2;
1785       if (y >= x)
1786         return x;
1787
1788       x = y;
1789     }
1790 }
1791
1792 static uintmax_t _GL_ATTRIBUTE_CONST
1793 isqrt2 (uintmax_t nh, uintmax_t nl)
1794 {
1795   unsigned int shift;
1796   uintmax_t x;
1797
1798   /* Ensures the remainder fits in an uintmax_t. */
1799   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1800
1801   if (nh == 0)
1802     return isqrt (nl);
1803
1804   count_leading_zeros (shift, nh);
1805   shift &= ~1;
1806
1807   /* Make x > sqrt(n) */
1808   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1809   x <<= (W_TYPE_SIZE - shift) / 2;
1810
1811   /* Do we need more than one iteration? */
1812   for (;;)
1813     {
1814       uintmax_t r _GL_UNUSED;
1815       uintmax_t q, y;
1816       udiv_qrnnd (q, r, nh, nl, x);
1817       y = (x + q) / 2;
1818
1819       if (y >= x)
1820         {
1821           uintmax_t hi, lo;
1822           umul_ppmm (hi, lo, x + 1, x + 1);
1823           assert (gt2 (hi, lo, nh, nl));
1824
1825           umul_ppmm (hi, lo, x, x);
1826           assert (ge2 (nh, nl, hi, lo));
1827           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1828           assert (hi == 0);
1829
1830           return x;
1831         }
1832
1833       x = y;
1834     }
1835 }
1836
1837 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1838 # define MAGIC64 0x0202021202030213ULL
1839 # define MAGIC63 0x0402483012450293ULL
1840 # define MAGIC65 0x218a019866014613ULL
1841 # define MAGIC11 0x23b
1842
1843 /* Return the square root if the input is a square, otherwise 0. */
1844 static uintmax_t _GL_ATTRIBUTE_CONST
1845 is_square (uintmax_t x)
1846 {
1847   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1848      computing the square root. */
1849   if (((MAGIC64 >> (x & 63)) & 1)
1850       && ((MAGIC63 >> (x % 63)) & 1)
1851       /* Both 0 and 64 are squares mod (65) */
1852       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1853       && ((MAGIC11 >> (x % 11) & 1)))
1854     {
1855       uintmax_t r = isqrt (x);
1856       if (r*r == x)
1857         return r;
1858     }
1859   return 0;
1860 }
1861
1862 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1863 static const unsigned short invtab[0x81] =
1864   {
1865     0x200,
1866     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1867     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1868     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1869     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1870     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1871     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1872     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1873     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1874     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1875     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1876     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1877     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1878     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1879     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1880     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1881     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1882   };
1883
1884 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1885    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1886 # define div_smallq(q, r, u, d)                                          \
1887   do {                                                                  \
1888     if ((u) / 0x40 < (d))                                               \
1889       {                                                                 \
1890         int _cnt;                                                       \
1891         uintmax_t _dinv, _mask, _q, _r;                                 \
1892         count_leading_zeros (_cnt, (d));                                \
1893         _r = (u);                                                       \
1894         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1895           {                                                             \
1896             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1897             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1898           }                                                             \
1899         else                                                            \
1900           {                                                             \
1901             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1902             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1903           }                                                             \
1904         _r -= _q*(d);                                                   \
1905                                                                         \
1906         _mask = -(uintmax_t) (_r >= (d));                               \
1907         (r) = _r - (_mask & (d));                                       \
1908         (q) = _q - _mask;                                               \
1909         assert ( (q) * (d) + (r) == u);                                 \
1910       }                                                                 \
1911     else                                                                \
1912       {                                                                 \
1913         uintmax_t _q = (u) / (d);                                       \
1914         (r) = (u) - _q * (d);                                           \
1915         (q) = _q;                                                       \
1916       }                                                                 \
1917   } while (0)
1918
1919 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1920    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1921    with 3025 = 55^2.
1922
1923    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1924    representing G_0 = (-55, 2*4652, 8653).
1925
1926    In the notation of the paper:
1927
1928    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1929
1930    Put
1931
1932      t_0 = floor([q_0 + R_0] / S0) = 1
1933      R_1 = t_0 * S_0 - R_0 = 4001
1934      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1935 */
1936
1937 /* Multipliers, in order of efficiency:
1938    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1939    0.7317  3*5*7    =  105 = 1
1940    0.7820  3*5*11   =  165 = 1
1941    0.7872  3*5      =   15 = 3
1942    0.8101  3*7*11   =  231 = 3
1943    0.8155  3*7      =   21 = 1
1944    0.8284  5*7*11   =  385 = 1
1945    0.8339  5*7      =   35 = 3
1946    0.8716  3*11     =   33 = 1
1947    0.8774  3        =    3 = 3
1948    0.8913  5*11     =   55 = 3
1949    0.8972  5        =    5 = 1
1950    0.9233  7*11     =   77 = 1
1951    0.9295  7        =    7 = 3
1952    0.9934  11       =   11 = 3
1953 */
1954 # define QUEUE_SIZE 50
1955 #endif
1956
1957 #if STAT_SQUFOF
1958 # define Q_FREQ_SIZE 50
1959 /* Element 0 keeps the total */
1960 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1961 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1962 #endif
1963
1964 #if USE_SQUFOF
1965 /* Return true on success.  Expected to fail only for numbers
1966    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1967 static bool
1968 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1969 {
1970   /* Uses algorithm and notation from
1971
1972      SQUARE FORM FACTORIZATION
1973      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1974
1975      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1976    */
1977
1978   static const unsigned int multipliers_1[] =
1979     { /* = 1 (mod 4) */
1980       105, 165, 21, 385, 33, 5, 77, 1, 0
1981     };
1982   static const unsigned int multipliers_3[] =
1983     { /* = 3 (mod 4) */
1984       1155, 15, 231, 35, 3, 55, 7, 11, 0
1985     };
1986
1987   const unsigned int *m;
1988
1989   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1990
1991   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1992     return false;
1993
1994   uintmax_t sqrt_n = isqrt2 (n1, n0);
1995
1996   if (n0 == sqrt_n * sqrt_n)
1997     {
1998       uintmax_t p1, p0;
1999
2000       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
2001       assert (p0 == n0);
2002
2003       if (n1 == p1)
2004         {
2005           if (prime_p (sqrt_n))
2006             factor_insert_multiplicity (factors, sqrt_n, 2);
2007           else
2008             {
2009               struct factors f;
2010
2011               f.nfactors = 0;
2012               if (!factor_using_squfof (0, sqrt_n, &f))
2013                 {
2014                   /* Try pollard rho instead */
2015                   factor_using_pollard_rho (sqrt_n, 1, &f);
2016                 }
2017               /* Duplicate the new factors */
2018               for (unsigned int i = 0; i < f.nfactors; i++)
2019                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
2020             }
2021           return true;
2022         }
2023     }
2024
2025   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2026   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2027        *m; m++)
2028     {
2029       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2030       unsigned int i;
2031       unsigned int mu = *m;
2032       unsigned int qpos = 0;
2033
2034       assert (mu * n0 % 4 == 3);
2035
2036       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2037          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2038          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2039          mu^3 < n.
2040
2041          However, this seems insufficient: With n = 37243139 and mu =
2042          105, we get a trivial factor, from the square 38809 = 197^2,
2043          without any corresponding Q earlier in the iteration.
2044
2045          Requiring 64 mu^3 < n seems sufficient. */
2046       if (n1 == 0)
2047         {
2048           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2049             continue;
2050         }
2051       else
2052         {
2053           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2054             continue;
2055         }
2056       umul_ppmm (Dh, Dl, n0, mu);
2057       Dh += n1 * mu;
2058
2059       assert (Dl % 4 != 1);
2060       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2061
2062       S = isqrt2 (Dh, Dl);
2063
2064       Q1 = 1;
2065       P = S;
2066
2067       /* Square root remainder fits in one word, so ignore high part. */
2068       Q = Dl - P*P;
2069       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2070       L = isqrt (2*S);
2071       B = 2*L;
2072       L1 = mu * 2 * L;
2073
2074       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2075          4 D. */
2076
2077       for (i = 0; i <= B; i++)
2078         {
2079           uintmax_t q, P1, t, rem;
2080
2081           div_smallq (q, rem, S+P, Q);
2082           P1 = S - rem; /* P1 = q*Q - P */
2083
2084           IF_LINT (assert (q > 0 && Q > 0));
2085
2086 # if STAT_SQUFOF
2087           q_freq[0]++;
2088           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2089 # endif
2090
2091           if (Q <= L1)
2092             {
2093               uintmax_t g = Q;
2094
2095               if ( (Q & 1) == 0)
2096                 g /= 2;
2097
2098               g /= gcd_odd (g, mu);
2099
2100               if (g <= L)
2101                 {
2102                   if (qpos >= QUEUE_SIZE)
2103                     die (EXIT_FAILURE, 0, _("squfof queue overflow"));
2104                   queue[qpos].Q = g;
2105                   queue[qpos].P = P % g;
2106                   qpos++;
2107                 }
2108             }
2109
2110           /* I think the difference can be either sign, but mod
2111              2^W_TYPE_SIZE arithmetic should be fine. */
2112           t = Q1 + q * (P - P1);
2113           Q1 = Q;
2114           Q = t;
2115           P = P1;
2116
2117           if ( (i & 1) == 0)
2118             {
2119               uintmax_t r = is_square (Q);
2120               if (r)
2121                 {
2122                   for (unsigned int j = 0; j < qpos; j++)
2123                     {
2124                       if (queue[j].Q == r)
2125                         {
2126                           if (r == 1)
2127                             /* Traversed entire cycle. */
2128                             goto next_multiplier;
2129
2130                           /* Need the absolute value for divisibility test. */
2131                           if (P >= queue[j].P)
2132                             t = P - queue[j].P;
2133                           else
2134                             t = queue[j].P - P;
2135                           if (t % r == 0)
2136                             {
2137                               /* Delete entries up to and including entry
2138                                  j, which matched. */
2139                               memmove (queue, queue + j + 1,
2140                                        (qpos - j - 1) * sizeof (queue[0]));
2141                               qpos -= (j + 1);
2142                             }
2143                           goto next_i;
2144                         }
2145                     }
2146
2147                   /* We have found a square form, which should give a
2148                      factor. */
2149                   Q1 = r;
2150                   assert (S >= P); /* What signs are possible? */
2151                   P += r * ((S - P) / r);
2152
2153                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2154                      for the case D = 2N. */
2155                   /* Compute Q = (D - P*P) / Q1, but we need double
2156                      precision. */
2157                   uintmax_t hi, lo;
2158                   umul_ppmm (hi, lo, P, P);
2159                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2160                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2161                   assert (rem == 0);
2162
2163                   for (;;)
2164                     {
2165                       /* Note: There appears to by a typo in the paper,
2166                          Step 4a in the algorithm description says q <--
2167                          floor([S+P]/\hat Q), but looking at the equations
2168                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2169                          (In this code, \hat Q is Q1). */
2170                       div_smallq (q, rem, S+P, Q);
2171                       P1 = S - rem;     /* P1 = q*Q - P */
2172
2173 # if STAT_SQUFOF
2174                       q_freq[0]++;
2175                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2176 # endif
2177                       if (P == P1)
2178                         break;
2179                       t = Q1 + q * (P - P1);
2180                       Q1 = Q;
2181                       Q = t;
2182                       P = P1;
2183                     }
2184
2185                   if ( (Q & 1) == 0)
2186                     Q /= 2;
2187                   Q /= gcd_odd (Q, mu);
2188
2189                   assert (Q > 1 && (n1 || Q < n0));
2190
2191                   if (prime_p (Q))
2192                     factor_insert (factors, Q);
2193                   else if (!factor_using_squfof (0, Q, factors))
2194                     factor_using_pollard_rho (Q, 2, factors);
2195
2196                   divexact_21 (n1, n0, n1, n0, Q);
2197
2198                   if (prime2_p (n1, n0))
2199                     factor_insert_large (factors, n1, n0);
2200                   else
2201                     {
2202                       if (!factor_using_squfof (n1, n0, factors))
2203                         {
2204                           if (n1 == 0)
2205                             factor_using_pollard_rho (n0, 1, factors);
2206                           else
2207                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2208                         }
2209                     }
2210
2211                   return true;
2212                 }
2213             }
2214         next_i:;
2215         }
2216     next_multiplier:;
2217     }
2218   return false;
2219 }
2220 #endif
2221
2222 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2223    results in FACTORS.  */
2224 static void
2225 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2226 {
2227   factors->nfactors = 0;
2228   factors->plarge[1] = 0;
2229
2230   if (t1 == 0 && t0 < 2)
2231     return;
2232
2233   t0 = factor_using_division (&t1, t1, t0, factors);
2234
2235   if (t1 == 0 && t0 < 2)
2236     return;
2237
2238   if (prime2_p (t1, t0))
2239     factor_insert_large (factors, t1, t0);
2240   else
2241     {
2242 #if USE_SQUFOF
2243       if (factor_using_squfof (t1, t0, factors))
2244         return;
2245 #endif
2246
2247       if (t1 == 0)
2248         factor_using_pollard_rho (t0, 1, factors);
2249       else
2250         factor_using_pollard_rho2 (t1, t0, 1, factors);
2251     }
2252 }
2253
2254 #if HAVE_GMP
2255 /* Use Pollard-rho to compute the prime factors of
2256    arbitrary-precision T, and put the results in FACTORS.  */
2257 static void
2258 mp_factor (mpz_t t, struct mp_factors *factors)
2259 {
2260   mp_factor_init (factors);
2261
2262   if (mpz_sgn (t) != 0)
2263     {
2264       mp_factor_using_division (t, factors);
2265
2266       if (mpz_cmp_ui (t, 1) != 0)
2267         {
2268           devmsg ("[is number prime?] ");
2269           if (mp_prime_p (t))
2270             mp_factor_insert (factors, t);
2271           else
2272             mp_factor_using_pollard_rho (t, 1, factors);
2273         }
2274     }
2275 }
2276 #endif
2277
2278 static strtol_error
2279 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2280 {
2281   unsigned int lo_carry;
2282   uintmax_t hi = 0, lo = 0;
2283
2284   strtol_error err = LONGINT_INVALID;
2285
2286   /* Skip initial spaces and '+'.  */
2287   for (;;)
2288     {
2289       char c = *s;
2290       if (c == ' ')
2291         s++;
2292       else if (c == '+')
2293         {
2294           s++;
2295           break;
2296         }
2297       else
2298         break;
2299     }
2300
2301   /* Initial scan for invalid digits.  */
2302   const char *p = s;
2303   for (;;)
2304     {
2305       unsigned int c = *p++;
2306       if (c == 0)
2307         break;
2308
2309       if (UNLIKELY (!ISDIGIT (c)))
2310         {
2311           err = LONGINT_INVALID;
2312           break;
2313         }
2314
2315       err = LONGINT_OK;           /* we've seen at least one valid digit */
2316     }
2317
2318   for (;err == LONGINT_OK;)
2319     {
2320       unsigned int c = *s++;
2321       if (c == 0)
2322         break;
2323
2324       c -= '0';
2325
2326       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2327         {
2328           err = LONGINT_OVERFLOW;
2329           break;
2330         }
2331       hi = 10 * hi;
2332
2333       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2334       lo_carry += 10 * lo < 2 * lo;
2335
2336       lo = 10 * lo;
2337       lo += c;
2338
2339       lo_carry += lo < c;
2340       hi += lo_carry;
2341       if (UNLIKELY (hi < lo_carry))
2342         {
2343           err = LONGINT_OVERFLOW;
2344           break;
2345         }
2346     }
2347
2348   *hip = hi;
2349   *lop = lo;
2350
2351   return err;
2352 }
2353
2354 /* Structure and routines for buffering and outputting full lines,
2355    to support parallel operation efficiently.  */
2356 static struct lbuf_
2357 {
2358   char *buf;
2359   char *end;
2360 } lbuf;
2361
2362 /* 512 is chosen to give good performance,
2363    and also is the max guaranteed size that
2364    consumers can read atomically through pipes.
2365    Also it's big enough to cater for max line length
2366    even with 128 bit uintmax_t.  */
2367 #define FACTOR_PIPE_BUF 512
2368
2369 static void
2370 lbuf_alloc (void)
2371 {
2372   if (lbuf.buf)
2373     return;
2374
2375   /* Double to ensure enough space for
2376      previous numbers + next number.  */
2377   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2378   lbuf.end = lbuf.buf;
2379 }
2380
2381 /* Write complete LBUF to standard output.  */
2382 static void
2383 lbuf_flush (void)
2384 {
2385   size_t size = lbuf.end - lbuf.buf;
2386   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2387     die (EXIT_FAILURE, errno, "%s", _("write error"));
2388   lbuf.end = lbuf.buf;
2389 }
2390
2391 /* Add a character C to LBUF and if it's a newline
2392    and enough bytes are already buffered,
2393    then write atomically to standard output.  */
2394 static void
2395 lbuf_putc (char c)
2396 {
2397   *lbuf.end++ = c;
2398
2399   if (c == '\n')
2400     {
2401       size_t buffered = lbuf.end - lbuf.buf;
2402
2403       /* Provide immediate output for interactive input.  */
2404       static int line_buffered = -1;
2405       if (line_buffered == -1)
2406         line_buffered = isatty (STDIN_FILENO);
2407       if (line_buffered)
2408         lbuf_flush ();
2409       else if (buffered >= FACTOR_PIPE_BUF)
2410         {
2411           /* Write output in <= PIPE_BUF chunks
2412              so consumers can read atomically.  */
2413           char const *tend = lbuf.end;
2414
2415           /* Since a umaxint_t's factors must fit in 512
2416              we're guaranteed to find a newline here.  */
2417           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2418           while (*--tlend != '\n');
2419           tlend++;
2420
2421           lbuf.end = tlend;
2422           lbuf_flush ();
2423
2424           /* Buffer the remainder.  */
2425           memcpy (lbuf.buf, tlend, tend - tlend);
2426           lbuf.end = lbuf.buf + (tend - tlend);
2427         }
2428     }
2429 }
2430
2431 /* Buffer an int to the internal LBUF.  */
2432 static void
2433 lbuf_putint (uintmax_t i, size_t min_width)
2434 {
2435   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2436   char const *umaxstr = umaxtostr (i, buf);
2437   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2438   size_t z = width;
2439
2440   for (; z < min_width; z++)
2441     *lbuf.end++ = '0';
2442
2443   memcpy (lbuf.end, umaxstr, width);
2444   lbuf.end += width;
2445 }
2446
2447 static void
2448 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2449 {
2450   uintmax_t q, r;
2451
2452   if (t1 == 0)
2453     lbuf_putint (t0, 0);
2454   else
2455     {
2456       /* Use very plain code here since it seems hard to write fast code
2457          without assuming a specific word size.  */
2458       q = t1 / 1000000000;
2459       r = t1 % 1000000000;
2460       udiv_qrnnd (t0, r, r, t0, 1000000000);
2461       print_uintmaxes (q, t0);
2462       lbuf_putint (r, 9);
2463     }
2464 }
2465
2466 /* Single-precision factoring */
2467 static void
2468 print_factors_single (uintmax_t t1, uintmax_t t0)
2469 {
2470   struct factors factors;
2471
2472   print_uintmaxes (t1, t0);
2473   lbuf_putc (':');
2474
2475   factor (t1, t0, &factors);
2476
2477   for (unsigned int j = 0; j < factors.nfactors; j++)
2478     for (unsigned int k = 0; k < factors.e[j]; k++)
2479       {
2480         lbuf_putc (' ');
2481         print_uintmaxes (0, factors.p[j]);
2482       }
2483
2484   if (factors.plarge[1])
2485     {
2486       lbuf_putc (' ');
2487       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2488     }
2489
2490   lbuf_putc ('\n');
2491 }
2492
2493 /* Emit the factors of the indicated number.  If we have the option of using
2494    either algorithm, we select on the basis of the length of the number.
2495    For longer numbers, we prefer the MP algorithm even if the native algorithm
2496    has enough digits, because the algorithm is better.  The turnover point
2497    depends on the value.  */
2498 static bool
2499 print_factors (const char *input)
2500 {
2501   uintmax_t t1, t0;
2502
2503   /* Try converting the number to one or two words.  If it fails, use GMP or
2504      print an error message.  The 2nd condition checks that the most
2505      significant bit of the two-word number is clear, in a typesize neutral
2506      way.  */
2507   strtol_error err = strto2uintmax (&t1, &t0, input);
2508
2509   switch (err)
2510     {
2511     case LONGINT_OK:
2512       if (((t1 << 1) >> 1) == t1)
2513         {
2514           devmsg ("[using single-precision arithmetic] ");
2515           print_factors_single (t1, t0);
2516           return true;
2517         }
2518       break;
2519
2520     case LONGINT_OVERFLOW:
2521       /* Try GMP.  */
2522       break;
2523
2524     default:
2525       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2526       return false;
2527     }
2528
2529 #if HAVE_GMP
2530   devmsg ("[using arbitrary-precision arithmetic] ");
2531   mpz_t t;
2532   struct mp_factors factors;
2533
2534   mpz_init_set_str (t, input, 10);
2535
2536   gmp_printf ("%Zd:", t);
2537   mp_factor (t, &factors);
2538
2539   for (unsigned int j = 0; j < factors.nfactors; j++)
2540     for (unsigned int k = 0; k < factors.e[j]; k++)
2541       gmp_printf (" %Zd", factors.p[j]);
2542
2543   mp_factor_clear (&factors);
2544   mpz_clear (t);
2545   putchar ('\n');
2546   fflush (stdout);
2547   return true;
2548 #else
2549   error (0, 0, _("%s is too large"), quote (input));
2550   return false;
2551 #endif
2552 }
2553
2554 void
2555 usage (int status)
2556 {
2557   if (status != EXIT_SUCCESS)
2558     emit_try_help ();
2559   else
2560     {
2561       printf (_("\
2562 Usage: %s [NUMBER]...\n\
2563   or:  %s OPTION\n\
2564 "),
2565               program_name, program_name);
2566       fputs (_("\
2567 Print the prime factors of each specified integer NUMBER.  If none\n\
2568 are specified on the command line, read them from standard input.\n\
2569 \n\
2570 "), stdout);
2571       fputs (HELP_OPTION_DESCRIPTION, stdout);
2572       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2573       emit_ancillary_info (PROGRAM_NAME);
2574     }
2575   exit (status);
2576 }
2577
2578 static bool
2579 do_stdin (void)
2580 {
2581   bool ok = true;
2582   token_buffer tokenbuffer;
2583
2584   init_tokenbuffer (&tokenbuffer);
2585
2586   while (true)
2587     {
2588       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2589                                        &tokenbuffer);
2590       if (token_length == (size_t) -1)
2591         break;
2592       ok &= print_factors (tokenbuffer.buffer);
2593     }
2594   free (tokenbuffer.buffer);
2595
2596   return ok;
2597 }
2598
2599 int
2600 main (int argc, char **argv)
2601 {
2602   initialize_main (&argc, &argv);
2603   set_program_name (argv[0]);
2604   setlocale (LC_ALL, "");
2605   bindtextdomain (PACKAGE, LOCALEDIR);
2606   textdomain (PACKAGE);
2607
2608   lbuf_alloc ();
2609   atexit (close_stdout);
2610   atexit (lbuf_flush);
2611
2612   int c;
2613   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2614     {
2615       switch (c)
2616         {
2617         case DEV_DEBUG_OPTION:
2618           dev_debug = true;
2619           break;
2620
2621         case_GETOPT_HELP_CHAR;
2622
2623         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2624
2625         default:
2626           usage (EXIT_FAILURE);
2627         }
2628     }
2629
2630 #if STAT_SQUFOF
2631   memset (q_freq, 0, sizeof (q_freq));
2632 #endif
2633
2634   bool ok;
2635   if (argc <= optind)
2636     ok = do_stdin ();
2637   else
2638     {
2639       ok = true;
2640       for (int i = optind; i < argc; i++)
2641         if (! print_factors (argv[i]))
2642           ok = false;
2643     }
2644
2645 #if STAT_SQUFOF
2646   if (q_freq[0] > 0)
2647     {
2648       double acc_f;
2649       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2650       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2651         {
2652           double f = (double) q_freq[i] / q_freq[0];
2653           acc_f += f;
2654           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2655                   100.0 * f, 100.0 * acc_f);
2656         }
2657     }
2658 #endif
2659
2660   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2661 }