src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synced 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or if USE_SQUFOF is defined to 1, try that first.
  54         Status of found factors are checked again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #if HAVE_GMP
 108 # include <gmp.h>
 109 # if !HAVE_DECL_MPZ_INITS
 110 #  include <stdarg.h>
 111 # endif
 112 #endif
 113
 114 #include <assert.h>
 115
 116 #include "system.h"
 117 #include "die.h"
 118 #include "error.h"
 119 #include "full-write.h"
 120 #include "quote.h"
 121 #include "readtokens.h"
 122 #include "xstrtol.h"
 123
 124 /* The official name of this program (e.g., no 'g' prefix).  */
 125 #define PROGRAM_NAME "factor"
 126
 127 #define AUTHORS \
 128   proper_name ("Paul Rubin"),                                           \
 129   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 130   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 131
 132 /* Token delimiters when reading from a file.  */
 133 #define DELIM "\n\t "
 134
 135 #ifndef USE_LONGLONG_H
 136 /* With the way we use longlong.h, it's only safe to use
 137    when UWtype = UHWtype, as there were various cases
 138    (as can be seen in the history for longlong.h) where
 139    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 140    to avoid compile time or run time issues.  */
 141 # if LONG_MAX == INTMAX_MAX
 142 #  define USE_LONGLONG_H 1
 143 # endif
 144 #endif
 145
 146 #if USE_LONGLONG_H
 147
 148 /* Make definitions for longlong.h to make it do what it can do for us */
 149
 150 /* bitcount for uintmax_t */
 151 # if UINTMAX_MAX == UINT32_MAX
 152 #  define W_TYPE_SIZE 32
 153 # elif UINTMAX_MAX == UINT64_MAX
 154 #  define W_TYPE_SIZE 64
 155 # elif UINTMAX_MAX == UINT128_MAX
 156 #  define W_TYPE_SIZE 128
 157 # endif
 158
 159 # define UWtype  uintmax_t
 160 # define UHWtype unsigned long int
 161 # undef UDWtype
 162 # if HAVE_ATTRIBUTE_MODE
 163 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 164 typedef          int SItype     __attribute__ ((mode (SI)));
 165 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 166 typedef          int DItype     __attribute__ ((mode (DI)));
 167 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 168 # else
 169 typedef unsigned char UQItype;
 170 typedef          long SItype;
 171 typedef unsigned long int USItype;
 172 #  if HAVE_LONG_LONG_INT
 173 typedef long long int DItype;
 174 typedef unsigned long long int UDItype;
 175 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 176 typedef long int DItype;
 177 typedef unsigned long int UDItype;
 178 #  endif
 179 # endif
 180 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 181 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 182 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 183 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 184 # ifndef __GMP_GNUC_PREREQ
 185 #  define __GMP_GNUC_PREREQ(a,b) 1
 186 # endif
 187
 188 /* These stub macros are only used in longlong.h in certain system compiler
 189    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 190 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 191 ASSERT (1)
 192 __GMP_DECLSPEC
 193 # endif
 194
 195 # if _ARCH_PPC
 196 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 197 # endif
 198 # include "longlong.h"
 199 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 200 const unsigned char factor_clz_tab[129] =
 201 {
 202   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 203   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 204   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 205   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 206   9
 207 };
 208 # endif
 209
 210 #else /* not USE_LONGLONG_H */
 211
 212 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 213 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 214 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 215 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 216
 217 #endif
 218
 219 #if !defined __clz_tab && !defined UHWtype
 220 /* Without this seemingly useless conditional, gcc -Wunused-macros
 221    warns that each of the two tested macros is unused on Fedora 18.
 222    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 223 #endif
 224
 225 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 226 #define MAX_NFACTS 26
 227
 228 enum
 229 {
 230   DEV_DEBUG_OPTION = CHAR_MAX + 1
 231 };
 232
 233 static struct option const long_options[] =
 234 {
 235   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 236   {GETOPT_HELP_OPTION_DECL},
 237   {GETOPT_VERSION_OPTION_DECL},
 238   {NULL, 0, NULL, 0}
 239 };
 240
 241 struct factors
 242 {
 243   uintmax_t     plarge[2]; /* Can have a single large factor */
 244   uintmax_t     p[MAX_NFACTS];
 245   unsigned char e[MAX_NFACTS];
 246   unsigned char nfactors;
 247 };
 248
 249 #if HAVE_GMP
 250 struct mp_factors
 251 {
 252   mpz_t             *p;
 253   unsigned long int *e;
 254   unsigned long int nfactors;
 255 };
 256 #endif
 257
 258 static void factor (uintmax_t, uintmax_t, struct factors *);
 259
 260 #ifndef umul_ppmm
 261 # define umul_ppmm(w1, w0, u, v)                                        \
 262   do {                                                                  \
 263     uintmax_t __x0, __x1, __x2, __x3;                                   \
 264     unsigned long int __ul, __vl, __uh, __vh;                           \
 265     uintmax_t __u = (u), __v = (v);                                     \
 266                                                                         \
 267     __ul = __ll_lowpart (__u);                                          \
 268     __uh = __ll_highpart (__u);                                         \
 269     __vl = __ll_lowpart (__v);                                          \
 270     __vh = __ll_highpart (__v);                                         \
 271                                                                         \
 272     __x0 = (uintmax_t) __ul * __vl;                                     \
 273     __x1 = (uintmax_t) __ul * __vh;                                     \
 274     __x2 = (uintmax_t) __uh * __vl;                                     \
 275     __x3 = (uintmax_t) __uh * __vh;                                     \
 276                                                                         \
 277     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 278     __x1 += __x2;               /* but this indeed can */               \
 279     if (__x1 < __x2)            /* did we get it? */                    \
 280       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 281                                                                         \
 282     (w1) = __x3 + __ll_highpart (__x1);                                 \
 283     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 284   } while (0)
 285 #endif
 286
 287 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 288 /* Define our own, not needing normalization. This function is
 289    currently not performance critical, so keep it simple. Similar to
 290    the mod macro below. */
 291 # undef udiv_qrnnd
 292 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 293   do {                                                                  \
 294     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 295                                                                         \
 296     assert ((n1) < (d));                                                \
 297     __d1 = (d); __d0 = 0;                                               \
 298     __r1 = (n1); __r0 = (n0);                                           \
 299     __q = 0;                                                            \
 300     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 301       {                                                                 \
 302         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 303         __q <<= 1;                                                      \
 304         if (ge2 (__r1, __r0, __d1, __d0))                               \
 305           {                                                             \
 306             __q++;                                                      \
 307             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 308           }                                                             \
 309       }                                                                 \
 310     (r) = __r0;                                                         \
 311     (q) = __q;                                                          \
 312   } while (0)
 313 #endif
 314
 315 #if !defined add_ssaaaa
 316 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 317   do {                                                                  \
 318     uintmax_t _add_x;                                                   \
 319     _add_x = (al) + (bl);                                               \
 320     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 321     (sl) = _add_x;                                                      \
 322   } while (0)
 323 #endif
 324
 325 #define rsh2(rh, rl, ah, al, cnt)                                       \
 326   do {                                                                  \
 327     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 328     (rh) = (ah) >> (cnt);                                               \
 329   } while (0)
 330
 331 #define lsh2(rh, rl, ah, al, cnt)                                       \
 332   do {                                                                  \
 333     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 334     (rl) = (al) << (cnt);                                               \
 335   } while (0)
 336
 337 #define ge2(ah, al, bh, bl)                                             \
 338   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 339
 340 #define gt2(ah, al, bh, bl)                                             \
 341   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 342
 343 #ifndef sub_ddmmss
 344 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 345   do {                                                                  \
 346     uintmax_t _cy;                                                      \
 347     _cy = (al) < (bl);                                                  \
 348     (rl) = (al) - (bl);                                                 \
 349     (rh) = (ah) - (bh) - _cy;                                           \
 350   } while (0)
 351 #endif
 352
 353 #ifndef count_leading_zeros
 354 # define count_leading_zeros(count, x) do {                             \
 355     uintmax_t __clz_x = (x);                                            \
 356     unsigned int __clz_c;                                               \
 357     for (__clz_c = 0;                                                   \
 358          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 359          __clz_c += 8)                                                  \
 360       __clz_x <<= 8;                                                    \
 361     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 362       __clz_x <<= 1;                                                    \
 363     (count) = __clz_c;                                                  \
 364   } while (0)
 365 #endif
 366
 367 #ifndef count_trailing_zeros
 368 # define count_trailing_zeros(count, x) do {                            \
 369     uintmax_t __ctz_x = (x);                                            \
 370     unsigned int __ctz_c = 0;                                           \
 371     while ((__ctz_x & 1) == 0)                                          \
 372       {                                                                 \
 373         __ctz_x >>= 1;                                                  \
 374         __ctz_c++;                                                      \
 375       }                                                                 \
 376     (count) = __ctz_c;                                                  \
 377   } while (0)
 378 #endif
 379
 380 /* Requires that a < n and b <= n */
 381 #define submod(r,a,b,n)                                                 \
 382   do {                                                                  \
 383     uintmax_t _t = - (uintmax_t) (a < b);                               \
 384     (r) = ((n) & _t) + (a) - (b);                                       \
 385   } while (0)
 386
 387 #define addmod(r,a,b,n)                                                 \
 388   submod ((r), (a), ((n) - (b)), (n))
 389
 390 /* Modular two-word addition and subtraction.  For performance reasons, the
 391    most significant bit of n1 must be clear.  The destination variables must be
 392    distinct from the mod operand.  */
 393 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 394   do {                                                                  \
 395     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 396     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 397       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 398   } while (0)
 399 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 400   do {                                                                  \
 401     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 402     if ((intmax_t) (r1) < 0)                                            \
 403       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 404   } while (0)
 405
 406 #define HIGHBIT_TO_MASK(x)                                              \
 407   (((intmax_t)-1 >> 1) < 0                                              \
 408    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 409    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 410       ? UINTMAX_MAX : (uintmax_t) 0))
 411
 412 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 413    Requires that d1 != 0.  */
 414 static uintmax_t
 415 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 416 {
 417   int cntd, cnta;
 418
 419   assert (d1 != 0);
 420
 421   if (a1 == 0)
 422     {
 423       *r1 = 0;
 424       return a0;
 425     }
 426
 427   count_leading_zeros (cntd, d1);
 428   count_leading_zeros (cnta, a1);
 429   int cnt = cntd - cnta;
 430   lsh2 (d1, d0, d1, d0, cnt);
 431   for (int i = 0; i < cnt; i++)
 432     {
 433       if (ge2 (a1, a0, d1, d0))
 434         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 435       rsh2 (d1, d0, d1, d0, 1);
 436     }
 437
 438   *r1 = a1;
 439   return a0;
 440 }
 441
 442 static uintmax_t _GL_ATTRIBUTE_CONST
 443 gcd_odd (uintmax_t a, uintmax_t b)
 444 {
 445   if ( (b & 1) == 0)
 446     {
 447       uintmax_t t = b;
 448       b = a;
 449       a = t;
 450     }
 451   if (a == 0)
 452     return b;
 453
 454   /* Take out least significant one bit, to make room for sign */
 455   b >>= 1;
 456
 457   for (;;)
 458     {
 459       uintmax_t t;
 460       uintmax_t bgta;
 461
 462       while ((a & 1) == 0)
 463         a >>= 1;
 464       a >>= 1;
 465
 466       t = a - b;
 467       if (t == 0)
 468         return (a << 1) + 1;
 469
 470       bgta = HIGHBIT_TO_MASK (t);
 471
 472       /* b <-- min (a, b) */
 473       b += (bgta & t);
 474
 475       /* a <-- |a - b| */
 476       a = (t ^ bgta) - bgta;
 477     }
 478 }
 479
 480 static uintmax_t
 481 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 482 {
 483   while ((a0 & 1) == 0)
 484     rsh2 (a1, a0, a1, a0, 1);
 485   while ((b0 & 1) == 0)
 486     rsh2 (b1, b0, b1, b0, 1);
 487
 488   for (;;)
 489     {
 490       if ((b1 | a1) == 0)
 491         {
 492           *r1 = 0;
 493           return gcd_odd (b0, a0);
 494         }
 495
 496       if (gt2 (a1, a0, b1, b0))
 497         {
 498           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 499           do
 500             rsh2 (a1, a0, a1, a0, 1);
 501           while ((a0 & 1) == 0);
 502         }
 503       else if (gt2 (b1, b0, a1, a0))
 504         {
 505           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 506           do
 507             rsh2 (b1, b0, b1, b0, 1);
 508           while ((b0 & 1) == 0);
 509         }
 510       else
 511         break;
 512     }
 513
 514   *r1 = a1;
 515   return a0;
 516 }
 517
 518 static void
 519 factor_insert_multiplicity (struct factors *factors,
 520                             uintmax_t prime, unsigned int m)
 521 {
 522   unsigned int nfactors = factors->nfactors;
 523   uintmax_t *p = factors->p;
 524   unsigned char *e = factors->e;
 525
 526   /* Locate position for insert new or increment e.  */
 527   int i;
 528   for (i = nfactors - 1; i >= 0; i--)
 529     {
 530       if (p[i] <= prime)
 531         break;
 532     }
 533
 534   if (i < 0 || p[i] != prime)
 535     {
 536       for (int j = nfactors - 1; j > i; j--)
 537         {
 538           p[j + 1] = p[j];
 539           e[j + 1] = e[j];
 540         }
 541       p[i + 1] = prime;
 542       e[i + 1] = m;
 543       factors->nfactors = nfactors + 1;
 544     }
 545   else
 546     {
 547       e[i] += m;
 548     }
 549 }
 550
 551 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 552
 553 static void
 554 factor_insert_large (struct factors *factors,
 555                      uintmax_t p1, uintmax_t p0)
 556 {
 557   if (p1 > 0)
 558     {
 559       assert (factors->plarge[1] == 0);
 560       factors->plarge[0] = p0;
 561       factors->plarge[1] = p1;
 562     }
 563   else
 564     factor_insert (factors, p0);
 565 }
 566
 567 #if HAVE_GMP
 568
 569 # if !HAVE_DECL_MPZ_INITS
 570
 571 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 572 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 573
 574 static void
 575 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 576 {
 577   va_list ap;
 578
 579   va_start (ap, mpz_single_init);
 580
 581   mpz_t *mpz;
 582   while ((mpz = va_arg (ap, mpz_t *)))
 583     mpz_single_init (*mpz);
 584
 585   va_end (ap);
 586 }
 587 # endif
 588
 589 static void mp_factor (mpz_t, struct mp_factors *);
 590
 591 static void
 592 mp_factor_init (struct mp_factors *factors)
 593 {
 594   factors->p = NULL;
 595   factors->e = NULL;
 596   factors->nfactors = 0;
 597 }
 598
 599 static void
 600 mp_factor_clear (struct mp_factors *factors)
 601 {
 602   for (unsigned int i = 0; i < factors->nfactors; i++)
 603     mpz_clear (factors->p[i]);
 604
 605   free (factors->p);
 606   free (factors->e);
 607 }
 608
 609 static void
 610 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 611 {
 612   unsigned long int nfactors = factors->nfactors;
 613   mpz_t         *p  = factors->p;
 614   unsigned long int *e  = factors->e;
 615   long i;
 616
 617   /* Locate position for insert new or increment e.  */
 618   for (i = nfactors - 1; i >= 0; i--)
 619     {
 620       if (mpz_cmp (p[i], prime) <= 0)
 621         break;
 622     }
 623
 624   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 625     {
 626       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 627       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 628
 629       mpz_init (p[nfactors]);
 630       for (long j = nfactors - 1; j > i; j--)
 631         {
 632           mpz_set (p[j + 1], p[j]);
 633           e[j + 1] = e[j];
 634         }
 635       mpz_set (p[i + 1], prime);
 636       e[i + 1] = 1;
 637
 638       factors->p = p;
 639       factors->e = e;
 640       factors->nfactors = nfactors + 1;
 641     }
 642   else
 643     {
 644       e[i] += 1;
 645     }
 646 }
 647
 648 static void
 649 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 650 {
 651   mpz_t pz;
 652
 653   mpz_init_set_ui (pz, prime);
 654   mp_factor_insert (factors, pz);
 655   mpz_clear (pz);
 656 }
 657 #endif /* HAVE_GMP */
 658
 659
 660 /* Number of bits in an uintmax_t.  */
 661 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 662
 663 /* Verify that uintmax_t does not have holes in its representation.  */
 664 verify (UINTMAX_MAX >> (W - 1) == 1);
 665
 666 #define P(a,b,c,d) a,
 667 static const unsigned char primes_diff[] = {
 668 #include "primes.h"
 669 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 670 };
 671 #undef P
 672
 673 #define PRIMES_PTAB_ENTRIES \
 674   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 675
 676 #define P(a,b,c,d) b,
 677 static const unsigned char primes_diff8[] = {
 678 #include "primes.h"
 679 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 680 };
 681 #undef P
 682
 683 struct primes_dtab
 684 {
 685   uintmax_t binv, lim;
 686 };
 687
 688 #define P(a,b,c,d) {c,d},
 689 static const struct primes_dtab primes_dtab[] = {
 690 #include "primes.h"
 691 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 692 };
 693 #undef P
 694
 695 /* Verify that uintmax_t is not wider than
 696    the integers used to generate primes.h.  */
 697 verify (W <= WIDE_UINT_BITS);
 698
 699 /* debugging for developers.  Enables devmsg().
 700    This flag is used only in the GMP code.  */
 701 static bool dev_debug = false;
 702
 703 /* Prove primality or run probabilistic tests.  */
 704 static bool flag_prove_primality = PROVE_PRIMALITY;
 705
 706 /* Number of Miller-Rabin tests to run when not proving primality. */
 707 #define MR_REPS 25
 708
 709 static void
 710 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 711                       unsigned int off)
 712 {
 713   for (unsigned int j = 0; j < off; j++)
 714     p += primes_diff[i + j];
 715   factor_insert (factors, p);
 716 }
 717
 718 /* Trial division with odd primes uses the following trick.
 719
 720    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 721    consider the case t < B (this is the second loop below).
 722
 723    From our tables we get
 724
 725      binv = p^{-1} (mod B)
 726      lim = floor ( (B-1) / p ).
 727
 728    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 729    (and all quotients in this range occur for some t).
 730
 731    Then t = q * p is true also (mod B), and p is invertible we get
 732
 733      q = t * binv (mod B).
 734
 735    Next, assume that t is *not* divisible by p. Since multiplication
 736    by binv (mod B) is a one-to-one mapping,
 737
 738      t * binv (mod B) > lim,
 739
 740    because all the smaller values are already taken.
 741
 742    This can be summed up by saying that the function
 743
 744      q(t) = binv * t (mod B)
 745
 746    is a permutation of the range 0 <= t < B, with the curious property
 747    that it maps the multiples of p onto the range 0 <= q <= lim, in
 748    order, and the non-multiples of p onto the range lim < q < B.
 749  */
 750
 751 static uintmax_t
 752 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 753                        struct factors *factors)
 754 {
 755   if (t0 % 2 == 0)
 756     {
 757       unsigned int cnt;
 758
 759       if (t0 == 0)
 760         {
 761           count_trailing_zeros (cnt, t1);
 762           t0 = t1 >> cnt;
 763           t1 = 0;
 764           cnt += W_TYPE_SIZE;
 765         }
 766       else
 767         {
 768           count_trailing_zeros (cnt, t0);
 769           rsh2 (t1, t0, t1, t0, cnt);
 770         }
 771
 772       factor_insert_multiplicity (factors, 2, cnt);
 773     }
 774
 775   uintmax_t p = 3;
 776   unsigned int i;
 777   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 778     {
 779       for (;;)
 780         {
 781           uintmax_t q1, q0, hi, lo _GL_UNUSED;
 782
 783           q0 = t0 * primes_dtab[i].binv;
 784           umul_ppmm (hi, lo, q0, p);
 785           if (hi > t1)
 786             break;
 787           hi = t1 - hi;
 788           q1 = hi * primes_dtab[i].binv;
 789           if (LIKELY (q1 > primes_dtab[i].lim))
 790             break;
 791           t1 = q1; t0 = q0;
 792           factor_insert (factors, p);
 793         }
 794       p += primes_diff[i + 1];
 795     }
 796   if (t1p)
 797     *t1p = t1;
 798
 799 #define DIVBLOCK(I)                                                     \
 800   do {                                                                  \
 801     for (;;)                                                            \
 802       {                                                                 \
 803         q = t0 * pd[I].binv;                                            \
 804         if (LIKELY (q > pd[I].lim))                                     \
 805           break;                                                        \
 806         t0 = q;                                                         \
 807         factor_insert_refind (factors, p, i + 1, I);                    \
 808       }                                                                 \
 809   } while (0)
 810
 811   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 812     {
 813       uintmax_t q;
 814       const struct primes_dtab *pd = &primes_dtab[i];
 815       DIVBLOCK (0);
 816       DIVBLOCK (1);
 817       DIVBLOCK (2);
 818       DIVBLOCK (3);
 819       DIVBLOCK (4);
 820       DIVBLOCK (5);
 821       DIVBLOCK (6);
 822       DIVBLOCK (7);
 823
 824       p += primes_diff8[i];
 825       if (p * p > t0)
 826         break;
 827     }
 828
 829   return t0;
 830 }
 831
 832 #if HAVE_GMP
 833 static void
 834 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 835 {
 836   mpz_t q;
 837   unsigned long int p;
 838
 839   devmsg ("[trial division] ");
 840
 841   mpz_init (q);
 842
 843   p = mpz_scan1 (t, 0);
 844   mpz_div_2exp (t, t, p);
 845   while (p)
 846     {
 847       mp_factor_insert_ui (factors, 2);
 848       --p;
 849     }
 850
 851   p = 3;
 852   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 853     {
 854       if (! mpz_divisible_ui_p (t, p))
 855         {
 856           p += primes_diff[i++];
 857           if (mpz_cmp_ui (t, p * p) < 0)
 858             break;
 859         }
 860       else
 861         {
 862           mpz_tdiv_q_ui (t, t, p);
 863           mp_factor_insert_ui (factors, p);
 864         }
 865     }
 866
 867   mpz_clear (q);
 868 }
 869 #endif
 870
 871 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 872 static const unsigned char  binvert_table[128] =
 873 {
 874   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 875   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 876   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 877   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 878   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 879   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 880   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 881   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 882   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 883   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 884   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 885   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 886   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 887   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 888   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 889   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 890 };
 891
 892 /* Compute n^(-1) mod B, using a Newton iteration.  */
 893 #define binv(inv,n)                                                     \
 894   do {                                                                  \
 895     uintmax_t  __n = (n);                                               \
 896     uintmax_t  __inv;                                                   \
 897                                                                         \
 898     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 899     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 900     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 901     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 902                                                                         \
 903     if (W_TYPE_SIZE > 64)                                               \
 904       {                                                                 \
 905         int  __invbits = 64;                                            \
 906         do {                                                            \
 907           __inv = 2 * __inv - __inv * __inv * __n;                      \
 908           __invbits *= 2;                                               \
 909         } while (__invbits < W_TYPE_SIZE);                              \
 910       }                                                                 \
 911                                                                         \
 912     (inv) = __inv;                                                      \
 913   } while (0)
 914
 915 /* q = u / d, assuming d|u.  */
 916 #define divexact_21(q1, q0, u1, u0, d)                                  \
 917   do {                                                                  \
 918     uintmax_t _di, _q0;                                                 \
 919     binv (_di, (d));                                                    \
 920     _q0 = (u0) * _di;                                                   \
 921     if ((u1) >= (d))                                                    \
 922       {                                                                 \
 923         uintmax_t _p1, _p0 _GL_UNUSED;                            \
 924         umul_ppmm (_p1, _p0, _q0, d);                                   \
 925         (q1) = ((u1) - _p1) * _di;                                      \
 926         (q0) = _q0;                                                     \
 927       }                                                                 \
 928     else                                                                \
 929       {                                                                 \
 930         (q0) = _q0;                                                     \
 931         (q1) = 0;                                                       \
 932       }                                                                 \
 933   } while (0)
 934
 935 /* x B (mod n). */
 936 #define redcify(r_prim, r, n)                                           \
 937   do {                                                                  \
 938     uintmax_t _redcify_q _GL_UNUSED;                              \
 939     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 940   } while (0)
 941
 942 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 943 #define redcify2(r1, r0, x, n1, n0)                                     \
 944   do {                                                                  \
 945     uintmax_t _r1, _r0, _i;                                             \
 946     if ((x) < (n1))                                                     \
 947       {                                                                 \
 948         _r1 = (x); _r0 = 0;                                             \
 949         _i = W_TYPE_SIZE;                                               \
 950       }                                                                 \
 951     else                                                                \
 952       {                                                                 \
 953         _r1 = 0; _r0 = (x);                                             \
 954         _i = 2*W_TYPE_SIZE;                                             \
 955       }                                                                 \
 956     while (_i-- > 0)                                                    \
 957       {                                                                 \
 958         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 959         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 960           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 961       }                                                                 \
 962     (r1) = _r1;                                                         \
 963     (r0) = _r0;                                                         \
 964   } while (0)
 965
 966 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 967    Both a and b must be in redc form, the result will be in redc form too. */
 968 static inline uintmax_t
 969 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 970 {
 971   uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
 972
 973   umul_ppmm (rh, rl, a, b);
 974   q = rl * mi;
 975   umul_ppmm (th, tl, q, m);
 976   xh = rh - th;
 977   if (rh < th)
 978     xh += m;
 979
 980   return xh;
 981 }
 982
 983 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 984    Both a and b must be in redc form, the result will be in redc form too.
 985    For performance reasons, the most significant bit of m must be clear. */
 986 static uintmax_t
 987 mulredc2 (uintmax_t *r1p,
 988           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 989           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 990 {
 991   uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
 992   mi = -mi;
 993   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
 994   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
 995   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
 996
 997   /* First compute a0 * <b1, b0> B^{-1}
 998         +-----+
 999         |a0 b0|
1000      +--+--+--+
1001      |a0 b1|
1002      +--+--+--+
1003         |q0 m0|
1004      +--+--+--+
1005      |q0 m1|
1006     -+--+--+--+
1007      |r1|r0| 0|
1008      +--+--+--+
1009   */
1010   umul_ppmm (t1, t0, a0, b0);
1011   umul_ppmm (r1, r0, a0, b1);
1012   q = mi * t0;
1013   umul_ppmm (p1, p0, q, m0);
1014   umul_ppmm (s1, s0, q, m1);
1015   r0 += (t0 != 0); /* Carry */
1016   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1017   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1018   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1019
1020   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1021         +-----+
1022         |a1 b0|
1023         +--+--+
1024         |r1|r0|
1025      +--+--+--+
1026      |a1 b1|
1027      +--+--+--+
1028         |q1 m0|
1029      +--+--+--+
1030      |q1 m1|
1031     -+--+--+--+
1032      |r1|r0| 0|
1033      +--+--+--+
1034   */
1035   umul_ppmm (t1, t0, a1, b0);
1036   umul_ppmm (s1, s0, a1, b1);
1037   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1038   q = mi * t0;
1039   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1040   umul_ppmm (p1, p0, q, m0);
1041   umul_ppmm (s1, s0, q, m1);
1042   r0 += (t0 != 0); /* Carry */
1043   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1044   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1045   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1046
1047   if (ge2 (r1, r0, m1, m0))
1048     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1049
1050   *r1p = r1;
1051   return r0;
1052 }
1053
1054 static uintmax_t _GL_ATTRIBUTE_CONST
1055 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1056 {
1057   uintmax_t y = one;
1058
1059   if (e & 1)
1060     y = b;
1061
1062   while (e != 0)
1063     {
1064       b = mulredc (b, b, n, ni);
1065       e >>= 1;
1066
1067       if (e & 1)
1068         y = mulredc (y, b, n, ni);
1069     }
1070
1071   return y;
1072 }
1073
1074 static uintmax_t
1075 powm2 (uintmax_t *r1m,
1076        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1077        uintmax_t ni, const uintmax_t *one)
1078 {
1079   uintmax_t r1, r0, b1, b0, n1, n0;
1080   unsigned int i;
1081   uintmax_t e;
1082
1083   b0 = bp[0];
1084   b1 = bp[1];
1085   n0 = np[0];
1086   n1 = np[1];
1087
1088   r0 = one[0];
1089   r1 = one[1];
1090
1091   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1092     {
1093       if (e & 1)
1094         {
1095           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1096           r1 = *r1m;
1097         }
1098       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1099       b1 = *r1m;
1100     }
1101   for (e = ep[1]; e > 0; e >>= 1)
1102     {
1103       if (e & 1)
1104         {
1105           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1106           r1 = *r1m;
1107         }
1108       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1109       b1 = *r1m;
1110     }
1111   *r1m = r1;
1112   return r0;
1113 }
1114
1115 static bool _GL_ATTRIBUTE_CONST
1116 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1117              unsigned int k, uintmax_t one)
1118 {
1119   uintmax_t y = powm (b, q, n, ni, one);
1120
1121   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1122
1123   if (y == one || y == nm1)
1124     return true;
1125
1126   for (unsigned int i = 1; i < k; i++)
1127     {
1128       y = mulredc (y, y, n, ni);
1129
1130       if (y == nm1)
1131         return true;
1132       if (y == one)
1133         return false;
1134     }
1135   return false;
1136 }
1137
1138 static bool
1139 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1140               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1141 {
1142   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1143
1144   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1145   y1 = r1m;
1146
1147   if (y0 == one[0] && y1 == one[1])
1148     return true;
1149
1150   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1151
1152   if (y0 == nm1_0 && y1 == nm1_1)
1153     return true;
1154
1155   for (unsigned int i = 1; i < k; i++)
1156     {
1157       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1158       y1 = r1m;
1159
1160       if (y0 == nm1_0 && y1 == nm1_1)
1161         return true;
1162       if (y0 == one[0] && y1 == one[1])
1163         return false;
1164     }
1165   return false;
1166 }
1167
1168 #if HAVE_GMP
1169 static bool
1170 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1171                 mpz_srcptr q, unsigned long int k)
1172 {
1173   mpz_powm (y, x, q, n);
1174
1175   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1176     return true;
1177
1178   for (unsigned long int i = 1; i < k; i++)
1179     {
1180       mpz_powm_ui (y, y, 2, n);
1181       if (mpz_cmp (y, nm1) == 0)
1182         return true;
1183       if (mpz_cmp_ui (y, 1) == 0)
1184         return false;
1185     }
1186   return false;
1187 }
1188 #endif
1189
1190 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1191    have been observed.  The average seem to be about 2.  */
1192 static bool
1193 prime_p (uintmax_t n)
1194 {
1195   int k;
1196   bool is_prime;
1197   uintmax_t a_prim, one, ni;
1198   struct factors factors;
1199
1200   if (n <= 1)
1201     return false;
1202
1203   /* We have already casted out small primes. */
1204   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1205     return true;
1206
1207   /* Precomputation for Miller-Rabin.  */
1208   uintmax_t q = n - 1;
1209   for (k = 0; (q & 1) == 0; k++)
1210     q >>= 1;
1211
1212   uintmax_t a = 2;
1213   binv (ni, n);                 /* ni <- 1/n mod B */
1214   redcify (one, 1, n);
1215   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1216
1217   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1218   if (!millerrabin (n, ni, a_prim, q, k, one))
1219     return false;
1220
1221   if (flag_prove_primality)
1222     {
1223       /* Factor n-1 for Lucas.  */
1224       factor (0, n - 1, &factors);
1225     }
1226
1227   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1228      number composite.  */
1229   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1230     {
1231       if (flag_prove_primality)
1232         {
1233           is_prime = true;
1234           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1235             {
1236               is_prime
1237                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1238             }
1239         }
1240       else
1241         {
1242           /* After enough Miller-Rabin runs, be content. */
1243           is_prime = (r == MR_REPS - 1);
1244         }
1245
1246       if (is_prime)
1247         return true;
1248
1249       a += primes_diff[r];      /* Establish new base.  */
1250
1251       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1252          on most processors, since it avoids udiv_qrnnd.  If we go down the
1253          udiv_qrnnd_preinv path, this code should be replaced.  */
1254       {
1255         uintmax_t s1, s0;
1256         umul_ppmm (s1, s0, one, a);
1257         if (LIKELY (s1 == 0))
1258           a_prim = s0 % n;
1259         else
1260           {
1261             uintmax_t dummy _GL_UNUSED;
1262             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1263           }
1264       }
1265
1266       if (!millerrabin (n, ni, a_prim, q, k, one))
1267         return false;
1268     }
1269
1270   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1271   abort ();
1272 }
1273
1274 static bool
1275 prime2_p (uintmax_t n1, uintmax_t n0)
1276 {
1277   uintmax_t q[2], nm1[2];
1278   uintmax_t a_prim[2];
1279   uintmax_t one[2];
1280   uintmax_t na[2];
1281   uintmax_t ni;
1282   unsigned int k;
1283   struct factors factors;
1284
1285   if (n1 == 0)
1286     return prime_p (n0);
1287
1288   nm1[1] = n1 - (n0 == 0);
1289   nm1[0] = n0 - 1;
1290   if (nm1[0] == 0)
1291     {
1292       count_trailing_zeros (k, nm1[1]);
1293
1294       q[0] = nm1[1] >> k;
1295       q[1] = 0;
1296       k += W_TYPE_SIZE;
1297     }
1298   else
1299     {
1300       count_trailing_zeros (k, nm1[0]);
1301       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1302     }
1303
1304   uintmax_t a = 2;
1305   binv (ni, n0);
1306   redcify2 (one[1], one[0], 1, n1, n0);
1307   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1308
1309   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1310   na[0] = n0;
1311   na[1] = n1;
1312
1313   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1314     return false;
1315
1316   if (flag_prove_primality)
1317     {
1318       /* Factor n-1 for Lucas.  */
1319       factor (nm1[1], nm1[0], &factors);
1320     }
1321
1322   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323      number composite.  */
1324   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1325     {
1326       bool is_prime;
1327       uintmax_t e[2], y[2];
1328
1329       if (flag_prove_primality)
1330         {
1331           is_prime = true;
1332           if (factors.plarge[1])
1333             {
1334               uintmax_t pi;
1335               binv (pi, factors.plarge[0]);
1336               e[0] = pi * nm1[0];
1337               e[1] = 0;
1338               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1339               is_prime = (y[0] != one[0] || y[1] != one[1]);
1340             }
1341           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1342             {
1343               /* FIXME: We always have the factor 2. Do we really need to
1344                  handle it here? We have done the same powering as part
1345                  of millerrabin. */
1346               if (factors.p[i] == 2)
1347                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1348               else
1349                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1350               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1351               is_prime = (y[0] != one[0] || y[1] != one[1]);
1352             }
1353         }
1354       else
1355         {
1356           /* After enough Miller-Rabin runs, be content. */
1357           is_prime = (r == MR_REPS - 1);
1358         }
1359
1360       if (is_prime)
1361         return true;
1362
1363       a += primes_diff[r];      /* Establish new base.  */
1364       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1365
1366       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1367         return false;
1368     }
1369
1370   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1371   abort ();
1372 }
1373
1374 #if HAVE_GMP
1375 static bool
1376 mp_prime_p (mpz_t n)
1377 {
1378   bool is_prime;
1379   mpz_t q, a, nm1, tmp;
1380   struct mp_factors factors;
1381
1382   if (mpz_cmp_ui (n, 1) <= 0)
1383     return false;
1384
1385   /* We have already casted out small primes. */
1386   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1387     return true;
1388
1389   mpz_inits (q, a, nm1, tmp, NULL);
1390
1391   /* Precomputation for Miller-Rabin.  */
1392   mpz_sub_ui (nm1, n, 1);
1393
1394   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1395   unsigned long int k = mpz_scan1 (nm1, 0);
1396   mpz_tdiv_q_2exp (q, nm1, k);
1397
1398   mpz_set_ui (a, 2);
1399
1400   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1401   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1402     {
1403       is_prime = false;
1404       goto ret2;
1405     }
1406
1407   if (flag_prove_primality)
1408     {
1409       /* Factor n-1 for Lucas.  */
1410       mpz_set (tmp, nm1);
1411       mp_factor (tmp, &factors);
1412     }
1413
1414   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1415      number composite.  */
1416   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1417     {
1418       if (flag_prove_primality)
1419         {
1420           is_prime = true;
1421           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1422             {
1423               mpz_divexact (tmp, nm1, factors.p[i]);
1424               mpz_powm (tmp, a, tmp, n);
1425               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1426             }
1427         }
1428       else
1429         {
1430           /* After enough Miller-Rabin runs, be content. */
1431           is_prime = (r == MR_REPS - 1);
1432         }
1433
1434       if (is_prime)
1435         goto ret1;
1436
1437       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1438
1439       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1440         {
1441           is_prime = false;
1442           goto ret1;
1443         }
1444     }
1445
1446   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1447   abort ();
1448
1449  ret1:
1450   if (flag_prove_primality)
1451     mp_factor_clear (&factors);
1452  ret2:
1453   mpz_clears (q, a, nm1, tmp, NULL);
1454
1455   return is_prime;
1456 }
1457 #endif
1458
1459 static void
1460 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1461                           struct factors *factors)
1462 {
1463   uintmax_t x, z, y, P, t, ni, g;
1464
1465   unsigned long int k = 1;
1466   unsigned long int l = 1;
1467
1468   redcify (P, 1, n);
1469   addmod (x, P, P, n);          /* i.e., redcify(2) */
1470   y = z = x;
1471
1472   while (n != 1)
1473     {
1474       assert (a < n);
1475
1476       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1477
1478       for (;;)
1479         {
1480           do
1481             {
1482               x = mulredc (x, x, n, ni);
1483               addmod (x, x, a, n);
1484
1485               submod (t, z, x, n);
1486               P = mulredc (P, t, n, ni);
1487
1488               if (k % 32 == 1)
1489                 {
1490                   if (gcd_odd (P, n) != 1)
1491                     goto factor_found;
1492                   y = x;
1493                 }
1494             }
1495           while (--k != 0);
1496
1497           z = x;
1498           k = l;
1499           l = 2 * l;
1500           for (unsigned long int i = 0; i < k; i++)
1501             {
1502               x = mulredc (x, x, n, ni);
1503               addmod (x, x, a, n);
1504             }
1505           y = x;
1506         }
1507
1508     factor_found:
1509       do
1510         {
1511           y = mulredc (y, y, n, ni);
1512           addmod (y, y, a, n);
1513
1514           submod (t, z, y, n);
1515           g = gcd_odd (t, n);
1516         }
1517       while (g == 1);
1518
1519       n = n / g;
1520
1521       if (!prime_p (g))
1522         factor_using_pollard_rho (g, a + 1, factors);
1523       else
1524         factor_insert (factors, g);
1525
1526       if (prime_p (n))
1527         {
1528           factor_insert (factors, n);
1529           break;
1530         }
1531
1532       x = x % n;
1533       z = z % n;
1534       y = y % n;
1535     }
1536 }
1537
1538 static void
1539 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1540                            struct factors *factors)
1541 {
1542   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1543
1544   unsigned long int k = 1;
1545   unsigned long int l = 1;
1546
1547   redcify2 (P1, P0, 1, n1, n0);
1548   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1549   y1 = z1 = x1;
1550   y0 = z0 = x0;
1551
1552   while (n1 != 0 || n0 != 1)
1553     {
1554       binv (ni, n0);
1555
1556       for (;;)
1557         {
1558           do
1559             {
1560               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1561               x1 = r1m;
1562               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1563
1564               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1565               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1566               P1 = r1m;
1567
1568               if (k % 32 == 1)
1569                 {
1570                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1571                   if (g1 != 0 || g0 != 1)
1572                     goto factor_found;
1573                   y1 = x1; y0 = x0;
1574                 }
1575             }
1576           while (--k != 0);
1577
1578           z1 = x1; z0 = x0;
1579           k = l;
1580           l = 2 * l;
1581           for (unsigned long int i = 0; i < k; i++)
1582             {
1583               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1584               x1 = r1m;
1585               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1586             }
1587           y1 = x1; y0 = x0;
1588         }
1589
1590     factor_found:
1591       do
1592         {
1593           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1594           y1 = r1m;
1595           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1596
1597           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1598           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1599         }
1600       while (g1 == 0 && g0 == 1);
1601
1602       if (g1 == 0)
1603         {
1604           /* The found factor is one word. */
1605           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1606
1607           if (!prime_p (g0))
1608             factor_using_pollard_rho (g0, a + 1, factors);
1609           else
1610             factor_insert (factors, g0);
1611         }
1612       else
1613         {
1614           /* The found factor is two words.  This is highly unlikely, thus hard
1615              to trigger.  Please be careful before you change this code!  */
1616           uintmax_t ginv;
1617
1618           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1619           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1620           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1621
1622           if (!prime2_p (g1, g0))
1623             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1624           else
1625             factor_insert_large (factors, g1, g0);
1626         }
1627
1628       if (n1 == 0)
1629         {
1630           if (prime_p (n0))
1631             {
1632               factor_insert (factors, n0);
1633               break;
1634             }
1635
1636           factor_using_pollard_rho (n0, a, factors);
1637           return;
1638         }
1639
1640       if (prime2_p (n1, n0))
1641         {
1642           factor_insert_large (factors, n1, n0);
1643           break;
1644         }
1645
1646       x0 = mod2 (&x1, x1, x0, n1, n0);
1647       z0 = mod2 (&z1, z1, z0, n1, n0);
1648       y0 = mod2 (&y1, y1, y0, n1, n0);
1649     }
1650 }
1651
1652 #if HAVE_GMP
1653 static void
1654 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1655                              struct mp_factors *factors)
1656 {
1657   mpz_t x, z, y, P;
1658   mpz_t t, t2;
1659
1660   devmsg ("[pollard-rho (%lu)] ", a);
1661
1662   mpz_inits (t, t2, NULL);
1663   mpz_init_set_si (y, 2);
1664   mpz_init_set_si (x, 2);
1665   mpz_init_set_si (z, 2);
1666   mpz_init_set_ui (P, 1);
1667
1668   unsigned long long int k = 1;
1669   unsigned long long int l = 1;
1670
1671   while (mpz_cmp_ui (n, 1) != 0)
1672     {
1673       for (;;)
1674         {
1675           do
1676             {
1677               mpz_mul (t, x, x);
1678               mpz_mod (x, t, n);
1679               mpz_add_ui (x, x, a);
1680
1681               mpz_sub (t, z, x);
1682               mpz_mul (t2, P, t);
1683               mpz_mod (P, t2, n);
1684
1685               if (k % 32 == 1)
1686                 {
1687                   mpz_gcd (t, P, n);
1688                   if (mpz_cmp_ui (t, 1) != 0)
1689                     goto factor_found;
1690                   mpz_set (y, x);
1691                 }
1692             }
1693           while (--k != 0);
1694
1695           mpz_set (z, x);
1696           k = l;
1697           l = 2 * l;
1698           for (unsigned long long int i = 0; i < k; i++)
1699             {
1700               mpz_mul (t, x, x);
1701               mpz_mod (x, t, n);
1702               mpz_add_ui (x, x, a);
1703             }
1704           mpz_set (y, x);
1705         }
1706
1707     factor_found:
1708       do
1709         {
1710           mpz_mul (t, y, y);
1711           mpz_mod (y, t, n);
1712           mpz_add_ui (y, y, a);
1713
1714           mpz_sub (t, z, y);
1715           mpz_gcd (t, t, n);
1716         }
1717       while (mpz_cmp_ui (t, 1) == 0);
1718
1719       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1720
1721       if (!mp_prime_p (t))
1722         {
1723           devmsg ("[composite factor--restarting pollard-rho] ");
1724           mp_factor_using_pollard_rho (t, a + 1, factors);
1725         }
1726       else
1727         {
1728           mp_factor_insert (factors, t);
1729         }
1730
1731       if (mp_prime_p (n))
1732         {
1733           mp_factor_insert (factors, n);
1734           break;
1735         }
1736
1737       mpz_mod (x, x, n);
1738       mpz_mod (z, z, n);
1739       mpz_mod (y, y, n);
1740     }
1741
1742   mpz_clears (P, t2, t, z, x, y, NULL);
1743 }
1744 #endif
1745
1746 #if USE_SQUFOF
1747 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1748    algorithm is replaced, consider also returning the remainder. */
1749 static uintmax_t _GL_ATTRIBUTE_CONST
1750 isqrt (uintmax_t n)
1751 {
1752   uintmax_t x;
1753   unsigned c;
1754   if (n == 0)
1755     return 0;
1756
1757   count_leading_zeros (c, n);
1758
1759   /* Make x > sqrt(n). This will be invariant through the loop. */
1760   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1761
1762   for (;;)
1763     {
1764       uintmax_t y = (x + n/x) / 2;
1765       if (y >= x)
1766         return x;
1767
1768       x = y;
1769     }
1770 }
1771
1772 static uintmax_t _GL_ATTRIBUTE_CONST
1773 isqrt2 (uintmax_t nh, uintmax_t nl)
1774 {
1775   unsigned int shift;
1776   uintmax_t x;
1777
1778   /* Ensures the remainder fits in an uintmax_t. */
1779   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1780
1781   if (nh == 0)
1782     return isqrt (nl);
1783
1784   count_leading_zeros (shift, nh);
1785   shift &= ~1;
1786
1787   /* Make x > sqrt(n) */
1788   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1789   x <<= (W_TYPE_SIZE - shift) / 2;
1790
1791   /* Do we need more than one iteration? */
1792   for (;;)
1793     {
1794       uintmax_t r _GL_UNUSED;
1795       uintmax_t q, y;
1796       udiv_qrnnd (q, r, nh, nl, x);
1797       y = (x + q) / 2;
1798
1799       if (y >= x)
1800         {
1801           uintmax_t hi, lo;
1802           umul_ppmm (hi, lo, x + 1, x + 1);
1803           assert (gt2 (hi, lo, nh, nl));
1804
1805           umul_ppmm (hi, lo, x, x);
1806           assert (ge2 (nh, nl, hi, lo));
1807           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1808           assert (hi == 0);
1809
1810           return x;
1811         }
1812
1813       x = y;
1814     }
1815 }
1816
1817 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1818 # define MAGIC64 0x0202021202030213ULL
1819 # define MAGIC63 0x0402483012450293ULL
1820 # define MAGIC65 0x218a019866014613ULL
1821 # define MAGIC11 0x23b
1822
1823 /* Return the square root if the input is a square, otherwise 0. */
1824 static uintmax_t _GL_ATTRIBUTE_CONST
1825 is_square (uintmax_t x)
1826 {
1827   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1828      computing the square root. */
1829   if (((MAGIC64 >> (x & 63)) & 1)
1830       && ((MAGIC63 >> (x % 63)) & 1)
1831       /* Both 0 and 64 are squares mod (65) */
1832       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1833       && ((MAGIC11 >> (x % 11) & 1)))
1834     {
1835       uintmax_t r = isqrt (x);
1836       if (r*r == x)
1837         return r;
1838     }
1839   return 0;
1840 }
1841
1842 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1843 static const unsigned short invtab[0x81] =
1844   {
1845     0x200,
1846     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1847     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1848     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1849     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1850     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1851     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1852     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1853     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1854     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1855     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1856     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1857     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1858     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1859     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1860     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1861     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1862   };
1863
1864 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1865    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1866 # define div_smallq(q, r, u, d)                                          \
1867   do {                                                                  \
1868     if ((u) / 0x40 < (d))                                               \
1869       {                                                                 \
1870         int _cnt;                                                       \
1871         uintmax_t _dinv, _mask, _q, _r;                                 \
1872         count_leading_zeros (_cnt, (d));                                \
1873         _r = (u);                                                       \
1874         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1875           {                                                             \
1876             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1877             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1878           }                                                             \
1879         else                                                            \
1880           {                                                             \
1881             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1882             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1883           }                                                             \
1884         _r -= _q*(d);                                                   \
1885                                                                         \
1886         _mask = -(uintmax_t) (_r >= (d));                               \
1887         (r) = _r - (_mask & (d));                                       \
1888         (q) = _q - _mask;                                               \
1889         assert ( (q) * (d) + (r) == u);                                 \
1890       }                                                                 \
1891     else                                                                \
1892       {                                                                 \
1893         uintmax_t _q = (u) / (d);                                       \
1894         (r) = (u) - _q * (d);                                           \
1895         (q) = _q;                                                       \
1896       }                                                                 \
1897   } while (0)
1898
1899 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1900    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1901    with 3025 = 55^2.
1902
1903    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1904    representing G_0 = (-55, 2*4652, 8653).
1905
1906    In the notation of the paper:
1907
1908    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1909
1910    Put
1911
1912      t_0 = floor([q_0 + R_0] / S0) = 1
1913      R_1 = t_0 * S_0 - R_0 = 4001
1914      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1915 */
1916
1917 /* Multipliers, in order of efficiency:
1918    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1919    0.7317  3*5*7    =  105 = 1
1920    0.7820  3*5*11   =  165 = 1
1921    0.7872  3*5      =   15 = 3
1922    0.8101  3*7*11   =  231 = 3
1923    0.8155  3*7      =   21 = 1
1924    0.8284  5*7*11   =  385 = 1
1925    0.8339  5*7      =   35 = 3
1926    0.8716  3*11     =   33 = 1
1927    0.8774  3        =    3 = 3
1928    0.8913  5*11     =   55 = 3
1929    0.8972  5        =    5 = 1
1930    0.9233  7*11     =   77 = 1
1931    0.9295  7        =    7 = 3
1932    0.9934  11       =   11 = 3
1933 */
1934 # define QUEUE_SIZE 50
1935 #endif
1936
1937 #if STAT_SQUFOF
1938 # define Q_FREQ_SIZE 50
1939 /* Element 0 keeps the total */
1940 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1941 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1942 #endif
1943
1944 #if USE_SQUFOF
1945 /* Return true on success.  Expected to fail only for numbers
1946    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1947 static bool
1948 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1949 {
1950   /* Uses algorithm and notation from
1951
1952      SQUARE FORM FACTORIZATION
1953      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1954
1955      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1956    */
1957
1958   static const unsigned int multipliers_1[] =
1959     { /* = 1 (mod 4) */
1960       105, 165, 21, 385, 33, 5, 77, 1, 0
1961     };
1962   static const unsigned int multipliers_3[] =
1963     { /* = 3 (mod 4) */
1964       1155, 15, 231, 35, 3, 55, 7, 11, 0
1965     };
1966
1967   const unsigned int *m;
1968
1969   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1970
1971   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1972     return false;
1973
1974   uintmax_t sqrt_n = isqrt2 (n1, n0);
1975
1976   if (n0 == sqrt_n * sqrt_n)
1977     {
1978       uintmax_t p1, p0;
1979
1980       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1981       assert (p0 == n0);
1982
1983       if (n1 == p1)
1984         {
1985           if (prime_p (sqrt_n))
1986             factor_insert_multiplicity (factors, sqrt_n, 2);
1987           else
1988             {
1989               struct factors f;
1990
1991               f.nfactors = 0;
1992               if (!factor_using_squfof (0, sqrt_n, &f))
1993                 {
1994                   /* Try pollard rho instead */
1995                   factor_using_pollard_rho (sqrt_n, 1, &f);
1996                 }
1997               /* Duplicate the new factors */
1998               for (unsigned int i = 0; i < f.nfactors; i++)
1999                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
2000             }
2001           return true;
2002         }
2003     }
2004
2005   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2006   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2007        *m; m++)
2008     {
2009       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2010       unsigned int i;
2011       unsigned int mu = *m;
2012       unsigned int qpos = 0;
2013
2014       assert (mu * n0 % 4 == 3);
2015
2016       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2017          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2018          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2019          mu^3 < n.
2020
2021          However, this seems insufficient: With n = 37243139 and mu =
2022          105, we get a trivial factor, from the square 38809 = 197^2,
2023          without any corresponding Q earlier in the iteration.
2024
2025          Requiring 64 mu^3 < n seems sufficient. */
2026       if (n1 == 0)
2027         {
2028           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2029             continue;
2030         }
2031       else
2032         {
2033           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2034             continue;
2035         }
2036       umul_ppmm (Dh, Dl, n0, mu);
2037       Dh += n1 * mu;
2038
2039       assert (Dl % 4 != 1);
2040       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2041
2042       S = isqrt2 (Dh, Dl);
2043
2044       Q1 = 1;
2045       P = S;
2046
2047       /* Square root remainder fits in one word, so ignore high part. */
2048       Q = Dl - P*P;
2049       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2050       L = isqrt (2*S);
2051       B = 2*L;
2052       L1 = mu * 2 * L;
2053
2054       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2055          4 D. */
2056
2057       for (i = 0; i <= B; i++)
2058         {
2059           uintmax_t q, P1, t, rem;
2060
2061           div_smallq (q, rem, S+P, Q);
2062           P1 = S - rem; /* P1 = q*Q - P */
2063
2064           IF_LINT (assert (q > 0 && Q > 0));
2065
2066 # if STAT_SQUFOF
2067           q_freq[0]++;
2068           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2069 # endif
2070
2071           if (Q <= L1)
2072             {
2073               uintmax_t g = Q;
2074
2075               if ( (Q & 1) == 0)
2076                 g /= 2;
2077
2078               g /= gcd_odd (g, mu);
2079
2080               if (g <= L)
2081                 {
2082                   if (qpos >= QUEUE_SIZE)
2083                     die (EXIT_FAILURE, 0, _("squfof queue overflow"));
2084                   queue[qpos].Q = g;
2085                   queue[qpos].P = P % g;
2086                   qpos++;
2087                 }
2088             }
2089
2090           /* I think the difference can be either sign, but mod
2091              2^W_TYPE_SIZE arithmetic should be fine. */
2092           t = Q1 + q * (P - P1);
2093           Q1 = Q;
2094           Q = t;
2095           P = P1;
2096
2097           if ( (i & 1) == 0)
2098             {
2099               uintmax_t r = is_square (Q);
2100               if (r)
2101                 {
2102                   for (unsigned int j = 0; j < qpos; j++)
2103                     {
2104                       if (queue[j].Q == r)
2105                         {
2106                           if (r == 1)
2107                             /* Traversed entire cycle. */
2108                             goto next_multiplier;
2109
2110                           /* Need the absolute value for divisibility test. */
2111                           if (P >= queue[j].P)
2112                             t = P - queue[j].P;
2113                           else
2114                             t = queue[j].P - P;
2115                           if (t % r == 0)
2116                             {
2117                               /* Delete entries up to and including entry
2118                                  j, which matched. */
2119                               memmove (queue, queue + j + 1,
2120                                        (qpos - j - 1) * sizeof (queue[0]));
2121                               qpos -= (j + 1);
2122                             }
2123                           goto next_i;
2124                         }
2125                     }
2126
2127                   /* We have found a square form, which should give a
2128                      factor. */
2129                   Q1 = r;
2130                   assert (S >= P); /* What signs are possible? */
2131                   P += r * ((S - P) / r);
2132
2133                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2134                      for the case D = 2N. */
2135                   /* Compute Q = (D - P*P) / Q1, but we need double
2136                      precision. */
2137                   uintmax_t hi, lo;
2138                   umul_ppmm (hi, lo, P, P);
2139                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2140                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2141                   assert (rem == 0);
2142
2143                   for (;;)
2144                     {
2145                       /* Note: There appears to by a typo in the paper,
2146                          Step 4a in the algorithm description says q <--
2147                          floor([S+P]/\hat Q), but looking at the equations
2148                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2149                          (In this code, \hat Q is Q1). */
2150                       div_smallq (q, rem, S+P, Q);
2151                       P1 = S - rem;     /* P1 = q*Q - P */
2152
2153 # if STAT_SQUFOF
2154                       q_freq[0]++;
2155                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2156 # endif
2157                       if (P == P1)
2158                         break;
2159                       t = Q1 + q * (P - P1);
2160                       Q1 = Q;
2161                       Q = t;
2162                       P = P1;
2163                     }
2164
2165                   if ( (Q & 1) == 0)
2166                     Q /= 2;
2167                   Q /= gcd_odd (Q, mu);
2168
2169                   assert (Q > 1 && (n1 || Q < n0));
2170
2171                   if (prime_p (Q))
2172                     factor_insert (factors, Q);
2173                   else if (!factor_using_squfof (0, Q, factors))
2174                     factor_using_pollard_rho (Q, 2, factors);
2175
2176                   divexact_21 (n1, n0, n1, n0, Q);
2177
2178                   if (prime2_p (n1, n0))
2179                     factor_insert_large (factors, n1, n0);
2180                   else
2181                     {
2182                       if (!factor_using_squfof (n1, n0, factors))
2183                         {
2184                           if (n1 == 0)
2185                             factor_using_pollard_rho (n0, 1, factors);
2186                           else
2187                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2188                         }
2189                     }
2190
2191                   return true;
2192                 }
2193             }
2194         next_i:;
2195         }
2196     next_multiplier:;
2197     }
2198   return false;
2199 }
2200 #endif
2201
2202 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2203    results in FACTORS.  */
2204 static void
2205 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2206 {
2207   factors->nfactors = 0;
2208   factors->plarge[1] = 0;
2209
2210   if (t1 == 0 && t0 < 2)
2211     return;
2212
2213   t0 = factor_using_division (&t1, t1, t0, factors);
2214
2215   if (t1 == 0 && t0 < 2)
2216     return;
2217
2218   if (prime2_p (t1, t0))
2219     factor_insert_large (factors, t1, t0);
2220   else
2221     {
2222 #if USE_SQUFOF
2223       if (factor_using_squfof (t1, t0, factors))
2224         return;
2225 #endif
2226
2227       if (t1 == 0)
2228         factor_using_pollard_rho (t0, 1, factors);
2229       else
2230         factor_using_pollard_rho2 (t1, t0, 1, factors);
2231     }
2232 }
2233
2234 #if HAVE_GMP
2235 /* Use Pollard-rho to compute the prime factors of
2236    arbitrary-precision T, and put the results in FACTORS.  */
2237 static void
2238 mp_factor (mpz_t t, struct mp_factors *factors)
2239 {
2240   mp_factor_init (factors);
2241
2242   if (mpz_sgn (t) != 0)
2243     {
2244       mp_factor_using_division (t, factors);
2245
2246       if (mpz_cmp_ui (t, 1) != 0)
2247         {
2248           devmsg ("[is number prime?] ");
2249           if (mp_prime_p (t))
2250             mp_factor_insert (factors, t);
2251           else
2252             mp_factor_using_pollard_rho (t, 1, factors);
2253         }
2254     }
2255 }
2256 #endif
2257
2258 static strtol_error
2259 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2260 {
2261   unsigned int lo_carry;
2262   uintmax_t hi = 0, lo = 0;
2263
2264   strtol_error err = LONGINT_INVALID;
2265
2266   /* Skip initial spaces and '+'.  */
2267   for (;;)
2268     {
2269       char c = *s;
2270       if (c == ' ')
2271         s++;
2272       else if (c == '+')
2273         {
2274           s++;
2275           break;
2276         }
2277       else
2278         break;
2279     }
2280
2281   /* Initial scan for invalid digits.  */
2282   const char *p = s;
2283   for (;;)
2284     {
2285       unsigned int c = *p++;
2286       if (c == 0)
2287         break;
2288
2289       if (UNLIKELY (!ISDIGIT (c)))
2290         {
2291           err = LONGINT_INVALID;
2292           break;
2293         }
2294
2295       err = LONGINT_OK;           /* we've seen at least one valid digit */
2296     }
2297
2298   for (;err == LONGINT_OK;)
2299     {
2300       unsigned int c = *s++;
2301       if (c == 0)
2302         break;
2303
2304       c -= '0';
2305
2306       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2307         {
2308           err = LONGINT_OVERFLOW;
2309           break;
2310         }
2311       hi = 10 * hi;
2312
2313       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2314       lo_carry += 10 * lo < 2 * lo;
2315
2316       lo = 10 * lo;
2317       lo += c;
2318
2319       lo_carry += lo < c;
2320       hi += lo_carry;
2321       if (UNLIKELY (hi < lo_carry))
2322         {
2323           err = LONGINT_OVERFLOW;
2324           break;
2325         }
2326     }
2327
2328   *hip = hi;
2329   *lop = lo;
2330
2331   return err;
2332 }
2333
2334 /* Structure and routines for buffering and outputting full lines,
2335    to support parallel operation efficiently.  */
2336 static struct lbuf_
2337 {
2338   char *buf;
2339   char *end;
2340 } lbuf;
2341
2342 /* 512 is chosen to give good performance,
2343    and also is the max guaranteed size that
2344    consumers can read atomically through pipes.
2345    Also it's big enough to cater for max line length
2346    even with 128 bit uintmax_t.  */
2347 #define FACTOR_PIPE_BUF 512
2348
2349 static void
2350 lbuf_alloc (void)
2351 {
2352   if (lbuf.buf)
2353     return;
2354
2355   /* Double to ensure enough space for
2356      previous numbers + next number.  */
2357   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2358   lbuf.end = lbuf.buf;
2359 }
2360
2361 /* Write complete LBUF to standard output.  */
2362 static void
2363 lbuf_flush (void)
2364 {
2365   size_t size = lbuf.end - lbuf.buf;
2366   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2367     die (EXIT_FAILURE, errno, "%s", _("write error"));
2368   lbuf.end = lbuf.buf;
2369 }
2370
2371 /* Add a character C to LBUF and if it's a newline
2372    and enough bytes are already buffered,
2373    then write atomically to standard output.  */
2374 static void
2375 lbuf_putc (char c)
2376 {
2377   *lbuf.end++ = c;
2378
2379   if (c == '\n')
2380     {
2381       size_t buffered = lbuf.end - lbuf.buf;
2382
2383       /* Provide immediate output for interactive input.  */
2384       static int line_buffered = -1;
2385       if (line_buffered == -1)
2386         line_buffered = isatty (STDIN_FILENO);
2387       if (line_buffered)
2388         lbuf_flush ();
2389       else if (buffered >= FACTOR_PIPE_BUF)
2390         {
2391           /* Write output in <= PIPE_BUF chunks
2392              so consumers can read atomically.  */
2393           char const *tend = lbuf.end;
2394
2395           /* Since a umaxint_t's factors must fit in 512
2396              we're guaranteed to find a newline here.  */
2397           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2398           while (*--tlend != '\n');
2399           tlend++;
2400
2401           lbuf.end = tlend;
2402           lbuf_flush ();
2403
2404           /* Buffer the remainder.  */
2405           memcpy (lbuf.buf, tlend, tend - tlend);
2406           lbuf.end = lbuf.buf + (tend - tlend);
2407         }
2408     }
2409 }
2410
2411 /* Buffer an int to the internal LBUF.  */
2412 static void
2413 lbuf_putint (uintmax_t i, size_t min_width)
2414 {
2415   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2416   char const *umaxstr = umaxtostr (i, buf);
2417   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2418   size_t z = width;
2419
2420   for (; z < min_width; z++)
2421     *lbuf.end++ = '0';
2422
2423   memcpy (lbuf.end, umaxstr, width);
2424   lbuf.end += width;
2425 }
2426
2427 static void
2428 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2429 {
2430   uintmax_t q, r;
2431
2432   if (t1 == 0)
2433     lbuf_putint (t0, 0);
2434   else
2435     {
2436       /* Use very plain code here since it seems hard to write fast code
2437          without assuming a specific word size.  */
2438       q = t1 / 1000000000;
2439       r = t1 % 1000000000;
2440       udiv_qrnnd (t0, r, r, t0, 1000000000);
2441       print_uintmaxes (q, t0);
2442       lbuf_putint (r, 9);
2443     }
2444 }
2445
2446 /* Single-precision factoring */
2447 static void
2448 print_factors_single (uintmax_t t1, uintmax_t t0)
2449 {
2450   struct factors factors;
2451
2452   print_uintmaxes (t1, t0);
2453   lbuf_putc (':');
2454
2455   factor (t1, t0, &factors);
2456
2457   for (unsigned int j = 0; j < factors.nfactors; j++)
2458     for (unsigned int k = 0; k < factors.e[j]; k++)
2459       {
2460         lbuf_putc (' ');
2461         print_uintmaxes (0, factors.p[j]);
2462       }
2463
2464   if (factors.plarge[1])
2465     {
2466       lbuf_putc (' ');
2467       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2468     }
2469
2470   lbuf_putc ('\n');
2471 }
2472
2473 /* Emit the factors of the indicated number.  If we have the option of using
2474    either algorithm, we select on the basis of the length of the number.
2475    For longer numbers, we prefer the MP algorithm even if the native algorithm
2476    has enough digits, because the algorithm is better.  The turnover point
2477    depends on the value.  */
2478 static bool
2479 print_factors (const char *input)
2480 {
2481   uintmax_t t1, t0;
2482
2483   /* Try converting the number to one or two words.  If it fails, use GMP or
2484      print an error message.  The 2nd condition checks that the most
2485      significant bit of the two-word number is clear, in a typesize neutral
2486      way.  */
2487   strtol_error err = strto2uintmax (&t1, &t0, input);
2488
2489   switch (err)
2490     {
2491     case LONGINT_OK:
2492       if (((t1 << 1) >> 1) == t1)
2493         {
2494           devmsg ("[using single-precision arithmetic] ");
2495           print_factors_single (t1, t0);
2496           return true;
2497         }
2498       break;
2499
2500     case LONGINT_OVERFLOW:
2501       /* Try GMP.  */
2502       break;
2503
2504     default:
2505       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2506       return false;
2507     }
2508
2509 #if HAVE_GMP
2510   devmsg ("[using arbitrary-precision arithmetic] ");
2511   mpz_t t;
2512   struct mp_factors factors;
2513
2514   mpz_init_set_str (t, input, 10);
2515
2516   gmp_printf ("%Zd:", t);
2517   mp_factor (t, &factors);
2518
2519   for (unsigned int j = 0; j < factors.nfactors; j++)
2520     for (unsigned int k = 0; k < factors.e[j]; k++)
2521       gmp_printf (" %Zd", factors.p[j]);
2522
2523   mp_factor_clear (&factors);
2524   mpz_clear (t);
2525   putchar ('\n');
2526   fflush (stdout);
2527   return true;
2528 #else
2529   error (0, 0, _("%s is too large"), quote (input));
2530   return false;
2531 #endif
2532 }
2533
2534 void
2535 usage (int status)
2536 {
2537   if (status != EXIT_SUCCESS)
2538     emit_try_help ();
2539   else
2540     {
2541       printf (_("\
2542 Usage: %s [NUMBER]...\n\
2543   or:  %s OPTION\n\
2544 "),
2545               program_name, program_name);
2546       fputs (_("\
2547 Print the prime factors of each specified integer NUMBER.  If none\n\
2548 are specified on the command line, read them from standard input.\n\
2549 \n\
2550 "), stdout);
2551       fputs (HELP_OPTION_DESCRIPTION, stdout);
2552       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2553       emit_ancillary_info (PROGRAM_NAME);
2554     }
2555   exit (status);
2556 }
2557
2558 static bool
2559 do_stdin (void)
2560 {
2561   bool ok = true;
2562   token_buffer tokenbuffer;
2563
2564   init_tokenbuffer (&tokenbuffer);
2565
2566   while (true)
2567     {
2568       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2569                                        &tokenbuffer);
2570       if (token_length == (size_t) -1)
2571         break;
2572       ok &= print_factors (tokenbuffer.buffer);
2573     }
2574   free (tokenbuffer.buffer);
2575
2576   return ok;
2577 }
2578
2579 int
2580 main (int argc, char **argv)
2581 {
2582   initialize_main (&argc, &argv);
2583   set_program_name (argv[0]);
2584   setlocale (LC_ALL, "");
2585   bindtextdomain (PACKAGE, LOCALEDIR);
2586   textdomain (PACKAGE);
2587
2588   lbuf_alloc ();
2589   atexit (close_stdout);
2590   atexit (lbuf_flush);
2591
2592   int c;
2593   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2594     {
2595       switch (c)
2596         {
2597         case DEV_DEBUG_OPTION:
2598           dev_debug = true;
2599           break;
2600
2601         case_GETOPT_HELP_CHAR;
2602
2603         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2604
2605         default:
2606           usage (EXIT_FAILURE);
2607         }
2608     }
2609
2610 #if STAT_SQUFOF
2611   memset (q_freq, 0, sizeof (q_freq));
2612 #endif
2613
2614   bool ok;
2615   if (argc <= optind)
2616     ok = do_stdin ();
2617   else
2618     {
2619       ok = true;
2620       for (int i = optind; i < argc; i++)
2621         if (! print_factors (argv[i]))
2622           ok = false;
2623     }
2624
2625 #if STAT_SQUFOF
2626   if (q_freq[0] > 0)
2627     {
2628       double acc_f;
2629       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2630       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2631         {
2632           double f = (double) q_freq[i] / q_freq[0];
2633           acc_f += f;
2634           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2635                   100.0 * f, 100.0 * acc_f);
2636         }
2637     }
2638 #endif
2639
2640   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2641 }