src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or if USE_SQUFOF is defined to 1, try that first.
  54         Status of found factors are checked again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #if HAVE_GMP
 108 # include <gmp.h>
 109 # if !HAVE_DECL_MPZ_INITS
 110 #  include <stdarg.h>
 111 # endif
 112 #endif
 113
 114 #include <assert.h>
 115
 116 #include "system.h"
 117 #include "error.h"
 118 #include "full-write.h"
 119 #include "quote.h"
 120 #include "readtokens.h"
 121 #include "xstrtol.h"
 122
 123 /* The official name of this program (e.g., no 'g' prefix).  */
 124 #define PROGRAM_NAME "factor"
 125
 126 #define AUTHORS \
 127   proper_name ("Paul Rubin"),                                           \
 128   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 129   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 130
 131 /* Token delimiters when reading from a file.  */
 132 #define DELIM "\n\t "
 133
 134 #ifndef USE_LONGLONG_H
 135 /* With the way we use longlong.h, it's only safe to use
 136    when UWtype = UHWtype, as there were various cases
 137    (as can be seen in the history for longlong.h) where
 138    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 139    to avoid compile time or run time issues.  */
 140 # if LONG_MAX == INTMAX_MAX
 141 #  define USE_LONGLONG_H 1
 142 # endif
 143 #endif
 144
 145 #if USE_LONGLONG_H
 146
 147 /* Make definitions for longlong.h to make it do what it can do for us */
 148
 149 /* bitcount for uintmax_t */
 150 # if UINTMAX_MAX == UINT32_MAX
 151 #  define W_TYPE_SIZE 32
 152 # elif UINTMAX_MAX == UINT64_MAX
 153 #  define W_TYPE_SIZE 64
 154 # elif UINTMAX_MAX == UINT128_MAX
 155 #  define W_TYPE_SIZE 128
 156 # endif
 157
 158 # define UWtype  uintmax_t
 159 # define UHWtype unsigned long int
 160 # undef UDWtype
 161 # if HAVE_ATTRIBUTE_MODE
 162 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 163 typedef          int SItype     __attribute__ ((mode (SI)));
 164 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 165 typedef          int DItype     __attribute__ ((mode (DI)));
 166 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 167 # else
 168 typedef unsigned char UQItype;
 169 typedef          long SItype;
 170 typedef unsigned long int USItype;
 171 #  if HAVE_LONG_LONG_INT
 172 typedef long long int DItype;
 173 typedef unsigned long long int UDItype;
 174 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 175 typedef long int DItype;
 176 typedef unsigned long int UDItype;
 177 #  endif
 178 # endif
 179 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 180 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 181 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 182 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 183 # ifndef __GMP_GNUC_PREREQ
 184 #  define __GMP_GNUC_PREREQ(a,b) 1
 185 # endif
 186
 187 /* These stub macros are only used in longlong.h in certain system compiler
 188    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 189 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 190 ASSERT (1)
 191 __GMP_DECLSPEC
 192 # endif
 193
 194 # if _ARCH_PPC
 195 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 196 # endif
 197 # include "longlong.h"
 198 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 199 const unsigned char factor_clz_tab[129] =
 200 {
 201   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 202   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 203   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 204   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 205   9
 206 };
 207 # endif
 208
 209 #else /* not USE_LONGLONG_H */
 210
 211 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 212 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 213 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 214 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 215
 216 #endif
 217
 218 #if !defined __clz_tab && !defined UHWtype
 219 /* Without this seemingly useless conditional, gcc -Wunused-macros
 220    warns that each of the two tested macros is unused on Fedora 18.
 221    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 222 #endif
 223
 224 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 225 #define MAX_NFACTS 26
 226
 227 enum
 228 {
 229   DEV_DEBUG_OPTION = CHAR_MAX + 1
 230 };
 231
 232 static struct option const long_options[] =
 233 {
 234   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 235   {GETOPT_HELP_OPTION_DECL},
 236   {GETOPT_VERSION_OPTION_DECL},
 237   {NULL, 0, NULL, 0}
 238 };
 239
 240 struct factors
 241 {
 242   uintmax_t     plarge[2]; /* Can have a single large factor */
 243   uintmax_t     p[MAX_NFACTS];
 244   unsigned char e[MAX_NFACTS];
 245   unsigned char nfactors;
 246 };
 247
 248 #if HAVE_GMP
 249 struct mp_factors
 250 {
 251   mpz_t             *p;
 252   unsigned long int *e;
 253   unsigned long int nfactors;
 254 };
 255 #endif
 256
 257 static void factor (uintmax_t, uintmax_t, struct factors *);
 258
 259 #ifndef umul_ppmm
 260 # define umul_ppmm(w1, w0, u, v)                                        \
 261   do {                                                                  \
 262     uintmax_t __x0, __x1, __x2, __x3;                                   \
 263     unsigned long int __ul, __vl, __uh, __vh;                           \
 264     uintmax_t __u = (u), __v = (v);                                     \
 265                                                                         \
 266     __ul = __ll_lowpart (__u);                                          \
 267     __uh = __ll_highpart (__u);                                         \
 268     __vl = __ll_lowpart (__v);                                          \
 269     __vh = __ll_highpart (__v);                                         \
 270                                                                         \
 271     __x0 = (uintmax_t) __ul * __vl;                                     \
 272     __x1 = (uintmax_t) __ul * __vh;                                     \
 273     __x2 = (uintmax_t) __uh * __vl;                                     \
 274     __x3 = (uintmax_t) __uh * __vh;                                     \
 275                                                                         \
 276     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 277     __x1 += __x2;               /* but this indeed can */               \
 278     if (__x1 < __x2)            /* did we get it? */                    \
 279       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 280                                                                         \
 281     (w1) = __x3 + __ll_highpart (__x1);                                 \
 282     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 283   } while (0)
 284 #endif
 285
 286 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 287 /* Define our own, not needing normalization. This function is
 288    currently not performance critical, so keep it simple. Similar to
 289    the mod macro below. */
 290 # undef udiv_qrnnd
 291 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 292   do {                                                                  \
 293     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 294                                                                         \
 295     assert ((n1) < (d));                                                \
 296     __d1 = (d); __d0 = 0;                                               \
 297     __r1 = (n1); __r0 = (n0);                                           \
 298     __q = 0;                                                            \
 299     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 300       {                                                                 \
 301         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 302         __q <<= 1;                                                      \
 303         if (ge2 (__r1, __r0, __d1, __d0))                               \
 304           {                                                             \
 305             __q++;                                                      \
 306             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 307           }                                                             \
 308       }                                                                 \
 309     (r) = __r0;                                                         \
 310     (q) = __q;                                                          \
 311   } while (0)
 312 #endif
 313
 314 #if !defined add_ssaaaa
 315 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 316   do {                                                                  \
 317     uintmax_t _add_x;                                                   \
 318     _add_x = (al) + (bl);                                               \
 319     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 320     (sl) = _add_x;                                                      \
 321   } while (0)
 322 #endif
 323
 324 #define rsh2(rh, rl, ah, al, cnt)                                       \
 325   do {                                                                  \
 326     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 327     (rh) = (ah) >> (cnt);                                               \
 328   } while (0)
 329
 330 #define lsh2(rh, rl, ah, al, cnt)                                       \
 331   do {                                                                  \
 332     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 333     (rl) = (al) << (cnt);                                               \
 334   } while (0)
 335
 336 #define ge2(ah, al, bh, bl)                                             \
 337   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 338
 339 #define gt2(ah, al, bh, bl)                                             \
 340   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 341
 342 #ifndef sub_ddmmss
 343 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 344   do {                                                                  \
 345     uintmax_t _cy;                                                      \
 346     _cy = (al) < (bl);                                                  \
 347     (rl) = (al) - (bl);                                                 \
 348     (rh) = (ah) - (bh) - _cy;                                           \
 349   } while (0)
 350 #endif
 351
 352 #ifndef count_leading_zeros
 353 # define count_leading_zeros(count, x) do {                             \
 354     uintmax_t __clz_x = (x);                                            \
 355     unsigned int __clz_c;                                               \
 356     for (__clz_c = 0;                                                   \
 357          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 358          __clz_c += 8)                                                  \
 359       __clz_x <<= 8;                                                    \
 360     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 361       __clz_x <<= 1;                                                    \
 362     (count) = __clz_c;                                                  \
 363   } while (0)
 364 #endif
 365
 366 #ifndef count_trailing_zeros
 367 # define count_trailing_zeros(count, x) do {                            \
 368     uintmax_t __ctz_x = (x);                                            \
 369     unsigned int __ctz_c = 0;                                           \
 370     while ((__ctz_x & 1) == 0)                                          \
 371       {                                                                 \
 372         __ctz_x >>= 1;                                                  \
 373         __ctz_c++;                                                      \
 374       }                                                                 \
 375     (count) = __ctz_c;                                                  \
 376   } while (0)
 377 #endif
 378
 379 /* Requires that a < n and b <= n */
 380 #define submod(r,a,b,n)                                                 \
 381   do {                                                                  \
 382     uintmax_t _t = - (uintmax_t) (a < b);                               \
 383     (r) = ((n) & _t) + (a) - (b);                                       \
 384   } while (0)
 385
 386 #define addmod(r,a,b,n)                                                 \
 387   submod ((r), (a), ((n) - (b)), (n))
 388
 389 /* Modular two-word addition and subtraction.  For performance reasons, the
 390    most significant bit of n1 must be clear.  The destination variables must be
 391    distinct from the mod operand.  */
 392 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 393   do {                                                                  \
 394     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 395     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 396       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 397   } while (0)
 398 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 399   do {                                                                  \
 400     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 401     if ((intmax_t) (r1) < 0)                                            \
 402       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 403   } while (0)
 404
 405 #define HIGHBIT_TO_MASK(x)                                              \
 406   (((intmax_t)-1 >> 1) < 0                                              \
 407    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 408    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 409       ? UINTMAX_MAX : (uintmax_t) 0))
 410
 411 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 412    Requires that d1 != 0.  */
 413 static uintmax_t
 414 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 415 {
 416   int cntd, cnta;
 417
 418   assert (d1 != 0);
 419
 420   if (a1 == 0)
 421     {
 422       *r1 = 0;
 423       return a0;
 424     }
 425
 426   count_leading_zeros (cntd, d1);
 427   count_leading_zeros (cnta, a1);
 428   int cnt = cntd - cnta;
 429   lsh2 (d1, d0, d1, d0, cnt);
 430   for (int i = 0; i < cnt; i++)
 431     {
 432       if (ge2 (a1, a0, d1, d0))
 433         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 434       rsh2 (d1, d0, d1, d0, 1);
 435     }
 436
 437   *r1 = a1;
 438   return a0;
 439 }
 440
 441 static uintmax_t _GL_ATTRIBUTE_CONST
 442 gcd_odd (uintmax_t a, uintmax_t b)
 443 {
 444   if ( (b & 1) == 0)
 445     {
 446       uintmax_t t = b;
 447       b = a;
 448       a = t;
 449     }
 450   if (a == 0)
 451     return b;
 452
 453   /* Take out least significant one bit, to make room for sign */
 454   b >>= 1;
 455
 456   for (;;)
 457     {
 458       uintmax_t t;
 459       uintmax_t bgta;
 460
 461       while ((a & 1) == 0)
 462         a >>= 1;
 463       a >>= 1;
 464
 465       t = a - b;
 466       if (t == 0)
 467         return (a << 1) + 1;
 468
 469       bgta = HIGHBIT_TO_MASK (t);
 470
 471       /* b <-- min (a, b) */
 472       b += (bgta & t);
 473
 474       /* a <-- |a - b| */
 475       a = (t ^ bgta) - bgta;
 476     }
 477 }
 478
 479 static uintmax_t
 480 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 481 {
 482   while ((a0 & 1) == 0)
 483     rsh2 (a1, a0, a1, a0, 1);
 484   while ((b0 & 1) == 0)
 485     rsh2 (b1, b0, b1, b0, 1);
 486
 487   for (;;)
 488     {
 489       if ((b1 | a1) == 0)
 490         {
 491           *r1 = 0;
 492           return gcd_odd (b0, a0);
 493         }
 494
 495       if (gt2 (a1, a0, b1, b0))
 496         {
 497           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 498           do
 499             rsh2 (a1, a0, a1, a0, 1);
 500           while ((a0 & 1) == 0);
 501         }
 502       else if (gt2 (b1, b0, a1, a0))
 503         {
 504           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 505           do
 506             rsh2 (b1, b0, b1, b0, 1);
 507           while ((b0 & 1) == 0);
 508         }
 509       else
 510         break;
 511     }
 512
 513   *r1 = a1;
 514   return a0;
 515 }
 516
 517 static void
 518 factor_insert_multiplicity (struct factors *factors,
 519                             uintmax_t prime, unsigned int m)
 520 {
 521   unsigned int nfactors = factors->nfactors;
 522   uintmax_t *p = factors->p;
 523   unsigned char *e = factors->e;
 524
 525   /* Locate position for insert new or increment e.  */
 526   int i;
 527   for (i = nfactors - 1; i >= 0; i--)
 528     {
 529       if (p[i] <= prime)
 530         break;
 531     }
 532
 533   if (i < 0 || p[i] != prime)
 534     {
 535       for (int j = nfactors - 1; j > i; j--)
 536         {
 537           p[j + 1] = p[j];
 538           e[j + 1] = e[j];
 539         }
 540       p[i + 1] = prime;
 541       e[i + 1] = m;
 542       factors->nfactors = nfactors + 1;
 543     }
 544   else
 545     {
 546       e[i] += m;
 547     }
 548 }
 549
 550 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 551
 552 static void
 553 factor_insert_large (struct factors *factors,
 554                      uintmax_t p1, uintmax_t p0)
 555 {
 556   if (p1 > 0)
 557     {
 558       assert (factors->plarge[1] == 0);
 559       factors->plarge[0] = p0;
 560       factors->plarge[1] = p1;
 561     }
 562   else
 563     factor_insert (factors, p0);
 564 }
 565
 566 #if HAVE_GMP
 567
 568 # if !HAVE_DECL_MPZ_INITS
 569
 570 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 571 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 572
 573 static void
 574 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 575 {
 576   va_list ap;
 577
 578   va_start (ap, mpz_single_init);
 579
 580   mpz_t *mpz;
 581   while ((mpz = va_arg (ap, mpz_t *)))
 582     mpz_single_init (*mpz);
 583
 584   va_end (ap);
 585 }
 586 # endif
 587
 588 static void mp_factor (mpz_t, struct mp_factors *);
 589
 590 static void
 591 mp_factor_init (struct mp_factors *factors)
 592 {
 593   factors->p = NULL;
 594   factors->e = NULL;
 595   factors->nfactors = 0;
 596 }
 597
 598 static void
 599 mp_factor_clear (struct mp_factors *factors)
 600 {
 601   for (unsigned int i = 0; i < factors->nfactors; i++)
 602     mpz_clear (factors->p[i]);
 603
 604   free (factors->p);
 605   free (factors->e);
 606 }
 607
 608 static void
 609 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 610 {
 611   unsigned long int nfactors = factors->nfactors;
 612   mpz_t         *p  = factors->p;
 613   unsigned long int *e  = factors->e;
 614   long i;
 615
 616   /* Locate position for insert new or increment e.  */
 617   for (i = nfactors - 1; i >= 0; i--)
 618     {
 619       if (mpz_cmp (p[i], prime) <= 0)
 620         break;
 621     }
 622
 623   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 624     {
 625       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 626       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 627
 628       mpz_init (p[nfactors]);
 629       for (long j = nfactors - 1; j > i; j--)
 630         {
 631           mpz_set (p[j + 1], p[j]);
 632           e[j + 1] = e[j];
 633         }
 634       mpz_set (p[i + 1], prime);
 635       e[i + 1] = 1;
 636
 637       factors->p = p;
 638       factors->e = e;
 639       factors->nfactors = nfactors + 1;
 640     }
 641   else
 642     {
 643       e[i] += 1;
 644     }
 645 }
 646
 647 static void
 648 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 649 {
 650   mpz_t pz;
 651
 652   mpz_init_set_ui (pz, prime);
 653   mp_factor_insert (factors, pz);
 654   mpz_clear (pz);
 655 }
 656 #endif /* HAVE_GMP */
 657
 658
 659 /* Number of bits in an uintmax_t.  */
 660 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 661
 662 /* Verify that uintmax_t does not have holes in its representation.  */
 663 verify (UINTMAX_MAX >> (W - 1) == 1);
 664
 665 #define P(a,b,c,d) a,
 666 static const unsigned char primes_diff[] = {
 667 #include "primes.h"
 668 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 669 };
 670 #undef P
 671
 672 #define PRIMES_PTAB_ENTRIES \
 673   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 674
 675 #define P(a,b,c,d) b,
 676 static const unsigned char primes_diff8[] = {
 677 #include "primes.h"
 678 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 679 };
 680 #undef P
 681
 682 struct primes_dtab
 683 {
 684   uintmax_t binv, lim;
 685 };
 686
 687 #define P(a,b,c,d) {c,d},
 688 static const struct primes_dtab primes_dtab[] = {
 689 #include "primes.h"
 690 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 691 };
 692 #undef P
 693
 694 /* Verify that uintmax_t is not wider than
 695    the integers used to generate primes.h.  */
 696 verify (W <= WIDE_UINT_BITS);
 697
 698 /* debugging for developers.  Enables devmsg().
 699    This flag is used only in the GMP code.  */
 700 static bool dev_debug = false;
 701
 702 /* Prove primality or run probabilistic tests.  */
 703 static bool flag_prove_primality = PROVE_PRIMALITY;
 704
 705 /* Number of Miller-Rabin tests to run when not proving primality. */
 706 #define MR_REPS 25
 707
 708 static void
 709 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 710                       unsigned int off)
 711 {
 712   for (unsigned int j = 0; j < off; j++)
 713     p += primes_diff[i + j];
 714   factor_insert (factors, p);
 715 }
 716
 717 /* Trial division with odd primes uses the following trick.
 718
 719    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 720    consider the case t < B (this is the second loop below).
 721
 722    From our tables we get
 723
 724      binv = p^{-1} (mod B)
 725      lim = floor ( (B-1) / p ).
 726
 727    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 728    (and all quotients in this range occur for some t).
 729
 730    Then t = q * p is true also (mod B), and p is invertible we get
 731
 732      q = t * binv (mod B).
 733
 734    Next, assume that t is *not* divisible by p. Since multiplication
 735    by binv (mod B) is a one-to-one mapping,
 736
 737      t * binv (mod B) > lim,
 738
 739    because all the smaller values are already taken.
 740
 741    This can be summed up by saying that the function
 742
 743      q(t) = binv * t (mod B)
 744
 745    is a permutation of the range 0 <= t < B, with the curious property
 746    that it maps the multiples of p onto the range 0 <= q <= lim, in
 747    order, and the non-multiples of p onto the range lim < q < B.
 748  */
 749
 750 static uintmax_t
 751 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 752                        struct factors *factors)
 753 {
 754   if (t0 % 2 == 0)
 755     {
 756       unsigned int cnt;
 757
 758       if (t0 == 0)
 759         {
 760           count_trailing_zeros (cnt, t1);
 761           t0 = t1 >> cnt;
 762           t1 = 0;
 763           cnt += W_TYPE_SIZE;
 764         }
 765       else
 766         {
 767           count_trailing_zeros (cnt, t0);
 768           rsh2 (t1, t0, t1, t0, cnt);
 769         }
 770
 771       factor_insert_multiplicity (factors, 2, cnt);
 772     }
 773
 774   uintmax_t p = 3;
 775   unsigned int i;
 776   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 777     {
 778       for (;;)
 779         {
 780           uintmax_t q1, q0, hi, lo _GL_UNUSED;
 781
 782           q0 = t0 * primes_dtab[i].binv;
 783           umul_ppmm (hi, lo, q0, p);
 784           if (hi > t1)
 785             break;
 786           hi = t1 - hi;
 787           q1 = hi * primes_dtab[i].binv;
 788           if (LIKELY (q1 > primes_dtab[i].lim))
 789             break;
 790           t1 = q1; t0 = q0;
 791           factor_insert (factors, p);
 792         }
 793       p += primes_diff[i + 1];
 794     }
 795   if (t1p)
 796     *t1p = t1;
 797
 798 #define DIVBLOCK(I)                                                     \
 799   do {                                                                  \
 800     for (;;)                                                            \
 801       {                                                                 \
 802         q = t0 * pd[I].binv;                                            \
 803         if (LIKELY (q > pd[I].lim))                                     \
 804           break;                                                        \
 805         t0 = q;                                                         \
 806         factor_insert_refind (factors, p, i + 1, I);                    \
 807       }                                                                 \
 808   } while (0)
 809
 810   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 811     {
 812       uintmax_t q;
 813       const struct primes_dtab *pd = &primes_dtab[i];
 814       DIVBLOCK (0);
 815       DIVBLOCK (1);
 816       DIVBLOCK (2);
 817       DIVBLOCK (3);
 818       DIVBLOCK (4);
 819       DIVBLOCK (5);
 820       DIVBLOCK (6);
 821       DIVBLOCK (7);
 822
 823       p += primes_diff8[i];
 824       if (p * p > t0)
 825         break;
 826     }
 827
 828   return t0;
 829 }
 830
 831 #if HAVE_GMP
 832 static void
 833 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 834 {
 835   mpz_t q;
 836   unsigned long int p;
 837
 838   devmsg ("[trial division] ");
 839
 840   mpz_init (q);
 841
 842   p = mpz_scan1 (t, 0);
 843   mpz_div_2exp (t, t, p);
 844   while (p)
 845     {
 846       mp_factor_insert_ui (factors, 2);
 847       --p;
 848     }
 849
 850   p = 3;
 851   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 852     {
 853       if (! mpz_divisible_ui_p (t, p))
 854         {
 855           p += primes_diff[i++];
 856           if (mpz_cmp_ui (t, p * p) < 0)
 857             break;
 858         }
 859       else
 860         {
 861           mpz_tdiv_q_ui (t, t, p);
 862           mp_factor_insert_ui (factors, p);
 863         }
 864     }
 865
 866   mpz_clear (q);
 867 }
 868 #endif
 869
 870 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 871 static const unsigned char  binvert_table[128] =
 872 {
 873   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 874   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 875   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 876   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 877   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 878   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 879   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 880   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 881   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 882   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 883   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 884   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 885   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 886   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 887   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 888   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 889 };
 890
 891 /* Compute n^(-1) mod B, using a Newton iteration.  */
 892 #define binv(inv,n)                                                     \
 893   do {                                                                  \
 894     uintmax_t  __n = (n);                                               \
 895     uintmax_t  __inv;                                                   \
 896                                                                         \
 897     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 898     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 899     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 900     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 901                                                                         \
 902     if (W_TYPE_SIZE > 64)                                               \
 903       {                                                                 \
 904         int  __invbits = 64;                                            \
 905         do {                                                            \
 906           __inv = 2 * __inv - __inv * __inv * __n;                      \
 907           __invbits *= 2;                                               \
 908         } while (__invbits < W_TYPE_SIZE);                              \
 909       }                                                                 \
 910                                                                         \
 911     (inv) = __inv;                                                      \
 912   } while (0)
 913
 914 /* q = u / d, assuming d|u.  */
 915 #define divexact_21(q1, q0, u1, u0, d)                                  \
 916   do {                                                                  \
 917     uintmax_t _di, _q0;                                                 \
 918     binv (_di, (d));                                                    \
 919     _q0 = (u0) * _di;                                                   \
 920     if ((u1) >= (d))                                                    \
 921       {                                                                 \
 922         uintmax_t _p1, _p0 _GL_UNUSED;                            \
 923         umul_ppmm (_p1, _p0, _q0, d);                                   \
 924         (q1) = ((u1) - _p1) * _di;                                      \
 925         (q0) = _q0;                                                     \
 926       }                                                                 \
 927     else                                                                \
 928       {                                                                 \
 929         (q0) = _q0;                                                     \
 930         (q1) = 0;                                                       \
 931       }                                                                 \
 932   } while (0)
 933
 934 /* x B (mod n). */
 935 #define redcify(r_prim, r, n)                                           \
 936   do {                                                                  \
 937     uintmax_t _redcify_q _GL_UNUSED;                              \
 938     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 939   } while (0)
 940
 941 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 942 #define redcify2(r1, r0, x, n1, n0)                                     \
 943   do {                                                                  \
 944     uintmax_t _r1, _r0, _i;                                             \
 945     if ((x) < (n1))                                                     \
 946       {                                                                 \
 947         _r1 = (x); _r0 = 0;                                             \
 948         _i = W_TYPE_SIZE;                                               \
 949       }                                                                 \
 950     else                                                                \
 951       {                                                                 \
 952         _r1 = 0; _r0 = (x);                                             \
 953         _i = 2*W_TYPE_SIZE;                                             \
 954       }                                                                 \
 955     while (_i-- > 0)                                                    \
 956       {                                                                 \
 957         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 958         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 959           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 960       }                                                                 \
 961     (r1) = _r1;                                                         \
 962     (r0) = _r0;                                                         \
 963   } while (0)
 964
 965 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 966    Both a and b must be in redc form, the result will be in redc form too. */
 967 static inline uintmax_t
 968 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 969 {
 970   uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
 971
 972   umul_ppmm (rh, rl, a, b);
 973   q = rl * mi;
 974   umul_ppmm (th, tl, q, m);
 975   xh = rh - th;
 976   if (rh < th)
 977     xh += m;
 978
 979   return xh;
 980 }
 981
 982 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 983    Both a and b must be in redc form, the result will be in redc form too.
 984    For performance reasons, the most significant bit of m must be clear. */
 985 static uintmax_t
 986 mulredc2 (uintmax_t *r1p,
 987           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 988           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 989 {
 990   uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
 991   mi = -mi;
 992   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
 993   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
 994   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
 995
 996   /* First compute a0 * <b1, b0> B^{-1}
 997         +-----+
 998         |a0 b0|
 999      +--+--+--+
1000      |a0 b1|
1001      +--+--+--+
1002         |q0 m0|
1003      +--+--+--+
1004      |q0 m1|
1005     -+--+--+--+
1006      |r1|r0| 0|
1007      +--+--+--+
1008   */
1009   umul_ppmm (t1, t0, a0, b0);
1010   umul_ppmm (r1, r0, a0, b1);
1011   q = mi * t0;
1012   umul_ppmm (p1, p0, q, m0);
1013   umul_ppmm (s1, s0, q, m1);
1014   r0 += (t0 != 0); /* Carry */
1015   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1016   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1017   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1018
1019   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1020         +-----+
1021         |a1 b0|
1022         +--+--+
1023         |r1|r0|
1024      +--+--+--+
1025      |a1 b1|
1026      +--+--+--+
1027         |q1 m0|
1028      +--+--+--+
1029      |q1 m1|
1030     -+--+--+--+
1031      |r1|r0| 0|
1032      +--+--+--+
1033   */
1034   umul_ppmm (t1, t0, a1, b0);
1035   umul_ppmm (s1, s0, a1, b1);
1036   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1037   q = mi * t0;
1038   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1039   umul_ppmm (p1, p0, q, m0);
1040   umul_ppmm (s1, s0, q, m1);
1041   r0 += (t0 != 0); /* Carry */
1042   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1043   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1044   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1045
1046   if (ge2 (r1, r0, m1, m0))
1047     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1048
1049   *r1p = r1;
1050   return r0;
1051 }
1052
1053 static uintmax_t _GL_ATTRIBUTE_CONST
1054 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1055 {
1056   uintmax_t y = one;
1057
1058   if (e & 1)
1059     y = b;
1060
1061   while (e != 0)
1062     {
1063       b = mulredc (b, b, n, ni);
1064       e >>= 1;
1065
1066       if (e & 1)
1067         y = mulredc (y, b, n, ni);
1068     }
1069
1070   return y;
1071 }
1072
1073 static uintmax_t
1074 powm2 (uintmax_t *r1m,
1075        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1076        uintmax_t ni, const uintmax_t *one)
1077 {
1078   uintmax_t r1, r0, b1, b0, n1, n0;
1079   unsigned int i;
1080   uintmax_t e;
1081
1082   b0 = bp[0];
1083   b1 = bp[1];
1084   n0 = np[0];
1085   n1 = np[1];
1086
1087   r0 = one[0];
1088   r1 = one[1];
1089
1090   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1091     {
1092       if (e & 1)
1093         {
1094           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1095           r1 = *r1m;
1096         }
1097       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1098       b1 = *r1m;
1099     }
1100   for (e = ep[1]; e > 0; e >>= 1)
1101     {
1102       if (e & 1)
1103         {
1104           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1105           r1 = *r1m;
1106         }
1107       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1108       b1 = *r1m;
1109     }
1110   *r1m = r1;
1111   return r0;
1112 }
1113
1114 static bool _GL_ATTRIBUTE_CONST
1115 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1116              unsigned int k, uintmax_t one)
1117 {
1118   uintmax_t y = powm (b, q, n, ni, one);
1119
1120   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1121
1122   if (y == one || y == nm1)
1123     return true;
1124
1125   for (unsigned int i = 1; i < k; i++)
1126     {
1127       y = mulredc (y, y, n, ni);
1128
1129       if (y == nm1)
1130         return true;
1131       if (y == one)
1132         return false;
1133     }
1134   return false;
1135 }
1136
1137 static bool
1138 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1139               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1140 {
1141   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1142
1143   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1144   y1 = r1m;
1145
1146   if (y0 == one[0] && y1 == one[1])
1147     return true;
1148
1149   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1150
1151   if (y0 == nm1_0 && y1 == nm1_1)
1152     return true;
1153
1154   for (unsigned int i = 1; i < k; i++)
1155     {
1156       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1157       y1 = r1m;
1158
1159       if (y0 == nm1_0 && y1 == nm1_1)
1160         return true;
1161       if (y0 == one[0] && y1 == one[1])
1162         return false;
1163     }
1164   return false;
1165 }
1166
1167 #if HAVE_GMP
1168 static bool
1169 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1170                 mpz_srcptr q, unsigned long int k)
1171 {
1172   mpz_powm (y, x, q, n);
1173
1174   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1175     return true;
1176
1177   for (unsigned long int i = 1; i < k; i++)
1178     {
1179       mpz_powm_ui (y, y, 2, n);
1180       if (mpz_cmp (y, nm1) == 0)
1181         return true;
1182       if (mpz_cmp_ui (y, 1) == 0)
1183         return false;
1184     }
1185   return false;
1186 }
1187 #endif
1188
1189 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1190    have been observed.  The average seem to be about 2.  */
1191 static bool
1192 prime_p (uintmax_t n)
1193 {
1194   int k;
1195   bool is_prime;
1196   uintmax_t a_prim, one, ni;
1197   struct factors factors;
1198
1199   if (n <= 1)
1200     return false;
1201
1202   /* We have already casted out small primes. */
1203   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1204     return true;
1205
1206   /* Precomputation for Miller-Rabin.  */
1207   uintmax_t q = n - 1;
1208   for (k = 0; (q & 1) == 0; k++)
1209     q >>= 1;
1210
1211   uintmax_t a = 2;
1212   binv (ni, n);                 /* ni <- 1/n mod B */
1213   redcify (one, 1, n);
1214   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1215
1216   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1217   if (!millerrabin (n, ni, a_prim, q, k, one))
1218     return false;
1219
1220   if (flag_prove_primality)
1221     {
1222       /* Factor n-1 for Lucas.  */
1223       factor (0, n - 1, &factors);
1224     }
1225
1226   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1227      number composite.  */
1228   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1229     {
1230       if (flag_prove_primality)
1231         {
1232           is_prime = true;
1233           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1234             {
1235               is_prime
1236                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1237             }
1238         }
1239       else
1240         {
1241           /* After enough Miller-Rabin runs, be content. */
1242           is_prime = (r == MR_REPS - 1);
1243         }
1244
1245       if (is_prime)
1246         return true;
1247
1248       a += primes_diff[r];      /* Establish new base.  */
1249
1250       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1251          on most processors, since it avoids udiv_qrnnd.  If we go down the
1252          udiv_qrnnd_preinv path, this code should be replaced.  */
1253       {
1254         uintmax_t s1, s0;
1255         umul_ppmm (s1, s0, one, a);
1256         if (LIKELY (s1 == 0))
1257           a_prim = s0 % n;
1258         else
1259           {
1260             uintmax_t dummy _GL_UNUSED;
1261             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1262           }
1263       }
1264
1265       if (!millerrabin (n, ni, a_prim, q, k, one))
1266         return false;
1267     }
1268
1269   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1270   abort ();
1271 }
1272
1273 static bool
1274 prime2_p (uintmax_t n1, uintmax_t n0)
1275 {
1276   uintmax_t q[2], nm1[2];
1277   uintmax_t a_prim[2];
1278   uintmax_t one[2];
1279   uintmax_t na[2];
1280   uintmax_t ni;
1281   unsigned int k;
1282   struct factors factors;
1283
1284   if (n1 == 0)
1285     return prime_p (n0);
1286
1287   nm1[1] = n1 - (n0 == 0);
1288   nm1[0] = n0 - 1;
1289   if (nm1[0] == 0)
1290     {
1291       count_trailing_zeros (k, nm1[1]);
1292
1293       q[0] = nm1[1] >> k;
1294       q[1] = 0;
1295       k += W_TYPE_SIZE;
1296     }
1297   else
1298     {
1299       count_trailing_zeros (k, nm1[0]);
1300       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1301     }
1302
1303   uintmax_t a = 2;
1304   binv (ni, n0);
1305   redcify2 (one[1], one[0], 1, n1, n0);
1306   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1307
1308   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1309   na[0] = n0;
1310   na[1] = n1;
1311
1312   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1313     return false;
1314
1315   if (flag_prove_primality)
1316     {
1317       /* Factor n-1 for Lucas.  */
1318       factor (nm1[1], nm1[0], &factors);
1319     }
1320
1321   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1322      number composite.  */
1323   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1324     {
1325       bool is_prime;
1326       uintmax_t e[2], y[2];
1327
1328       if (flag_prove_primality)
1329         {
1330           is_prime = true;
1331           if (factors.plarge[1])
1332             {
1333               uintmax_t pi;
1334               binv (pi, factors.plarge[0]);
1335               e[0] = pi * nm1[0];
1336               e[1] = 0;
1337               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1338               is_prime = (y[0] != one[0] || y[1] != one[1]);
1339             }
1340           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1341             {
1342               /* FIXME: We always have the factor 2. Do we really need to
1343                  handle it here? We have done the same powering as part
1344                  of millerrabin. */
1345               if (factors.p[i] == 2)
1346                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1347               else
1348                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1349               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1350               is_prime = (y[0] != one[0] || y[1] != one[1]);
1351             }
1352         }
1353       else
1354         {
1355           /* After enough Miller-Rabin runs, be content. */
1356           is_prime = (r == MR_REPS - 1);
1357         }
1358
1359       if (is_prime)
1360         return true;
1361
1362       a += primes_diff[r];      /* Establish new base.  */
1363       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1364
1365       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1366         return false;
1367     }
1368
1369   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1370   abort ();
1371 }
1372
1373 #if HAVE_GMP
1374 static bool
1375 mp_prime_p (mpz_t n)
1376 {
1377   bool is_prime;
1378   mpz_t q, a, nm1, tmp;
1379   struct mp_factors factors;
1380
1381   if (mpz_cmp_ui (n, 1) <= 0)
1382     return false;
1383
1384   /* We have already casted out small primes. */
1385   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1386     return true;
1387
1388   mpz_inits (q, a, nm1, tmp, NULL);
1389
1390   /* Precomputation for Miller-Rabin.  */
1391   mpz_sub_ui (nm1, n, 1);
1392
1393   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1394   unsigned long int k = mpz_scan1 (nm1, 0);
1395   mpz_tdiv_q_2exp (q, nm1, k);
1396
1397   mpz_set_ui (a, 2);
1398
1399   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1400   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1401     {
1402       is_prime = false;
1403       goto ret2;
1404     }
1405
1406   if (flag_prove_primality)
1407     {
1408       /* Factor n-1 for Lucas.  */
1409       mpz_set (tmp, nm1);
1410       mp_factor (tmp, &factors);
1411     }
1412
1413   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1414      number composite.  */
1415   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1416     {
1417       if (flag_prove_primality)
1418         {
1419           is_prime = true;
1420           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1421             {
1422               mpz_divexact (tmp, nm1, factors.p[i]);
1423               mpz_powm (tmp, a, tmp, n);
1424               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1425             }
1426         }
1427       else
1428         {
1429           /* After enough Miller-Rabin runs, be content. */
1430           is_prime = (r == MR_REPS - 1);
1431         }
1432
1433       if (is_prime)
1434         goto ret1;
1435
1436       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1437
1438       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1439         {
1440           is_prime = false;
1441           goto ret1;
1442         }
1443     }
1444
1445   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1446   abort ();
1447
1448  ret1:
1449   if (flag_prove_primality)
1450     mp_factor_clear (&factors);
1451  ret2:
1452   mpz_clears (q, a, nm1, tmp, NULL);
1453
1454   return is_prime;
1455 }
1456 #endif
1457
1458 static void
1459 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1460                           struct factors *factors)
1461 {
1462   uintmax_t x, z, y, P, t, ni, g;
1463
1464   unsigned long int k = 1;
1465   unsigned long int l = 1;
1466
1467   redcify (P, 1, n);
1468   addmod (x, P, P, n);          /* i.e., redcify(2) */
1469   y = z = x;
1470
1471   while (n != 1)
1472     {
1473       assert (a < n);
1474
1475       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1476
1477       for (;;)
1478         {
1479           do
1480             {
1481               x = mulredc (x, x, n, ni);
1482               addmod (x, x, a, n);
1483
1484               submod (t, z, x, n);
1485               P = mulredc (P, t, n, ni);
1486
1487               if (k % 32 == 1)
1488                 {
1489                   if (gcd_odd (P, n) != 1)
1490                     goto factor_found;
1491                   y = x;
1492                 }
1493             }
1494           while (--k != 0);
1495
1496           z = x;
1497           k = l;
1498           l = 2 * l;
1499           for (unsigned long int i = 0; i < k; i++)
1500             {
1501               x = mulredc (x, x, n, ni);
1502               addmod (x, x, a, n);
1503             }
1504           y = x;
1505         }
1506
1507     factor_found:
1508       do
1509         {
1510           y = mulredc (y, y, n, ni);
1511           addmod (y, y, a, n);
1512
1513           submod (t, z, y, n);
1514           g = gcd_odd (t, n);
1515         }
1516       while (g == 1);
1517
1518       n = n / g;
1519
1520       if (!prime_p (g))
1521         factor_using_pollard_rho (g, a + 1, factors);
1522       else
1523         factor_insert (factors, g);
1524
1525       if (prime_p (n))
1526         {
1527           factor_insert (factors, n);
1528           break;
1529         }
1530
1531       x = x % n;
1532       z = z % n;
1533       y = y % n;
1534     }
1535 }
1536
1537 static void
1538 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1539                            struct factors *factors)
1540 {
1541   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1542
1543   unsigned long int k = 1;
1544   unsigned long int l = 1;
1545
1546   redcify2 (P1, P0, 1, n1, n0);
1547   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1548   y1 = z1 = x1;
1549   y0 = z0 = x0;
1550
1551   while (n1 != 0 || n0 != 1)
1552     {
1553       binv (ni, n0);
1554
1555       for (;;)
1556         {
1557           do
1558             {
1559               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1560               x1 = r1m;
1561               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1562
1563               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1564               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1565               P1 = r1m;
1566
1567               if (k % 32 == 1)
1568                 {
1569                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1570                   if (g1 != 0 || g0 != 1)
1571                     goto factor_found;
1572                   y1 = x1; y0 = x0;
1573                 }
1574             }
1575           while (--k != 0);
1576
1577           z1 = x1; z0 = x0;
1578           k = l;
1579           l = 2 * l;
1580           for (unsigned long int i = 0; i < k; i++)
1581             {
1582               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1583               x1 = r1m;
1584               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1585             }
1586           y1 = x1; y0 = x0;
1587         }
1588
1589     factor_found:
1590       do
1591         {
1592           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1593           y1 = r1m;
1594           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1595
1596           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1597           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1598         }
1599       while (g1 == 0 && g0 == 1);
1600
1601       if (g1 == 0)
1602         {
1603           /* The found factor is one word. */
1604           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1605
1606           if (!prime_p (g0))
1607             factor_using_pollard_rho (g0, a + 1, factors);
1608           else
1609             factor_insert (factors, g0);
1610         }
1611       else
1612         {
1613           /* The found factor is two words.  This is highly unlikely, thus hard
1614              to trigger.  Please be careful before you change this code!  */
1615           uintmax_t ginv;
1616
1617           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1618           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1619           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1620
1621           if (!prime2_p (g1, g0))
1622             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1623           else
1624             factor_insert_large (factors, g1, g0);
1625         }
1626
1627       if (n1 == 0)
1628         {
1629           if (prime_p (n0))
1630             {
1631               factor_insert (factors, n0);
1632               break;
1633             }
1634
1635           factor_using_pollard_rho (n0, a, factors);
1636           return;
1637         }
1638
1639       if (prime2_p (n1, n0))
1640         {
1641           factor_insert_large (factors, n1, n0);
1642           break;
1643         }
1644
1645       x0 = mod2 (&x1, x1, x0, n1, n0);
1646       z0 = mod2 (&z1, z1, z0, n1, n0);
1647       y0 = mod2 (&y1, y1, y0, n1, n0);
1648     }
1649 }
1650
1651 #if HAVE_GMP
1652 static void
1653 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1654                              struct mp_factors *factors)
1655 {
1656   mpz_t x, z, y, P;
1657   mpz_t t, t2;
1658
1659   devmsg ("[pollard-rho (%lu)] ", a);
1660
1661   mpz_inits (t, t2, NULL);
1662   mpz_init_set_si (y, 2);
1663   mpz_init_set_si (x, 2);
1664   mpz_init_set_si (z, 2);
1665   mpz_init_set_ui (P, 1);
1666
1667   unsigned long long int k = 1;
1668   unsigned long long int l = 1;
1669
1670   while (mpz_cmp_ui (n, 1) != 0)
1671     {
1672       for (;;)
1673         {
1674           do
1675             {
1676               mpz_mul (t, x, x);
1677               mpz_mod (x, t, n);
1678               mpz_add_ui (x, x, a);
1679
1680               mpz_sub (t, z, x);
1681               mpz_mul (t2, P, t);
1682               mpz_mod (P, t2, n);
1683
1684               if (k % 32 == 1)
1685                 {
1686                   mpz_gcd (t, P, n);
1687                   if (mpz_cmp_ui (t, 1) != 0)
1688                     goto factor_found;
1689                   mpz_set (y, x);
1690                 }
1691             }
1692           while (--k != 0);
1693
1694           mpz_set (z, x);
1695           k = l;
1696           l = 2 * l;
1697           for (unsigned long long int i = 0; i < k; i++)
1698             {
1699               mpz_mul (t, x, x);
1700               mpz_mod (x, t, n);
1701               mpz_add_ui (x, x, a);
1702             }
1703           mpz_set (y, x);
1704         }
1705
1706     factor_found:
1707       do
1708         {
1709           mpz_mul (t, y, y);
1710           mpz_mod (y, t, n);
1711           mpz_add_ui (y, y, a);
1712
1713           mpz_sub (t, z, y);
1714           mpz_gcd (t, t, n);
1715         }
1716       while (mpz_cmp_ui (t, 1) == 0);
1717
1718       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1719
1720       if (!mp_prime_p (t))
1721         {
1722           devmsg ("[composite factor--restarting pollard-rho] ");
1723           mp_factor_using_pollard_rho (t, a + 1, factors);
1724         }
1725       else
1726         {
1727           mp_factor_insert (factors, t);
1728         }
1729
1730       if (mp_prime_p (n))
1731         {
1732           mp_factor_insert (factors, n);
1733           break;
1734         }
1735
1736       mpz_mod (x, x, n);
1737       mpz_mod (z, z, n);
1738       mpz_mod (y, y, n);
1739     }
1740
1741   mpz_clears (P, t2, t, z, x, y, NULL);
1742 }
1743 #endif
1744
1745 #if USE_SQUFOF
1746 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1747    algorithm is replaced, consider also returning the remainder. */
1748 static uintmax_t _GL_ATTRIBUTE_CONST
1749 isqrt (uintmax_t n)
1750 {
1751   uintmax_t x;
1752   unsigned c;
1753   if (n == 0)
1754     return 0;
1755
1756   count_leading_zeros (c, n);
1757
1758   /* Make x > sqrt(n). This will be invariant through the loop. */
1759   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1760
1761   for (;;)
1762     {
1763       uintmax_t y = (x + n/x) / 2;
1764       if (y >= x)
1765         return x;
1766
1767       x = y;
1768     }
1769 }
1770
1771 static uintmax_t _GL_ATTRIBUTE_CONST
1772 isqrt2 (uintmax_t nh, uintmax_t nl)
1773 {
1774   unsigned int shift;
1775   uintmax_t x;
1776
1777   /* Ensures the remainder fits in an uintmax_t. */
1778   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1779
1780   if (nh == 0)
1781     return isqrt (nl);
1782
1783   count_leading_zeros (shift, nh);
1784   shift &= ~1;
1785
1786   /* Make x > sqrt(n) */
1787   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1788   x <<= (W_TYPE_SIZE - shift) / 2;
1789
1790   /* Do we need more than one iteration? */
1791   for (;;)
1792     {
1793       uintmax_t r _GL_UNUSED;
1794       uintmax_t q, y;
1795       udiv_qrnnd (q, r, nh, nl, x);
1796       y = (x + q) / 2;
1797
1798       if (y >= x)
1799         {
1800           uintmax_t hi, lo;
1801           umul_ppmm (hi, lo, x + 1, x + 1);
1802           assert (gt2 (hi, lo, nh, nl));
1803
1804           umul_ppmm (hi, lo, x, x);
1805           assert (ge2 (nh, nl, hi, lo));
1806           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1807           assert (hi == 0);
1808
1809           return x;
1810         }
1811
1812       x = y;
1813     }
1814 }
1815
1816 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1817 # define MAGIC64 0x0202021202030213ULL
1818 # define MAGIC63 0x0402483012450293ULL
1819 # define MAGIC65 0x218a019866014613ULL
1820 # define MAGIC11 0x23b
1821
1822 /* Return the square root if the input is a square, otherwise 0. */
1823 static uintmax_t _GL_ATTRIBUTE_CONST
1824 is_square (uintmax_t x)
1825 {
1826   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1827      computing the square root. */
1828   if (((MAGIC64 >> (x & 63)) & 1)
1829       && ((MAGIC63 >> (x % 63)) & 1)
1830       /* Both 0 and 64 are squares mod (65) */
1831       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1832       && ((MAGIC11 >> (x % 11) & 1)))
1833     {
1834       uintmax_t r = isqrt (x);
1835       if (r*r == x)
1836         return r;
1837     }
1838   return 0;
1839 }
1840
1841 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1842 static const unsigned short invtab[0x81] =
1843   {
1844     0x200,
1845     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1846     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1847     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1848     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1849     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1850     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1851     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1852     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1853     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1854     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1855     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1856     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1857     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1858     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1859     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1860     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1861   };
1862
1863 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1864    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1865 # define div_smallq(q, r, u, d)                                          \
1866   do {                                                                  \
1867     if ((u) / 0x40 < (d))                                               \
1868       {                                                                 \
1869         int _cnt;                                                       \
1870         uintmax_t _dinv, _mask, _q, _r;                                 \
1871         count_leading_zeros (_cnt, (d));                                \
1872         _r = (u);                                                       \
1873         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1874           {                                                             \
1875             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1876             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1877           }                                                             \
1878         else                                                            \
1879           {                                                             \
1880             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1881             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1882           }                                                             \
1883         _r -= _q*(d);                                                   \
1884                                                                         \
1885         _mask = -(uintmax_t) (_r >= (d));                               \
1886         (r) = _r - (_mask & (d));                                       \
1887         (q) = _q - _mask;                                               \
1888         assert ( (q) * (d) + (r) == u);                                 \
1889       }                                                                 \
1890     else                                                                \
1891       {                                                                 \
1892         uintmax_t _q = (u) / (d);                                       \
1893         (r) = (u) - _q * (d);                                           \
1894         (q) = _q;                                                       \
1895       }                                                                 \
1896   } while (0)
1897
1898 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1899    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1900    with 3025 = 55^2.
1901
1902    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1903    representing G_0 = (-55, 2*4652, 8653).
1904
1905    In the notation of the paper:
1906
1907    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1908
1909    Put
1910
1911      t_0 = floor([q_0 + R_0] / S0) = 1
1912      R_1 = t_0 * S_0 - R_0 = 4001
1913      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1914 */
1915
1916 /* Multipliers, in order of efficiency:
1917    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1918    0.7317  3*5*7    =  105 = 1
1919    0.7820  3*5*11   =  165 = 1
1920    0.7872  3*5      =   15 = 3
1921    0.8101  3*7*11   =  231 = 3
1922    0.8155  3*7      =   21 = 1
1923    0.8284  5*7*11   =  385 = 1
1924    0.8339  5*7      =   35 = 3
1925    0.8716  3*11     =   33 = 1
1926    0.8774  3        =    3 = 3
1927    0.8913  5*11     =   55 = 3
1928    0.8972  5        =    5 = 1
1929    0.9233  7*11     =   77 = 1
1930    0.9295  7        =    7 = 3
1931    0.9934  11       =   11 = 3
1932 */
1933 # define QUEUE_SIZE 50
1934 #endif
1935
1936 #if STAT_SQUFOF
1937 # define Q_FREQ_SIZE 50
1938 /* Element 0 keeps the total */
1939 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1940 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1941 #endif
1942
1943 #if USE_SQUFOF
1944 /* Return true on success.  Expected to fail only for numbers
1945    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1946 static bool
1947 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1948 {
1949   /* Uses algorithm and notation from
1950
1951      SQUARE FORM FACTORIZATION
1952      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1953
1954      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1955    */
1956
1957   static const unsigned int multipliers_1[] =
1958     { /* = 1 (mod 4) */
1959       105, 165, 21, 385, 33, 5, 77, 1, 0
1960     };
1961   static const unsigned int multipliers_3[] =
1962     { /* = 3 (mod 4) */
1963       1155, 15, 231, 35, 3, 55, 7, 11, 0
1964     };
1965
1966   const unsigned int *m;
1967
1968   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1969
1970   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1971     return false;
1972
1973   uintmax_t sqrt_n = isqrt2 (n1, n0);
1974
1975   if (n0 == sqrt_n * sqrt_n)
1976     {
1977       uintmax_t p1, p0;
1978
1979       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1980       assert (p0 == n0);
1981
1982       if (n1 == p1)
1983         {
1984           if (prime_p (sqrt_n))
1985             factor_insert_multiplicity (factors, sqrt_n, 2);
1986           else
1987             {
1988               struct factors f;
1989
1990               f.nfactors = 0;
1991               if (!factor_using_squfof (0, sqrt_n, &f))
1992                 {
1993                   /* Try pollard rho instead */
1994                   factor_using_pollard_rho (sqrt_n, 1, &f);
1995                 }
1996               /* Duplicate the new factors */
1997               for (unsigned int i = 0; i < f.nfactors; i++)
1998                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
1999             }
2000           return true;
2001         }
2002     }
2003
2004   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2005   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2006        *m; m++)
2007     {
2008       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2009       unsigned int i;
2010       unsigned int mu = *m;
2011       unsigned int qpos = 0;
2012
2013       assert (mu * n0 % 4 == 3);
2014
2015       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2016          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2017          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2018          mu^3 < n.
2019
2020          However, this seems insufficient: With n = 37243139 and mu =
2021          105, we get a trivial factor, from the square 38809 = 197^2,
2022          without any corresponding Q earlier in the iteration.
2023
2024          Requiring 64 mu^3 < n seems sufficient. */
2025       if (n1 == 0)
2026         {
2027           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2028             continue;
2029         }
2030       else
2031         {
2032           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2033             continue;
2034         }
2035       umul_ppmm (Dh, Dl, n0, mu);
2036       Dh += n1 * mu;
2037
2038       assert (Dl % 4 != 1);
2039       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2040
2041       S = isqrt2 (Dh, Dl);
2042
2043       Q1 = 1;
2044       P = S;
2045
2046       /* Square root remainder fits in one word, so ignore high part. */
2047       Q = Dl - P*P;
2048       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2049       L = isqrt (2*S);
2050       B = 2*L;
2051       L1 = mu * 2 * L;
2052
2053       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2054          4 D. */
2055
2056       for (i = 0; i <= B; i++)
2057         {
2058           uintmax_t q, P1, t, rem;
2059
2060           div_smallq (q, rem, S+P, Q);
2061           P1 = S - rem; /* P1 = q*Q - P */
2062
2063           IF_LINT (assert (q > 0 && Q > 0));
2064
2065 # if STAT_SQUFOF
2066           q_freq[0]++;
2067           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2068 # endif
2069
2070           if (Q <= L1)
2071             {
2072               uintmax_t g = Q;
2073
2074               if ( (Q & 1) == 0)
2075                 g /= 2;
2076
2077               g /= gcd_odd (g, mu);
2078
2079               if (g <= L)
2080                 {
2081                   if (qpos >= QUEUE_SIZE)
2082                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2083                   queue[qpos].Q = g;
2084                   queue[qpos].P = P % g;
2085                   qpos++;
2086                 }
2087             }
2088
2089           /* I think the difference can be either sign, but mod
2090              2^W_TYPE_SIZE arithmetic should be fine. */
2091           t = Q1 + q * (P - P1);
2092           Q1 = Q;
2093           Q = t;
2094           P = P1;
2095
2096           if ( (i & 1) == 0)
2097             {
2098               uintmax_t r = is_square (Q);
2099               if (r)
2100                 {
2101                   for (unsigned int j = 0; j < qpos; j++)
2102                     {
2103                       if (queue[j].Q == r)
2104                         {
2105                           if (r == 1)
2106                             /* Traversed entire cycle. */
2107                             goto next_multiplier;
2108
2109                           /* Need the absolute value for divisibility test. */
2110                           if (P >= queue[j].P)
2111                             t = P - queue[j].P;
2112                           else
2113                             t = queue[j].P - P;
2114                           if (t % r == 0)
2115                             {
2116                               /* Delete entries up to and including entry
2117                                  j, which matched. */
2118                               memmove (queue, queue + j + 1,
2119                                        (qpos - j - 1) * sizeof (queue[0]));
2120                               qpos -= (j + 1);
2121                             }
2122                           goto next_i;
2123                         }
2124                     }
2125
2126                   /* We have found a square form, which should give a
2127                      factor. */
2128                   Q1 = r;
2129                   assert (S >= P); /* What signs are possible? */
2130                   P += r * ((S - P) / r);
2131
2132                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2133                      for the case D = 2N. */
2134                   /* Compute Q = (D - P*P) / Q1, but we need double
2135                      precision. */
2136                   uintmax_t hi, lo;
2137                   umul_ppmm (hi, lo, P, P);
2138                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2139                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2140                   assert (rem == 0);
2141
2142                   for (;;)
2143                     {
2144                       /* Note: There appears to by a typo in the paper,
2145                          Step 4a in the algorithm description says q <--
2146                          floor([S+P]/\hat Q), but looking at the equations
2147                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2148                          (In this code, \hat Q is Q1). */
2149                       div_smallq (q, rem, S+P, Q);
2150                       P1 = S - rem;     /* P1 = q*Q - P */
2151
2152 # if STAT_SQUFOF
2153                       q_freq[0]++;
2154                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2155 # endif
2156                       if (P == P1)
2157                         break;
2158                       t = Q1 + q * (P - P1);
2159                       Q1 = Q;
2160                       Q = t;
2161                       P = P1;
2162                     }
2163
2164                   if ( (Q & 1) == 0)
2165                     Q /= 2;
2166                   Q /= gcd_odd (Q, mu);
2167
2168                   assert (Q > 1 && (n1 || Q < n0));
2169
2170                   if (prime_p (Q))
2171                     factor_insert (factors, Q);
2172                   else if (!factor_using_squfof (0, Q, factors))
2173                     factor_using_pollard_rho (Q, 2, factors);
2174
2175                   divexact_21 (n1, n0, n1, n0, Q);
2176
2177                   if (prime2_p (n1, n0))
2178                     factor_insert_large (factors, n1, n0);
2179                   else
2180                     {
2181                       if (!factor_using_squfof (n1, n0, factors))
2182                         {
2183                           if (n1 == 0)
2184                             factor_using_pollard_rho (n0, 1, factors);
2185                           else
2186                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2187                         }
2188                     }
2189
2190                   return true;
2191                 }
2192             }
2193         next_i:;
2194         }
2195     next_multiplier:;
2196     }
2197   return false;
2198 }
2199 #endif
2200
2201 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2202    results in FACTORS.  */
2203 static void
2204 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2205 {
2206   factors->nfactors = 0;
2207   factors->plarge[1] = 0;
2208
2209   if (t1 == 0 && t0 < 2)
2210     return;
2211
2212   t0 = factor_using_division (&t1, t1, t0, factors);
2213
2214   if (t1 == 0 && t0 < 2)
2215     return;
2216
2217   if (prime2_p (t1, t0))
2218     factor_insert_large (factors, t1, t0);
2219   else
2220     {
2221 #if USE_SQUFOF
2222       if (factor_using_squfof (t1, t0, factors))
2223         return;
2224 #endif
2225
2226       if (t1 == 0)
2227         factor_using_pollard_rho (t0, 1, factors);
2228       else
2229         factor_using_pollard_rho2 (t1, t0, 1, factors);
2230     }
2231 }
2232
2233 #if HAVE_GMP
2234 /* Use Pollard-rho to compute the prime factors of
2235    arbitrary-precision T, and put the results in FACTORS.  */
2236 static void
2237 mp_factor (mpz_t t, struct mp_factors *factors)
2238 {
2239   mp_factor_init (factors);
2240
2241   if (mpz_sgn (t) != 0)
2242     {
2243       mp_factor_using_division (t, factors);
2244
2245       if (mpz_cmp_ui (t, 1) != 0)
2246         {
2247           devmsg ("[is number prime?] ");
2248           if (mp_prime_p (t))
2249             mp_factor_insert (factors, t);
2250           else
2251             mp_factor_using_pollard_rho (t, 1, factors);
2252         }
2253     }
2254 }
2255 #endif
2256
2257 static strtol_error
2258 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2259 {
2260   unsigned int lo_carry;
2261   uintmax_t hi = 0, lo = 0;
2262
2263   strtol_error err = LONGINT_INVALID;
2264
2265   /* Skip initial spaces and '+'.  */
2266   for (;;)
2267     {
2268       char c = *s;
2269       if (c == ' ')
2270         s++;
2271       else if (c == '+')
2272         {
2273           s++;
2274           break;
2275         }
2276       else
2277         break;
2278     }
2279
2280   /* Initial scan for invalid digits.  */
2281   const char *p = s;
2282   for (;;)
2283     {
2284       unsigned int c = *p++;
2285       if (c == 0)
2286         break;
2287
2288       if (UNLIKELY (!ISDIGIT (c)))
2289         {
2290           err = LONGINT_INVALID;
2291           break;
2292         }
2293
2294       err = LONGINT_OK;           /* we've seen at least one valid digit */
2295     }
2296
2297   for (;err == LONGINT_OK;)
2298     {
2299       unsigned int c = *s++;
2300       if (c == 0)
2301         break;
2302
2303       c -= '0';
2304
2305       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2306         {
2307           err = LONGINT_OVERFLOW;
2308           break;
2309         }
2310       hi = 10 * hi;
2311
2312       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2313       lo_carry += 10 * lo < 2 * lo;
2314
2315       lo = 10 * lo;
2316       lo += c;
2317
2318       lo_carry += lo < c;
2319       hi += lo_carry;
2320       if (UNLIKELY (hi < lo_carry))
2321         {
2322           err = LONGINT_OVERFLOW;
2323           break;
2324         }
2325     }
2326
2327   *hip = hi;
2328   *lop = lo;
2329
2330   return err;
2331 }
2332
2333 /* Structure and routines for buffering and outputting full lines,
2334    to support parallel operation efficiently.  */
2335 static struct lbuf_
2336 {
2337   char *buf;
2338   char *end;
2339 } lbuf;
2340
2341 /* 512 is chosen to give good performance,
2342    and also is the max guaranteed size that
2343    consumers can read atomically through pipes.
2344    Also it's big enough to cater for max line length
2345    even with 128 bit uintmax_t.  */
2346 #define FACTOR_PIPE_BUF 512
2347
2348 static void
2349 lbuf_alloc (void)
2350 {
2351   if (lbuf.buf)
2352     return;
2353
2354   /* Double to ensure enough space for
2355      previous numbers + next number.  */
2356   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2357   lbuf.end = lbuf.buf;
2358 }
2359
2360 /* Write complete LBUF to standard output.  */
2361 static void
2362 lbuf_flush (void)
2363 {
2364   size_t size = lbuf.end - lbuf.buf;
2365   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2366     error (EXIT_FAILURE, errno, "%s", _("write error"));
2367   lbuf.end = lbuf.buf;
2368 }
2369
2370 /* Add a character C to LBUF and if it's a newline
2371    and enough bytes are already buffered,
2372    then write atomically to standard output.  */
2373 static void
2374 lbuf_putc (char c)
2375 {
2376   *lbuf.end++ = c;
2377
2378   if (c == '\n')
2379     {
2380       size_t buffered = lbuf.end - lbuf.buf;
2381
2382       if (buffered >= FACTOR_PIPE_BUF)
2383         {
2384           /* Write output in <= PIPE_BUF chunks
2385              so consumers can read atomically.  */
2386           char const *tend = lbuf.end;
2387
2388           /* Since a umaxint_t's factors must fit in 512
2389              we're guaranteed to find a newline here.  */
2390           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2391           while (*--tlend != '\n');
2392           tlend++;
2393
2394           lbuf.end = tlend;
2395           lbuf_flush ();
2396
2397           /* Buffer the remainder.  */
2398           memcpy (lbuf.buf, tlend, tend - tlend);
2399           lbuf.end = lbuf.buf + (tend - tlend);
2400         }
2401     }
2402 }
2403
2404 /* Buffer an int to the internal LBUF.  */
2405 static void
2406 lbuf_putint (uintmax_t i, size_t min_width)
2407 {
2408   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2409   char const *umaxstr = umaxtostr (i, buf);
2410   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2411   size_t z = width;
2412
2413   for (; z < min_width; z++)
2414     *lbuf.end++ = '0';
2415
2416   memcpy (lbuf.end, umaxstr, width);
2417   lbuf.end += width;
2418 }
2419
2420 static void
2421 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2422 {
2423   uintmax_t q, r;
2424
2425   if (t1 == 0)
2426     lbuf_putint (t0, 0);
2427   else
2428     {
2429       /* Use very plain code here since it seems hard to write fast code
2430          without assuming a specific word size.  */
2431       q = t1 / 1000000000;
2432       r = t1 % 1000000000;
2433       udiv_qrnnd (t0, r, r, t0, 1000000000);
2434       print_uintmaxes (q, t0);
2435       lbuf_putint (r, 9);
2436     }
2437 }
2438
2439 /* Single-precision factoring */
2440 static void
2441 print_factors_single (uintmax_t t1, uintmax_t t0)
2442 {
2443   struct factors factors;
2444
2445   print_uintmaxes (t1, t0);
2446   lbuf_putc (':');
2447
2448   factor (t1, t0, &factors);
2449
2450   for (unsigned int j = 0; j < factors.nfactors; j++)
2451     for (unsigned int k = 0; k < factors.e[j]; k++)
2452       {
2453         lbuf_putc (' ');
2454         print_uintmaxes (0, factors.p[j]);
2455       }
2456
2457   if (factors.plarge[1])
2458     {
2459       lbuf_putc (' ');
2460       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2461     }
2462
2463   lbuf_putc ('\n');
2464 }
2465
2466 /* Emit the factors of the indicated number.  If we have the option of using
2467    either algorithm, we select on the basis of the length of the number.
2468    For longer numbers, we prefer the MP algorithm even if the native algorithm
2469    has enough digits, because the algorithm is better.  The turnover point
2470    depends on the value.  */
2471 static bool
2472 print_factors (const char *input)
2473 {
2474   uintmax_t t1, t0;
2475
2476   /* Try converting the number to one or two words.  If it fails, use GMP or
2477      print an error message.  The 2nd condition checks that the most
2478      significant bit of the two-word number is clear, in a typesize neutral
2479      way.  */
2480   strtol_error err = strto2uintmax (&t1, &t0, input);
2481
2482   switch (err)
2483     {
2484     case LONGINT_OK:
2485       if (((t1 << 1) >> 1) == t1)
2486         {
2487           devmsg ("[using single-precision arithmetic] ");
2488           print_factors_single (t1, t0);
2489           return true;
2490         }
2491       break;
2492
2493     case LONGINT_OVERFLOW:
2494       /* Try GMP.  */
2495       break;
2496
2497     default:
2498       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2499       return false;
2500     }
2501
2502 #if HAVE_GMP
2503   devmsg ("[using arbitrary-precision arithmetic] ");
2504   mpz_t t;
2505   struct mp_factors factors;
2506
2507   mpz_init_set_str (t, input, 10);
2508
2509   gmp_printf ("%Zd:", t);
2510   mp_factor (t, &factors);
2511
2512   for (unsigned int j = 0; j < factors.nfactors; j++)
2513     for (unsigned int k = 0; k < factors.e[j]; k++)
2514       gmp_printf (" %Zd", factors.p[j]);
2515
2516   mp_factor_clear (&factors);
2517   mpz_clear (t);
2518   putchar ('\n');
2519   fflush (stdout);
2520   return true;
2521 #else
2522   error (0, 0, _("%s is too large"), quote (input));
2523   return false;
2524 #endif
2525 }
2526
2527 void
2528 usage (int status)
2529 {
2530   if (status != EXIT_SUCCESS)
2531     emit_try_help ();
2532   else
2533     {
2534       printf (_("\
2535 Usage: %s [NUMBER]...\n\
2536   or:  %s OPTION\n\
2537 "),
2538               program_name, program_name);
2539       fputs (_("\
2540 Print the prime factors of each specified integer NUMBER.  If none\n\
2541 are specified on the command line, read them from standard input.\n\
2542 \n\
2543 "), stdout);
2544       fputs (HELP_OPTION_DESCRIPTION, stdout);
2545       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2546       emit_ancillary_info (PROGRAM_NAME);
2547     }
2548   exit (status);
2549 }
2550
2551 static bool
2552 do_stdin (void)
2553 {
2554   bool ok = true;
2555   token_buffer tokenbuffer;
2556
2557   init_tokenbuffer (&tokenbuffer);
2558
2559   while (true)
2560     {
2561       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2562                                        &tokenbuffer);
2563       if (token_length == (size_t) -1)
2564         break;
2565       ok &= print_factors (tokenbuffer.buffer);
2566     }
2567   free (tokenbuffer.buffer);
2568
2569   return ok;
2570 }
2571
2572 int
2573 main (int argc, char **argv)
2574 {
2575   initialize_main (&argc, &argv);
2576   set_program_name (argv[0]);
2577   setlocale (LC_ALL, "");
2578   bindtextdomain (PACKAGE, LOCALEDIR);
2579   textdomain (PACKAGE);
2580
2581   lbuf_alloc ();
2582   atexit (close_stdout);
2583   atexit (lbuf_flush);
2584
2585   int c;
2586   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2587     {
2588       switch (c)
2589         {
2590         case DEV_DEBUG_OPTION:
2591           dev_debug = true;
2592           break;
2593
2594         case_GETOPT_HELP_CHAR;
2595
2596         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2597
2598         default:
2599           usage (EXIT_FAILURE);
2600         }
2601     }
2602
2603 #if STAT_SQUFOF
2604   memset (q_freq, 0, sizeof (q_freq));
2605 #endif
2606
2607   bool ok;
2608   if (argc <= optind)
2609     ok = do_stdin ();
2610   else
2611     {
2612       ok = true;
2613       for (int i = optind; i < argc; i++)
2614         if (! print_factors (argv[i]))
2615           ok = false;
2616     }
2617
2618 #if STAT_SQUFOF
2619   if (q_freq[0] > 0)
2620     {
2621       double acc_f;
2622       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2623       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2624         {
2625           double f = (double) q_freq[i] / q_freq[0];
2626           acc_f += f;
2627           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2628                   100.0 * f, 100.0 * acc_f);
2629         }
2630     }
2631 #endif
2632
2633   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2634 }