src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38     Using GMP is optional.  Define HAVE_GMP to make this code include GMP
  39     factoring code.  The GMP factoring code is based on GMP's demos/factorize.c
  40     (last synched 2012-09-07).  The GMP-based factoring code will stay in GMP
  41     factoring code even if numbers get small enough for using the two-word
  42     code.
  43
  44   Algorithm:
  45
  46     (1) Perform trial division using a small primes table, but without hardware
  47         division since the primes table store inverses modulo the word base.
  48         (The GMP variant of this code doesn't make use of the precomputed
  49         inverses, but instead relies on GMP for fast divisibility testing.)
  50     (2) Check the nature of any non-factored part using Miller-Rabin for
  51         detecting composites, and Lucas for detecting primes.
  52     (3) Factor any remaining composite part using the Pollard-Brent rho
  53         algorithm or the SQUFOF algorithm, checking status of found factors
  54         again using Miller-Rabin and Lucas.
  55
  56     We prefer using Hensel norm in the divisions, not the more familiar
  57     Euclidian norm, since the former leads to much faster code.  In the
  58     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  59     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  60     reductions mod n.
  61
  62   Improvements:
  63
  64     * Use modular inverses also for exact division in the Lucas code, and
  65       elsewhere.  A problem is to locate the inverses not from an index, but
  66       from a prime.  We might instead compute the inverse on-the-fly.
  67
  68     * Tune trial division table size (not forgetting that this is a standalone
  69       program where the table will be read from disk for each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 #include <config.h>
  88 #include <getopt.h>
  89 #include <stdio.h>
  90 #if HAVE_GMP
  91 # include <gmp.h>
  92 # if !HAVE_DECL_MPZ_INITS
  93 #  include <stdarg.h>
  94 # endif
  95 #endif
  96
  97 #include <assert.h>
  98
  99 #include "system.h"
 100 #include "error.h"
 101 #include "quote.h"
 102 #include "readtokens.h"
 103 #include "xstrtol.h"
 104
 105 /* The official name of this program (e.g., no 'g' prefix).  */
 106 #define PROGRAM_NAME "factor"
 107
 108 #define AUTHORS \
 109   proper_name ("Paul Rubin"),                                           \
 110   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 111   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 112
 113 /* Token delimiters when reading from a file.  */
 114 #define DELIM "\n\t "
 115
 116 #ifndef STAT_SQUFOF
 117 # define STAT_SQUFOF 0
 118 #endif
 119
 120 #ifndef USE_LONGLONG_H
 121 /* With the way we use longlong.h, it's only safe to use
 122    when UWtype = UHWtype, as there were various cases
 123    (as can be seen in the history for longlong.h) where
 124    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 125    to avoid compile time or run time issues.  */
 126 # if LONG_MAX == INTMAX_MAX
 127 #  define USE_LONGLONG_H 1
 128 # endif
 129 #endif
 130
 131 #if USE_LONGLONG_H
 132
 133 /* Make definitions for longlong.h to make it do what it can do for us */
 134
 135 /* bitcount for uintmax_t */
 136 # if UINTMAX_MAX == UINT32_MAX
 137 #  define W_TYPE_SIZE 32
 138 # elif UINTMAX_MAX == UINT64_MAX
 139 #  define W_TYPE_SIZE 64
 140 # elif UINTMAX_MAX == UINT128_MAX
 141 #  define W_TYPE_SIZE 128
 142 # endif
 143
 144 # define UWtype  uintmax_t
 145 # define UHWtype unsigned long int
 146 # undef UDWtype
 147 # if HAVE_ATTRIBUTE_MODE
 148 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 149 typedef          int SItype     __attribute__ ((mode (SI)));
 150 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 151 typedef          int DItype     __attribute__ ((mode (DI)));
 152 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 153 # else
 154 typedef unsigned char UQItype;
 155 typedef          long SItype;
 156 typedef unsigned long int USItype;
 157 #  if HAVE_LONG_LONG_INT
 158 typedef long long int DItype;
 159 typedef unsigned long long int UDItype;
 160 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 161 typedef long int DItype;
 162 typedef unsigned long int UDItype;
 163 #  endif
 164 # endif
 165 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 166 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 167 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 168 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 169 # ifndef __GMP_GNUC_PREREQ
 170 #  define __GMP_GNUC_PREREQ(a,b) 1
 171 # endif
 172
 173 /* These stub macros are only used in longlong.h in certain system compiler
 174    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 175 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 176 ASSERT (1)
 177 __GMP_DECLSPEC
 178 # endif
 179
 180 # if _ARCH_PPC
 181 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 182 # endif
 183 # include "longlong.h"
 184 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 185 const unsigned char factor_clz_tab[129] =
 186 {
 187   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 188   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 189   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 190   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 191   9
 192 };
 193 # endif
 194
 195 #else /* not USE_LONGLONG_H */
 196
 197 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 198 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 199 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 200 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 201
 202 #endif
 203
 204 #if !defined __clz_tab && !defined UHWtype
 205 /* Without this seemingly useless conditional, gcc -Wunused-macros
 206    warns that each of the two tested macros is unused on Fedora 18.
 207    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 208 #endif
 209
 210 enum alg_type { ALG_POLLARD_RHO = 1, ALG_SQUFOF = 2 };
 211
 212 static enum alg_type alg;
 213
 214 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 215 #define MAX_NFACTS 26
 216
 217 enum
 218 {
 219   DEV_DEBUG_OPTION = CHAR_MAX + 1
 220 };
 221
 222 static struct option const long_options[] =
 223 {
 224   {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
 225   {GETOPT_HELP_OPTION_DECL},
 226   {GETOPT_VERSION_OPTION_DECL},
 227   {NULL, 0, NULL, 0}
 228 };
 229
 230 struct factors
 231 {
 232   uintmax_t     plarge[2]; /* Can have a single large factor */
 233   uintmax_t     p[MAX_NFACTS];
 234   unsigned char e[MAX_NFACTS];
 235   unsigned char nfactors;
 236 };
 237
 238 #if HAVE_GMP
 239 struct mp_factors
 240 {
 241   mpz_t             *p;
 242   unsigned long int *e;
 243   unsigned long int nfactors;
 244 };
 245 #endif
 246
 247 static void factor (uintmax_t, uintmax_t, struct factors *);
 248
 249 #ifndef umul_ppmm
 250 # define umul_ppmm(w1, w0, u, v)                                        \
 251   do {                                                                  \
 252     uintmax_t __x0, __x1, __x2, __x3;                                   \
 253     unsigned long int __ul, __vl, __uh, __vh;                           \
 254     uintmax_t __u = (u), __v = (v);                                     \
 255                                                                         \
 256     __ul = __ll_lowpart (__u);                                          \
 257     __uh = __ll_highpart (__u);                                         \
 258     __vl = __ll_lowpart (__v);                                          \
 259     __vh = __ll_highpart (__v);                                         \
 260                                                                         \
 261     __x0 = (uintmax_t) __ul * __vl;                                     \
 262     __x1 = (uintmax_t) __ul * __vh;                                     \
 263     __x2 = (uintmax_t) __uh * __vl;                                     \
 264     __x3 = (uintmax_t) __uh * __vh;                                     \
 265                                                                         \
 266     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
 267     __x1 += __x2;               /* but this indeed can */               \
 268     if (__x1 < __x2)            /* did we get it? */                    \
 269       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
 270                                                                         \
 271     (w1) = __x3 + __ll_highpart (__x1);                                 \
 272     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 273   } while (0)
 274 #endif
 275
 276 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 277 /* Define our own, not needing normalization. This function is
 278    currently not performance critical, so keep it simple. Similar to
 279    the mod macro below. */
 280 # undef udiv_qrnnd
 281 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 282   do {                                                                  \
 283     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 284                                                                         \
 285     assert ((n1) < (d));                                                \
 286     __d1 = (d); __d0 = 0;                                               \
 287     __r1 = (n1); __r0 = (n0);                                           \
 288     __q = 0;                                                            \
 289     for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--)                \
 290       {                                                                 \
 291         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 292         __q <<= 1;                                                      \
 293         if (ge2 (__r1, __r0, __d1, __d0))                               \
 294           {                                                             \
 295             __q++;                                                      \
 296             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 297           }                                                             \
 298       }                                                                 \
 299     (r) = __r0;                                                         \
 300     (q) = __q;                                                          \
 301   } while (0)
 302 #endif
 303
 304 #if !defined add_ssaaaa
 305 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 306   do {                                                                  \
 307     uintmax_t _add_x;                                                   \
 308     _add_x = (al) + (bl);                                               \
 309     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 310     (sl) = _add_x;                                                      \
 311   } while (0)
 312 #endif
 313
 314 #define rsh2(rh, rl, ah, al, cnt)                                       \
 315   do {                                                                  \
 316     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 317     (rh) = (ah) >> (cnt);                                               \
 318   } while (0)
 319
 320 #define lsh2(rh, rl, ah, al, cnt)                                       \
 321   do {                                                                  \
 322     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 323     (rl) = (al) << (cnt);                                               \
 324   } while (0)
 325
 326 #define ge2(ah, al, bh, bl)                                             \
 327   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 328
 329 #define gt2(ah, al, bh, bl)                                             \
 330   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 331
 332 #ifndef sub_ddmmss
 333 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 334   do {                                                                  \
 335     uintmax_t _cy;                                                      \
 336     _cy = (al) < (bl);                                                  \
 337     (rl) = (al) - (bl);                                                 \
 338     (rh) = (ah) - (bh) - _cy;                                           \
 339   } while (0)
 340 #endif
 341
 342 #ifndef count_leading_zeros
 343 # define count_leading_zeros(count, x) do {                             \
 344     uintmax_t __clz_x = (x);                                            \
 345     unsigned int __clz_c;                                               \
 346     for (__clz_c = 0;                                                   \
 347          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 348          __clz_c += 8)                                                  \
 349       __clz_x <<= 8;                                                    \
 350     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 351       __clz_x <<= 1;                                                    \
 352     (count) = __clz_c;                                                  \
 353   } while (0)
 354 #endif
 355
 356 #ifndef count_trailing_zeros
 357 # define count_trailing_zeros(count, x) do {                            \
 358     uintmax_t __ctz_x = (x);                                            \
 359     unsigned int __ctz_c = 0;                                           \
 360     while ((__ctz_x & 1) == 0)                                          \
 361       {                                                                 \
 362         __ctz_x >>= 1;                                                  \
 363         __ctz_c++;                                                      \
 364       }                                                                 \
 365     (count) = __ctz_c;                                                  \
 366   } while (0)
 367 #endif
 368
 369 /* Requires that a < n and b <= n */
 370 #define submod(r,a,b,n)                                                 \
 371   do {                                                                  \
 372     uintmax_t _t = - (uintmax_t) (a < b);                               \
 373     (r) = ((n) & _t) + (a) - (b);                                       \
 374   } while (0)
 375
 376 #define addmod(r,a,b,n)                                                 \
 377   submod ((r), (a), ((n) - (b)), (n))
 378
 379 /* Modular two-word addition and subtraction.  For performance reasons, the
 380    most significant bit of n1 must be clear.  The destination variables must be
 381    distinct from the mod operand.  */
 382 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 383   do {                                                                  \
 384     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 385     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 386       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 387   } while (0)
 388 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 389   do {                                                                  \
 390     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 391     if ((intmax_t) (r1) < 0)                                            \
 392       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 393   } while (0)
 394
 395 #define HIGHBIT_TO_MASK(x)                                              \
 396   (((intmax_t)-1 >> 1) < 0                                              \
 397    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 398    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 399       ? UINTMAX_MAX : (uintmax_t) 0))
 400
 401 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 402    Requires that d1 != 0.  */
 403 static uintmax_t
 404 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 405 {
 406   int cntd, cnta;
 407
 408   assert (d1 != 0);
 409
 410   if (a1 == 0)
 411     {
 412       *r1 = 0;
 413       return a0;
 414     }
 415
 416   count_leading_zeros (cntd, d1);
 417   count_leading_zeros (cnta, a1);
 418   int cnt = cntd - cnta;
 419   lsh2 (d1, d0, d1, d0, cnt);
 420   for (int i = 0; i < cnt; i++)
 421     {
 422       if (ge2 (a1, a0, d1, d0))
 423         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 424       rsh2 (d1, d0, d1, d0, 1);
 425     }
 426
 427   *r1 = a1;
 428   return a0;
 429 }
 430
 431 static uintmax_t _GL_ATTRIBUTE_CONST
 432 gcd_odd (uintmax_t a, uintmax_t b)
 433 {
 434   if ( (b & 1) == 0)
 435     {
 436       uintmax_t t = b;
 437       b = a;
 438       a = t;
 439     }
 440   if (a == 0)
 441     return b;
 442
 443   /* Take out least significant one bit, to make room for sign */
 444   b >>= 1;
 445
 446   for (;;)
 447     {
 448       uintmax_t t;
 449       uintmax_t bgta;
 450
 451       while ((a & 1) == 0)
 452         a >>= 1;
 453       a >>= 1;
 454
 455       t = a - b;
 456       if (t == 0)
 457         return (a << 1) + 1;
 458
 459       bgta = HIGHBIT_TO_MASK (t);
 460
 461       /* b <-- min (a, b) */
 462       b += (bgta & t);
 463
 464       /* a <-- |a - b| */
 465       a = (t ^ bgta) - bgta;
 466     }
 467 }
 468
 469 static uintmax_t
 470 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 471 {
 472   while ((a0 & 1) == 0)
 473     rsh2 (a1, a0, a1, a0, 1);
 474   while ((b0 & 1) == 0)
 475     rsh2 (b1, b0, b1, b0, 1);
 476
 477   for (;;)
 478     {
 479       if ((b1 | a1) == 0)
 480         {
 481           *r1 = 0;
 482           return gcd_odd (b0, a0);
 483         }
 484
 485       if (gt2 (a1, a0, b1, b0))
 486         {
 487           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 488           do
 489             rsh2 (a1, a0, a1, a0, 1);
 490           while ((a0 & 1) == 0);
 491         }
 492       else if (gt2 (b1, b0, a1, a0))
 493         {
 494           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 495           do
 496             rsh2 (b1, b0, b1, b0, 1);
 497           while ((b0 & 1) == 0);
 498         }
 499       else
 500         break;
 501     }
 502
 503   *r1 = a1;
 504   return a0;
 505 }
 506
 507 static void
 508 factor_insert_multiplicity (struct factors *factors,
 509                             uintmax_t prime, unsigned int m)
 510 {
 511   unsigned int nfactors = factors->nfactors;
 512   uintmax_t *p = factors->p;
 513   unsigned char *e = factors->e;
 514
 515   /* Locate position for insert new or increment e.  */
 516   int i;
 517   for (i = nfactors - 1; i >= 0; i--)
 518     {
 519       if (p[i] <= prime)
 520         break;
 521     }
 522
 523   if (i < 0 || p[i] != prime)
 524     {
 525       for (int j = nfactors - 1; j > i; j--)
 526         {
 527           p[j + 1] = p[j];
 528           e[j + 1] = e[j];
 529         }
 530       p[i + 1] = prime;
 531       e[i + 1] = m;
 532       factors->nfactors = nfactors + 1;
 533     }
 534   else
 535     {
 536       e[i] += m;
 537     }
 538 }
 539
 540 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 541
 542 static void
 543 factor_insert_large (struct factors *factors,
 544                      uintmax_t p1, uintmax_t p0)
 545 {
 546   if (p1 > 0)
 547     {
 548       assert (factors->plarge[1] == 0);
 549       factors->plarge[0] = p0;
 550       factors->plarge[1] = p1;
 551     }
 552   else
 553     factor_insert (factors, p0);
 554 }
 555
 556 #if HAVE_GMP
 557
 558 # if !HAVE_DECL_MPZ_INITS
 559
 560 #  define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 561 #  define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 562
 563 static void
 564 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 565 {
 566   va_list ap;
 567
 568   va_start (ap, mpz_single_init);
 569
 570   mpz_t *mpz;
 571   while ((mpz = va_arg (ap, mpz_t *)))
 572     mpz_single_init (*mpz);
 573
 574   va_end (ap);
 575 }
 576 # endif
 577
 578 static void mp_factor (mpz_t, struct mp_factors *);
 579
 580 static void
 581 mp_factor_init (struct mp_factors *factors)
 582 {
 583   factors->p = NULL;
 584   factors->e = NULL;
 585   factors->nfactors = 0;
 586 }
 587
 588 static void
 589 mp_factor_clear (struct mp_factors *factors)
 590 {
 591   for (unsigned int i = 0; i < factors->nfactors; i++)
 592     mpz_clear (factors->p[i]);
 593
 594   free (factors->p);
 595   free (factors->e);
 596 }
 597
 598 static void
 599 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 600 {
 601   unsigned long int nfactors = factors->nfactors;
 602   mpz_t         *p  = factors->p;
 603   unsigned long int *e  = factors->e;
 604   long i;
 605
 606   /* Locate position for insert new or increment e.  */
 607   for (i = nfactors - 1; i >= 0; i--)
 608     {
 609       if (mpz_cmp (p[i], prime) <= 0)
 610         break;
 611     }
 612
 613   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 614     {
 615       p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
 616       e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
 617
 618       mpz_init (p[nfactors]);
 619       for (long j = nfactors - 1; j > i; j--)
 620         {
 621           mpz_set (p[j + 1], p[j]);
 622           e[j + 1] = e[j];
 623         }
 624       mpz_set (p[i + 1], prime);
 625       e[i + 1] = 1;
 626
 627       factors->p = p;
 628       factors->e = e;
 629       factors->nfactors = nfactors + 1;
 630     }
 631   else
 632     {
 633       e[i] += 1;
 634     }
 635 }
 636
 637 static void
 638 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 639 {
 640   mpz_t pz;
 641
 642   mpz_init_set_ui (pz, prime);
 643   mp_factor_insert (factors, pz);
 644   mpz_clear (pz);
 645 }
 646 #endif /* HAVE_GMP */
 647
 648
 649 /* Number of bits in an uintmax_t.  */
 650 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 651
 652 /* Verify that uintmax_t does not have holes in its representation.  */
 653 verify (UINTMAX_MAX >> (W - 1) == 1);
 654
 655 #define P(a,b,c,d) a,
 656 static const unsigned char primes_diff[] = {
 657 #include "primes.h"
 658 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 659 };
 660 #undef P
 661
 662 #define PRIMES_PTAB_ENTRIES \
 663   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 664
 665 #define P(a,b,c,d) b,
 666 static const unsigned char primes_diff8[] = {
 667 #include "primes.h"
 668 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 669 };
 670 #undef P
 671
 672 struct primes_dtab
 673 {
 674   uintmax_t binv, lim;
 675 };
 676
 677 #define P(a,b,c,d) {c,d},
 678 static const struct primes_dtab primes_dtab[] = {
 679 #include "primes.h"
 680 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 681 };
 682 #undef P
 683
 684 /* Verify that uintmax_t is not wider than
 685    the integers used to generate primes.h.  */
 686 verify (W <= WIDE_UINT_BITS);
 687
 688 /* debugging for developers.  Enables devmsg().
 689    This flag is used only in the GMP code.  */
 690 static bool dev_debug = false;
 691
 692 /* Prove primality or run probabilistic tests.  */
 693 static bool flag_prove_primality = true;
 694
 695 /* Number of Miller-Rabin tests to run when not proving primality. */
 696 #define MR_REPS 25
 697
 698 #ifdef __GNUC__
 699 # define LIKELY(cond)    __builtin_expect ((cond), 1)
 700 # define UNLIKELY(cond)  __builtin_expect ((cond), 0)
 701 #else
 702 # define LIKELY(cond)    (cond)
 703 # define UNLIKELY(cond)  (cond)
 704 #endif
 705
 706 static void
 707 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
 708                       unsigned int off)
 709 {
 710   for (unsigned int j = 0; j < off; j++)
 711     p += primes_diff[i + j];
 712   factor_insert (factors, p);
 713 }
 714
 715 /* Trial division with odd primes uses the following trick.
 716
 717    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
 718    consider the case t < B (this is the second loop below).
 719
 720    From our tables we get
 721
 722      binv = p^{-1} (mod B)
 723      lim = floor ( (B-1) / p ).
 724
 725    First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
 726    (and all quotients in this range occur for some t).
 727
 728    Then t = q * p is true also (mod B), and p is invertible we get
 729
 730      q = t * binv (mod B).
 731
 732    Next, assume that t is *not* divisible by p. Since multiplication
 733    by binv (mod B) is a one-to-one mapping,
 734
 735      t * binv (mod B) > lim,
 736
 737    because all the smaller values are already taken.
 738
 739    This can be summed up by saying that the function
 740
 741      q(t) = binv * t (mod B)
 742
 743    is a permutation of the range 0 <= t < B, with the curious property
 744    that it maps the multiples of p onto the range 0 <= q <= lim, in
 745    order, and the non-multiples of p onto the range lim < q < B.
 746  */
 747
 748 static uintmax_t
 749 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 750                        struct factors *factors)
 751 {
 752   if (t0 % 2 == 0)
 753     {
 754       unsigned int cnt;
 755
 756       if (t0 == 0)
 757         {
 758           count_trailing_zeros (cnt, t1);
 759           t0 = t1 >> cnt;
 760           t1 = 0;
 761           cnt += W_TYPE_SIZE;
 762         }
 763       else
 764         {
 765           count_trailing_zeros (cnt, t0);
 766           rsh2 (t1, t0, t1, t0, cnt);
 767         }
 768
 769       factor_insert_multiplicity (factors, 2, cnt);
 770     }
 771
 772   uintmax_t p = 3;
 773   unsigned int i;
 774   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 775     {
 776       for (;;)
 777         {
 778           uintmax_t q1, q0, hi, lo _GL_UNUSED;
 779
 780           q0 = t0 * primes_dtab[i].binv;
 781           umul_ppmm (hi, lo, q0, p);
 782           if (hi > t1)
 783             break;
 784           hi = t1 - hi;
 785           q1 = hi * primes_dtab[i].binv;
 786           if (LIKELY (q1 > primes_dtab[i].lim))
 787             break;
 788           t1 = q1; t0 = q0;
 789           factor_insert (factors, p);
 790         }
 791       p += primes_diff[i + 1];
 792     }
 793   if (t1p)
 794     *t1p = t1;
 795
 796 #define DIVBLOCK(I)                                                     \
 797   do {                                                                  \
 798     for (;;)                                                            \
 799       {                                                                 \
 800         q = t0 * pd[I].binv;                                            \
 801         if (LIKELY (q > pd[I].lim))                                     \
 802           break;                                                        \
 803         t0 = q;                                                         \
 804         factor_insert_refind (factors, p, i + 1, I);                    \
 805       }                                                                 \
 806   } while (0)
 807
 808   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 809     {
 810       uintmax_t q;
 811       const struct primes_dtab *pd = &primes_dtab[i];
 812       DIVBLOCK (0);
 813       DIVBLOCK (1);
 814       DIVBLOCK (2);
 815       DIVBLOCK (3);
 816       DIVBLOCK (4);
 817       DIVBLOCK (5);
 818       DIVBLOCK (6);
 819       DIVBLOCK (7);
 820
 821       p += primes_diff8[i];
 822       if (p * p > t0)
 823         break;
 824     }
 825
 826   return t0;
 827 }
 828
 829 #if HAVE_GMP
 830 static void
 831 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 832 {
 833   mpz_t q;
 834   unsigned long int p;
 835
 836   devmsg ("[trial division] ");
 837
 838   mpz_init (q);
 839
 840   p = mpz_scan1 (t, 0);
 841   mpz_div_2exp (t, t, p);
 842   while (p)
 843     {
 844       mp_factor_insert_ui (factors, 2);
 845       --p;
 846     }
 847
 848   p = 3;
 849   for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
 850     {
 851       if (! mpz_divisible_ui_p (t, p))
 852         {
 853           p += primes_diff[i++];
 854           if (mpz_cmp_ui (t, p * p) < 0)
 855             break;
 856         }
 857       else
 858         {
 859           mpz_tdiv_q_ui (t, t, p);
 860           mp_factor_insert_ui (factors, p);
 861         }
 862     }
 863
 864   mpz_clear (q);
 865 }
 866 #endif
 867
 868 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 869 static const unsigned char  binvert_table[128] =
 870 {
 871   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 872   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 873   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 874   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 875   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 876   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 877   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 878   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 879   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 880   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 881   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 882   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 883   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 884   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 885   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 886   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 887 };
 888
 889 /* Compute n^(-1) mod B, using a Newton iteration.  */
 890 #define binv(inv,n)                                                     \
 891   do {                                                                  \
 892     uintmax_t  __n = (n);                                               \
 893     uintmax_t  __inv;                                                   \
 894                                                                         \
 895     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 896     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 897     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 898     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 899                                                                         \
 900     if (W_TYPE_SIZE > 64)                                               \
 901       {                                                                 \
 902         int  __invbits = 64;                                            \
 903         do {                                                            \
 904           __inv = 2 * __inv - __inv * __inv * __n;                      \
 905           __invbits *= 2;                                               \
 906         } while (__invbits < W_TYPE_SIZE);                              \
 907       }                                                                 \
 908                                                                         \
 909     (inv) = __inv;                                                      \
 910   } while (0)
 911
 912 /* q = u / d, assuming d|u.  */
 913 #define divexact_21(q1, q0, u1, u0, d)                                  \
 914   do {                                                                  \
 915     uintmax_t _di, _q0;                                                 \
 916     binv (_di, (d));                                                    \
 917     _q0 = (u0) * _di;                                                   \
 918     if ((u1) >= (d))                                                    \
 919       {                                                                 \
 920         uintmax_t _p1, _p0 _GL_UNUSED;                            \
 921         umul_ppmm (_p1, _p0, _q0, d);                                   \
 922         (q1) = ((u1) - _p1) * _di;                                      \
 923         (q0) = _q0;                                                     \
 924       }                                                                 \
 925     else                                                                \
 926       {                                                                 \
 927         (q0) = _q0;                                                     \
 928         (q1) = 0;                                                       \
 929       }                                                                 \
 930   } while (0)
 931
 932 /* x B (mod n). */
 933 #define redcify(r_prim, r, n)                                           \
 934   do {                                                                  \
 935     uintmax_t _redcify_q _GL_UNUSED;                              \
 936     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 937   } while (0)
 938
 939 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
 940 #define redcify2(r1, r0, x, n1, n0)                                     \
 941   do {                                                                  \
 942     uintmax_t _r1, _r0, _i;                                             \
 943     if ((x) < (n1))                                                     \
 944       {                                                                 \
 945         _r1 = (x); _r0 = 0;                                             \
 946         _i = W_TYPE_SIZE;                                               \
 947       }                                                                 \
 948     else                                                                \
 949       {                                                                 \
 950         _r1 = 0; _r0 = (x);                                             \
 951         _i = 2*W_TYPE_SIZE;                                             \
 952       }                                                                 \
 953     while (_i-- > 0)                                                    \
 954       {                                                                 \
 955         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 956         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 957           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 958       }                                                                 \
 959     (r1) = _r1;                                                         \
 960     (r0) = _r0;                                                         \
 961   } while (0)
 962
 963 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 964    Both a and b must be in redc form, the result will be in redc form too. */
 965 static inline uintmax_t
 966 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 967 {
 968   uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
 969
 970   umul_ppmm (rh, rl, a, b);
 971   q = rl * mi;
 972   umul_ppmm (th, tl, q, m);
 973   xh = rh - th;
 974   if (rh < th)
 975     xh += m;
 976
 977   return xh;
 978 }
 979
 980 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 981    Both a and b must be in redc form, the result will be in redc form too.
 982    For performance reasons, the most significant bit of m must be clear. */
 983 static uintmax_t
 984 mulredc2 (uintmax_t *r1p,
 985           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 986           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 987 {
 988   uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
 989   mi = -mi;
 990   assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
 991   assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
 992   assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
 993
 994   /* First compute a0 * <b1, b0> B^{-1}
 995         +-----+
 996         |a0 b0|
 997      +--+--+--+
 998      |a0 b1|
 999      +--+--+--+
1000         |q0 m0|
1001      +--+--+--+
1002      |q0 m1|
1003     -+--+--+--+
1004      |r1|r0| 0|
1005      +--+--+--+
1006   */
1007   umul_ppmm (t1, t0, a0, b0);
1008   umul_ppmm (r1, r0, a0, b1);
1009   q = mi * t0;
1010   umul_ppmm (p1, p0, q, m0);
1011   umul_ppmm (s1, s0, q, m1);
1012   r0 += (t0 != 0); /* Carry */
1013   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1014   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1015   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1016
1017   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1018         +-----+
1019         |a1 b0|
1020         +--+--+
1021         |r1|r0|
1022      +--+--+--+
1023      |a1 b1|
1024      +--+--+--+
1025         |q1 m0|
1026      +--+--+--+
1027      |q1 m1|
1028     -+--+--+--+
1029      |r1|r0| 0|
1030      +--+--+--+
1031   */
1032   umul_ppmm (t1, t0, a1, b0);
1033   umul_ppmm (s1, s0, a1, b1);
1034   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1035   q = mi * t0;
1036   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1037   umul_ppmm (p1, p0, q, m0);
1038   umul_ppmm (s1, s0, q, m1);
1039   r0 += (t0 != 0); /* Carry */
1040   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1041   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1042   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1043
1044   if (ge2 (r1, r0, m1, m0))
1045     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1046
1047   *r1p = r1;
1048   return r0;
1049 }
1050
1051 static uintmax_t _GL_ATTRIBUTE_CONST
1052 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1053 {
1054   uintmax_t y = one;
1055
1056   if (e & 1)
1057     y = b;
1058
1059   while (e != 0)
1060     {
1061       b = mulredc (b, b, n, ni);
1062       e >>= 1;
1063
1064       if (e & 1)
1065         y = mulredc (y, b, n, ni);
1066     }
1067
1068   return y;
1069 }
1070
1071 static uintmax_t
1072 powm2 (uintmax_t *r1m,
1073        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1074        uintmax_t ni, const uintmax_t *one)
1075 {
1076   uintmax_t r1, r0, b1, b0, n1, n0;
1077   unsigned int i;
1078   uintmax_t e;
1079
1080   b0 = bp[0];
1081   b1 = bp[1];
1082   n0 = np[0];
1083   n1 = np[1];
1084
1085   r0 = one[0];
1086   r1 = one[1];
1087
1088   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1089     {
1090       if (e & 1)
1091         {
1092           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1093           r1 = *r1m;
1094         }
1095       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1096       b1 = *r1m;
1097     }
1098   for (e = ep[1]; e > 0; e >>= 1)
1099     {
1100       if (e & 1)
1101         {
1102           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1103           r1 = *r1m;
1104         }
1105       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1106       b1 = *r1m;
1107     }
1108   *r1m = r1;
1109   return r0;
1110 }
1111
1112 static bool _GL_ATTRIBUTE_CONST
1113 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1114              unsigned int k, uintmax_t one)
1115 {
1116   uintmax_t y = powm (b, q, n, ni, one);
1117
1118   uintmax_t nm1 = n - one;      /* -1, but in redc representation. */
1119
1120   if (y == one || y == nm1)
1121     return true;
1122
1123   for (unsigned int i = 1; i < k; i++)
1124     {
1125       y = mulredc (y, y, n, ni);
1126
1127       if (y == nm1)
1128         return true;
1129       if (y == one)
1130         return false;
1131     }
1132   return false;
1133 }
1134
1135 static bool
1136 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1137               const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1138 {
1139   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1140
1141   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1142   y1 = r1m;
1143
1144   if (y0 == one[0] && y1 == one[1])
1145     return true;
1146
1147   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1148
1149   if (y0 == nm1_0 && y1 == nm1_1)
1150     return true;
1151
1152   for (unsigned int i = 1; i < k; i++)
1153     {
1154       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1155       y1 = r1m;
1156
1157       if (y0 == nm1_0 && y1 == nm1_1)
1158         return true;
1159       if (y0 == one[0] && y1 == one[1])
1160         return false;
1161     }
1162   return false;
1163 }
1164
1165 #if HAVE_GMP
1166 static bool
1167 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1168                 mpz_srcptr q, unsigned long int k)
1169 {
1170   mpz_powm (y, x, q, n);
1171
1172   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1173     return true;
1174
1175   for (unsigned long int i = 1; i < k; i++)
1176     {
1177       mpz_powm_ui (y, y, 2, n);
1178       if (mpz_cmp (y, nm1) == 0)
1179         return true;
1180       if (mpz_cmp_ui (y, 1) == 0)
1181         return false;
1182     }
1183   return false;
1184 }
1185 #endif
1186
1187 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1188    have been observed.  The average seem to be about 2.  */
1189 static bool
1190 prime_p (uintmax_t n)
1191 {
1192   int k;
1193   bool is_prime;
1194   uintmax_t a_prim, one, ni;
1195   struct factors factors;
1196
1197   if (n <= 1)
1198     return false;
1199
1200   /* We have already casted out small primes. */
1201   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1202     return true;
1203
1204   /* Precomputation for Miller-Rabin.  */
1205   uintmax_t q = n - 1;
1206   for (k = 0; (q & 1) == 0; k++)
1207     q >>= 1;
1208
1209   uintmax_t a = 2;
1210   binv (ni, n);                 /* ni <- 1/n mod B */
1211   redcify (one, 1, n);
1212   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1213
1214   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1215   if (!millerrabin (n, ni, a_prim, q, k, one))
1216     return false;
1217
1218   if (flag_prove_primality)
1219     {
1220       /* Factor n-1 for Lucas.  */
1221       factor (0, n - 1, &factors);
1222     }
1223
1224   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1225      number composite.  */
1226   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1227     {
1228       if (flag_prove_primality)
1229         {
1230           is_prime = true;
1231           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1232             {
1233               is_prime
1234                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1235             }
1236         }
1237       else
1238         {
1239           /* After enough Miller-Rabin runs, be content. */
1240           is_prime = (r == MR_REPS - 1);
1241         }
1242
1243       if (is_prime)
1244         return true;
1245
1246       a += primes_diff[r];      /* Establish new base.  */
1247
1248       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1249          on most processors, since it avoids udiv_qrnnd.  If we go down the
1250          udiv_qrnnd_preinv path, this code should be replaced.  */
1251       {
1252         uintmax_t s1, s0;
1253         umul_ppmm (s1, s0, one, a);
1254         if (LIKELY (s1 == 0))
1255           a_prim = s0 % n;
1256         else
1257           {
1258             uintmax_t dummy _GL_UNUSED;
1259             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1260           }
1261       }
1262
1263       if (!millerrabin (n, ni, a_prim, q, k, one))
1264         return false;
1265     }
1266
1267   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1268   abort ();
1269 }
1270
1271 static bool
1272 prime2_p (uintmax_t n1, uintmax_t n0)
1273 {
1274   uintmax_t q[2], nm1[2];
1275   uintmax_t a_prim[2];
1276   uintmax_t one[2];
1277   uintmax_t na[2];
1278   uintmax_t ni;
1279   unsigned int k;
1280   struct factors factors;
1281
1282   if (n1 == 0)
1283     return prime_p (n0);
1284
1285   nm1[1] = n1 - (n0 == 0);
1286   nm1[0] = n0 - 1;
1287   if (nm1[0] == 0)
1288     {
1289       count_trailing_zeros (k, nm1[1]);
1290
1291       q[0] = nm1[1] >> k;
1292       q[1] = 0;
1293       k += W_TYPE_SIZE;
1294     }
1295   else
1296     {
1297       count_trailing_zeros (k, nm1[0]);
1298       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1299     }
1300
1301   uintmax_t a = 2;
1302   binv (ni, n0);
1303   redcify2 (one[1], one[0], 1, n1, n0);
1304   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1305
1306   /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1307   na[0] = n0;
1308   na[1] = n1;
1309
1310   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1311     return false;
1312
1313   if (flag_prove_primality)
1314     {
1315       /* Factor n-1 for Lucas.  */
1316       factor (nm1[1], nm1[0], &factors);
1317     }
1318
1319   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1320      number composite.  */
1321   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1322     {
1323       bool is_prime;
1324       uintmax_t e[2], y[2];
1325
1326       if (flag_prove_primality)
1327         {
1328           is_prime = true;
1329           if (factors.plarge[1])
1330             {
1331               uintmax_t pi;
1332               binv (pi, factors.plarge[0]);
1333               e[0] = pi * nm1[0];
1334               e[1] = 0;
1335               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1336               is_prime = (y[0] != one[0] || y[1] != one[1]);
1337             }
1338           for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1339             {
1340               /* FIXME: We always have the factor 2. Do we really need to
1341                  handle it here? We have done the same powering as part
1342                  of millerrabin. */
1343               if (factors.p[i] == 2)
1344                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1345               else
1346                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1347               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1348               is_prime = (y[0] != one[0] || y[1] != one[1]);
1349             }
1350         }
1351       else
1352         {
1353           /* After enough Miller-Rabin runs, be content. */
1354           is_prime = (r == MR_REPS - 1);
1355         }
1356
1357       if (is_prime)
1358         return true;
1359
1360       a += primes_diff[r];      /* Establish new base.  */
1361       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1362
1363       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1364         return false;
1365     }
1366
1367   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1368   abort ();
1369 }
1370
1371 #if HAVE_GMP
1372 static bool
1373 mp_prime_p (mpz_t n)
1374 {
1375   bool is_prime;
1376   mpz_t q, a, nm1, tmp;
1377   struct mp_factors factors;
1378
1379   if (mpz_cmp_ui (n, 1) <= 0)
1380     return false;
1381
1382   /* We have already casted out small primes. */
1383   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1384     return true;
1385
1386   mpz_inits (q, a, nm1, tmp, NULL);
1387
1388   /* Precomputation for Miller-Rabin.  */
1389   mpz_sub_ui (nm1, n, 1);
1390
1391   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1392   unsigned long int k = mpz_scan1 (nm1, 0);
1393   mpz_tdiv_q_2exp (q, nm1, k);
1394
1395   mpz_set_ui (a, 2);
1396
1397   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1398   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1399     {
1400       is_prime = false;
1401       goto ret2;
1402     }
1403
1404   if (flag_prove_primality)
1405     {
1406       /* Factor n-1 for Lucas.  */
1407       mpz_set (tmp, nm1);
1408       mp_factor (tmp, &factors);
1409     }
1410
1411   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1412      number composite.  */
1413   for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1414     {
1415       if (flag_prove_primality)
1416         {
1417           is_prime = true;
1418           for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1419             {
1420               mpz_divexact (tmp, nm1, factors.p[i]);
1421               mpz_powm (tmp, a, tmp, n);
1422               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1423             }
1424         }
1425       else
1426         {
1427           /* After enough Miller-Rabin runs, be content. */
1428           is_prime = (r == MR_REPS - 1);
1429         }
1430
1431       if (is_prime)
1432         goto ret1;
1433
1434       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1435
1436       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1437         {
1438           is_prime = false;
1439           goto ret1;
1440         }
1441     }
1442
1443   error (0, 0, _("Lucas prime test failure.  This should not happen"));
1444   abort ();
1445
1446  ret1:
1447   if (flag_prove_primality)
1448     mp_factor_clear (&factors);
1449  ret2:
1450   mpz_clears (q, a, nm1, tmp, NULL);
1451
1452   return is_prime;
1453 }
1454 #endif
1455
1456 static void
1457 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1458                           struct factors *factors)
1459 {
1460   uintmax_t x, z, y, P, t, ni, g;
1461
1462   unsigned long int k = 1;
1463   unsigned long int l = 1;
1464
1465   redcify (P, 1, n);
1466   addmod (x, P, P, n);          /* i.e., redcify(2) */
1467   y = z = x;
1468
1469   while (n != 1)
1470     {
1471       assert (a < n);
1472
1473       binv (ni, n);             /* FIXME: when could we use old 'ni' value? */
1474
1475       for (;;)
1476         {
1477           do
1478             {
1479               x = mulredc (x, x, n, ni);
1480               addmod (x, x, a, n);
1481
1482               submod (t, z, x, n);
1483               P = mulredc (P, t, n, ni);
1484
1485               if (k % 32 == 1)
1486                 {
1487                   if (gcd_odd (P, n) != 1)
1488                     goto factor_found;
1489                   y = x;
1490                 }
1491             }
1492           while (--k != 0);
1493
1494           z = x;
1495           k = l;
1496           l = 2 * l;
1497           for (unsigned long int i = 0; i < k; i++)
1498             {
1499               x = mulredc (x, x, n, ni);
1500               addmod (x, x, a, n);
1501             }
1502           y = x;
1503         }
1504
1505     factor_found:
1506       do
1507         {
1508           y = mulredc (y, y, n, ni);
1509           addmod (y, y, a, n);
1510
1511           submod (t, z, y, n);
1512           g = gcd_odd (t, n);
1513         }
1514       while (g == 1);
1515
1516       n = n / g;
1517
1518       if (!prime_p (g))
1519         factor_using_pollard_rho (g, a + 1, factors);
1520       else
1521         factor_insert (factors, g);
1522
1523       if (prime_p (n))
1524         {
1525           factor_insert (factors, n);
1526           break;
1527         }
1528
1529       x = x % n;
1530       z = z % n;
1531       y = y % n;
1532     }
1533 }
1534
1535 static void
1536 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1537                            struct factors *factors)
1538 {
1539   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1540
1541   unsigned long int k = 1;
1542   unsigned long int l = 1;
1543
1544   redcify2 (P1, P0, 1, n1, n0);
1545   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1546   y1 = z1 = x1;
1547   y0 = z0 = x0;
1548
1549   while (n1 != 0 || n0 != 1)
1550     {
1551       binv (ni, n0);
1552
1553       for (;;)
1554         {
1555           do
1556             {
1557               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1558               x1 = r1m;
1559               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1560
1561               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1562               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1563               P1 = r1m;
1564
1565               if (k % 32 == 1)
1566                 {
1567                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1568                   if (g1 != 0 || g0 != 1)
1569                     goto factor_found;
1570                   y1 = x1; y0 = x0;
1571                 }
1572             }
1573           while (--k != 0);
1574
1575           z1 = x1; z0 = x0;
1576           k = l;
1577           l = 2 * l;
1578           for (unsigned long int i = 0; i < k; i++)
1579             {
1580               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1581               x1 = r1m;
1582               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1583             }
1584           y1 = x1; y0 = x0;
1585         }
1586
1587     factor_found:
1588       do
1589         {
1590           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1591           y1 = r1m;
1592           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1593
1594           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1595           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1596         }
1597       while (g1 == 0 && g0 == 1);
1598
1599       if (g1 == 0)
1600         {
1601           /* The found factor is one word. */
1602           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1603
1604           if (!prime_p (g0))
1605             factor_using_pollard_rho (g0, a + 1, factors);
1606           else
1607             factor_insert (factors, g0);
1608         }
1609       else
1610         {
1611           /* The found factor is two words.  This is highly unlikely, thus hard
1612              to trigger.  Please be careful before you change this code!  */
1613           uintmax_t ginv;
1614
1615           binv (ginv, g0);      /* Compute n = n / g.  Since the result will */
1616           n0 = ginv * n0;       /* fit one word, we can compute the quotient */
1617           n1 = 0;               /* modulo B, ignoring the high divisor word. */
1618
1619           if (!prime2_p (g1, g0))
1620             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1621           else
1622             factor_insert_large (factors, g1, g0);
1623         }
1624
1625       if (n1 == 0)
1626         {
1627           if (prime_p (n0))
1628             {
1629               factor_insert (factors, n0);
1630               break;
1631             }
1632
1633           factor_using_pollard_rho (n0, a, factors);
1634           return;
1635         }
1636
1637       if (prime2_p (n1, n0))
1638         {
1639           factor_insert_large (factors, n1, n0);
1640           break;
1641         }
1642
1643       x0 = mod2 (&x1, x1, x0, n1, n0);
1644       z0 = mod2 (&z1, z1, z0, n1, n0);
1645       y0 = mod2 (&y1, y1, y0, n1, n0);
1646     }
1647 }
1648
1649 #if HAVE_GMP
1650 static void
1651 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1652                              struct mp_factors *factors)
1653 {
1654   mpz_t x, z, y, P;
1655   mpz_t t, t2;
1656
1657   devmsg ("[pollard-rho (%lu)] ", a);
1658
1659   mpz_inits (t, t2, NULL);
1660   mpz_init_set_si (y, 2);
1661   mpz_init_set_si (x, 2);
1662   mpz_init_set_si (z, 2);
1663   mpz_init_set_ui (P, 1);
1664
1665   unsigned long long int k = 1;
1666   unsigned long long int l = 1;
1667
1668   while (mpz_cmp_ui (n, 1) != 0)
1669     {
1670       for (;;)
1671         {
1672           do
1673             {
1674               mpz_mul (t, x, x);
1675               mpz_mod (x, t, n);
1676               mpz_add_ui (x, x, a);
1677
1678               mpz_sub (t, z, x);
1679               mpz_mul (t2, P, t);
1680               mpz_mod (P, t2, n);
1681
1682               if (k % 32 == 1)
1683                 {
1684                   mpz_gcd (t, P, n);
1685                   if (mpz_cmp_ui (t, 1) != 0)
1686                     goto factor_found;
1687                   mpz_set (y, x);
1688                 }
1689             }
1690           while (--k != 0);
1691
1692           mpz_set (z, x);
1693           k = l;
1694           l = 2 * l;
1695           for (unsigned long long int i = 0; i < k; i++)
1696             {
1697               mpz_mul (t, x, x);
1698               mpz_mod (x, t, n);
1699               mpz_add_ui (x, x, a);
1700             }
1701           mpz_set (y, x);
1702         }
1703
1704     factor_found:
1705       do
1706         {
1707           mpz_mul (t, y, y);
1708           mpz_mod (y, t, n);
1709           mpz_add_ui (y, y, a);
1710
1711           mpz_sub (t, z, y);
1712           mpz_gcd (t, t, n);
1713         }
1714       while (mpz_cmp_ui (t, 1) == 0);
1715
1716       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1717
1718       if (!mp_prime_p (t))
1719         {
1720           devmsg ("[composite factor--restarting pollard-rho] ");
1721           mp_factor_using_pollard_rho (t, a + 1, factors);
1722         }
1723       else
1724         {
1725           mp_factor_insert (factors, t);
1726         }
1727
1728       if (mp_prime_p (n))
1729         {
1730           mp_factor_insert (factors, n);
1731           break;
1732         }
1733
1734       mpz_mod (x, x, n);
1735       mpz_mod (z, z, n);
1736       mpz_mod (y, y, n);
1737     }
1738
1739   mpz_clears (P, t2, t, z, x, y, NULL);
1740 }
1741 #endif
1742
1743 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1744    algorithm is replaced, consider also returning the remainder. */
1745 static uintmax_t _GL_ATTRIBUTE_CONST
1746 isqrt (uintmax_t n)
1747 {
1748   uintmax_t x;
1749   unsigned c;
1750   if (n == 0)
1751     return 0;
1752
1753   count_leading_zeros (c, n);
1754
1755   /* Make x > sqrt(n). This will be invariant through the loop. */
1756   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1757
1758   for (;;)
1759     {
1760       uintmax_t y = (x + n/x) / 2;
1761       if (y >= x)
1762         return x;
1763
1764       x = y;
1765     }
1766 }
1767
1768 static uintmax_t _GL_ATTRIBUTE_CONST
1769 isqrt2 (uintmax_t nh, uintmax_t nl)
1770 {
1771   unsigned int shift;
1772   uintmax_t x;
1773
1774   /* Ensures the remainder fits in an uintmax_t. */
1775   assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1776
1777   if (nh == 0)
1778     return isqrt (nl);
1779
1780   count_leading_zeros (shift, nh);
1781   shift &= ~1;
1782
1783   /* Make x > sqrt(n) */
1784   x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1785   x <<= (W_TYPE_SIZE - shift) / 2;
1786
1787   /* Do we need more than one iteration? */
1788   for (;;)
1789     {
1790       uintmax_t r _GL_UNUSED;
1791       uintmax_t q, y;
1792       udiv_qrnnd (q, r, nh, nl, x);
1793       y = (x + q) / 2;
1794
1795       if (y >= x)
1796         {
1797           uintmax_t hi, lo;
1798           umul_ppmm (hi, lo, x + 1, x + 1);
1799           assert (gt2 (hi, lo, nh, nl));
1800
1801           umul_ppmm (hi, lo, x, x);
1802           assert (ge2 (nh, nl, hi, lo));
1803           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1804           assert (hi == 0);
1805
1806           return x;
1807         }
1808
1809       x = y;
1810     }
1811 }
1812
1813 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1814 #define MAGIC64 0x0202021202030213ULL
1815 #define MAGIC63 0x0402483012450293ULL
1816 #define MAGIC65 0x218a019866014613ULL
1817 #define MAGIC11 0x23b
1818
1819 /* Return the square root if the input is a square, otherwise 0. */
1820 static uintmax_t _GL_ATTRIBUTE_CONST
1821 is_square (uintmax_t x)
1822 {
1823   /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1824      computing the square root. */
1825   if (((MAGIC64 >> (x & 63)) & 1)
1826       && ((MAGIC63 >> (x % 63)) & 1)
1827       /* Both 0 and 64 are squares mod (65) */
1828       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1829       && ((MAGIC11 >> (x % 11) & 1)))
1830     {
1831       uintmax_t r = isqrt (x);
1832       if (r*r == x)
1833         return r;
1834     }
1835   return 0;
1836 }
1837
1838 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1839 static const unsigned short invtab[0x81] =
1840   {
1841     0x200,
1842     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1843     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1844     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1845     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1846     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1847     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1848     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1849     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1850     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1851     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1852     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1853     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1854     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1855     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1856     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1857     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1858   };
1859
1860 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1861    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1862 #define div_smallq(q, r, u, d)                                          \
1863   do {                                                                  \
1864     if ((u) / 0x40 < (d))                                               \
1865       {                                                                 \
1866         int _cnt;                                                       \
1867         uintmax_t _dinv, _mask, _q, _r;                                 \
1868         count_leading_zeros (_cnt, (d));                                \
1869         _r = (u);                                                       \
1870         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1871           {                                                             \
1872             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1873             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1874           }                                                             \
1875         else                                                            \
1876           {                                                             \
1877             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1878             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1879           }                                                             \
1880         _r -= _q*(d);                                                   \
1881                                                                         \
1882         _mask = -(uintmax_t) (_r >= (d));                               \
1883         (r) = _r - (_mask & (d));                                       \
1884         (q) = _q - _mask;                                               \
1885         assert ( (q) * (d) + (r) == u);                                 \
1886       }                                                                 \
1887     else                                                                \
1888       {                                                                 \
1889         uintmax_t _q = (u) / (d);                                       \
1890         (r) = (u) - _q * (d);                                           \
1891         (q) = _q;                                                       \
1892       }                                                                 \
1893   } while (0)
1894
1895 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1896    = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1897    with 3025 = 55^2.
1898
1899    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1900    representing G_0 = (-55, 2*4652, 8653).
1901
1902    In the notation of the paper:
1903
1904    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1905
1906    Put
1907
1908      t_0 = floor([q_0 + R_0] / S0) = 1
1909      R_1 = t_0 * S_0 - R_0 = 4001
1910      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1911 */
1912
1913 /* Multipliers, in order of efficiency:
1914    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1915    0.7317  3*5*7    =  105 = 1
1916    0.7820  3*5*11   =  165 = 1
1917    0.7872  3*5      =   15 = 3
1918    0.8101  3*7*11   =  231 = 3
1919    0.8155  3*7      =   21 = 1
1920    0.8284  5*7*11   =  385 = 1
1921    0.8339  5*7      =   35 = 3
1922    0.8716  3*11     =   33 = 1
1923    0.8774  3        =    3 = 3
1924    0.8913  5*11     =   55 = 3
1925    0.8972  5        =    5 = 1
1926    0.9233  7*11     =   77 = 1
1927    0.9295  7        =    7 = 3
1928    0.9934  11       =   11 = 3
1929 */
1930 #define QUEUE_SIZE 50
1931
1932 #if STAT_SQUFOF
1933 # define Q_FREQ_SIZE 50
1934 /* Element 0 keeps the total */
1935 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1936 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1937 #endif
1938
1939 /* Return true on success.  Expected to fail only for numbers
1940    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1941 static bool
1942 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1943 {
1944   /* Uses algorithm and notation from
1945
1946      SQUARE FORM FACTORIZATION
1947      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1948
1949      http://homes.cerias.purdue.edu/~ssw/squfof.pdf
1950    */
1951
1952   static const unsigned int multipliers_1[] =
1953     { /* = 1 (mod 4) */
1954       105, 165, 21, 385, 33, 5, 77, 1, 0
1955     };
1956   static const unsigned int multipliers_3[] =
1957     { /* = 3 (mod 4) */
1958       1155, 15, 231, 35, 3, 55, 7, 11, 0
1959     };
1960
1961   const unsigned int *m;
1962
1963   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1964
1965   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1966     return false;
1967
1968   uintmax_t sqrt_n = isqrt2 (n1, n0);
1969
1970   if (n0 == sqrt_n * sqrt_n)
1971     {
1972       uintmax_t p1, p0;
1973
1974       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1975       assert (p0 == n0);
1976
1977       if (n1 == p1)
1978         {
1979           if (prime_p (sqrt_n))
1980             factor_insert_multiplicity (factors, sqrt_n, 2);
1981           else
1982             {
1983               struct factors f;
1984
1985               f.nfactors = 0;
1986               if (!factor_using_squfof (0, sqrt_n, &f))
1987                 {
1988                   /* Try pollard rho instead */
1989                   factor_using_pollard_rho (sqrt_n, 1, &f);
1990                 }
1991               /* Duplicate the new factors */
1992               for (unsigned int i = 0; i < f.nfactors; i++)
1993                 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
1994             }
1995           return true;
1996         }
1997     }
1998
1999   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2000   for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2001        *m; m++)
2002     {
2003       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2004       unsigned int i;
2005       unsigned int mu = *m;
2006       unsigned int qpos = 0;
2007
2008       assert (mu * n0 % 4 == 3);
2009
2010       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2011          get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2012          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2013          mu^3 < n.
2014
2015          However, this seems insufficient: With n = 37243139 and mu =
2016          105, we get a trivial factor, from the square 38809 = 197^2,
2017          without any corresponding Q earlier in the iteration.
2018
2019          Requiring 64 mu^3 < n seems sufficient. */
2020       if (n1 == 0)
2021         {
2022           if ((uintmax_t) mu*mu*mu >= n0 / 64)
2023             continue;
2024         }
2025       else
2026         {
2027           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2028             continue;
2029         }
2030       umul_ppmm (Dh, Dl, n0, mu);
2031       Dh += n1 * mu;
2032
2033       assert (Dl % 4 != 1);
2034       assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2035
2036       S = isqrt2 (Dh, Dl);
2037
2038       Q1 = 1;
2039       P = S;
2040
2041       /* Square root remainder fits in one word, so ignore high part. */
2042       Q = Dl - P*P;
2043       /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2044       L = isqrt (2*S);
2045       B = 2*L;
2046       L1 = mu * 2 * L;
2047
2048       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2049          4 D. */
2050
2051       for (i = 0; i <= B; i++)
2052         {
2053           uintmax_t q, P1, t, rem;
2054
2055           div_smallq (q, rem, S+P, Q);
2056           P1 = S - rem; /* P1 = q*Q - P */
2057
2058           IF_LINT (assert (q > 0 && Q > 0));
2059
2060 #if STAT_SQUFOF
2061           q_freq[0]++;
2062           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2063 #endif
2064
2065           if (Q <= L1)
2066             {
2067               uintmax_t g = Q;
2068
2069               if ( (Q & 1) == 0)
2070                 g /= 2;
2071
2072               g /= gcd_odd (g, mu);
2073
2074               if (g <= L)
2075                 {
2076                   if (qpos >= QUEUE_SIZE)
2077                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2078                   queue[qpos].Q = g;
2079                   queue[qpos].P = P % g;
2080                   qpos++;
2081                 }
2082             }
2083
2084           /* I think the difference can be either sign, but mod
2085              2^W_TYPE_SIZE arithmetic should be fine. */
2086           t = Q1 + q * (P - P1);
2087           Q1 = Q;
2088           Q = t;
2089           P = P1;
2090
2091           if ( (i & 1) == 0)
2092             {
2093               uintmax_t r = is_square (Q);
2094               if (r)
2095                 {
2096                   for (unsigned int j = 0; j < qpos; j++)
2097                     {
2098                       if (queue[j].Q == r)
2099                         {
2100                           if (r == 1)
2101                             /* Traversed entire cycle. */
2102                             goto next_multiplier;
2103
2104                           /* Need the absolute value for divisibility test. */
2105                           if (P >= queue[j].P)
2106                             t = P - queue[j].P;
2107                           else
2108                             t = queue[j].P - P;
2109                           if (t % r == 0)
2110                             {
2111                               /* Delete entries up to and including entry
2112                                  j, which matched. */
2113                               memmove (queue, queue + j + 1,
2114                                        (qpos - j - 1) * sizeof (queue[0]));
2115                               qpos -= (j + 1);
2116                             }
2117                           goto next_i;
2118                         }
2119                     }
2120
2121                   /* We have found a square form, which should give a
2122                      factor. */
2123                   Q1 = r;
2124                   assert (S >= P); /* What signs are possible? */
2125                   P += r * ((S - P) / r);
2126
2127                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2128                      for the case D = 2N. */
2129                   /* Compute Q = (D - P*P) / Q1, but we need double
2130                      precision. */
2131                   uintmax_t hi, lo;
2132                   umul_ppmm (hi, lo, P, P);
2133                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2134                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2135                   assert (rem == 0);
2136
2137                   for (;;)
2138                     {
2139                       /* Note: There appears to by a typo in the paper,
2140                          Step 4a in the algorithm description says q <--
2141                          floor([S+P]/\hat Q), but looking at the equations
2142                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2143                          (In this code, \hat Q is Q1). */
2144                       div_smallq (q, rem, S+P, Q);
2145                       P1 = S - rem;     /* P1 = q*Q - P */
2146
2147 #if STAT_SQUFOF
2148                       q_freq[0]++;
2149                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2150 #endif
2151                       if (P == P1)
2152                         break;
2153                       t = Q1 + q * (P - P1);
2154                       Q1 = Q;
2155                       Q = t;
2156                       P = P1;
2157                     }
2158
2159                   if ( (Q & 1) == 0)
2160                     Q /= 2;
2161                   Q /= gcd_odd (Q, mu);
2162
2163                   assert (Q > 1 && (n1 || Q < n0));
2164
2165                   if (prime_p (Q))
2166                     factor_insert (factors, Q);
2167                   else if (!factor_using_squfof (0, Q, factors))
2168                     factor_using_pollard_rho (Q, 2, factors);
2169
2170                   divexact_21 (n1, n0, n1, n0, Q);
2171
2172                   if (prime2_p (n1, n0))
2173                     factor_insert_large (factors, n1, n0);
2174                   else
2175                     {
2176                       if (!factor_using_squfof (n1, n0, factors))
2177                         {
2178                           if (n1 == 0)
2179                             factor_using_pollard_rho (n0, 1, factors);
2180                           else
2181                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2182                         }
2183                     }
2184
2185                   return true;
2186                 }
2187             }
2188         next_i:;
2189         }
2190     next_multiplier:;
2191     }
2192   return false;
2193 }
2194
2195 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2196    results in FACTORS.  Use the algorithm selected by the global ALG.  */
2197 static void
2198 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2199 {
2200   factors->nfactors = 0;
2201   factors->plarge[1] = 0;
2202
2203   if (t1 == 0 && t0 < 2)
2204     return;
2205
2206   t0 = factor_using_division (&t1, t1, t0, factors);
2207
2208   if (t1 == 0 && t0 < 2)
2209     return;
2210
2211   if (prime2_p (t1, t0))
2212     factor_insert_large (factors, t1, t0);
2213   else
2214     {
2215       if (alg == ALG_SQUFOF)
2216         if (factor_using_squfof (t1, t0, factors))
2217           return;
2218
2219       if (t1 == 0)
2220         factor_using_pollard_rho (t0, 1, factors);
2221       else
2222         factor_using_pollard_rho2 (t1, t0, 1, factors);
2223     }
2224 }
2225
2226 #if HAVE_GMP
2227 /* Use Pollard-rho to compute the prime factors of
2228    arbitrary-precision T, and put the results in FACTORS.  */
2229 static void
2230 mp_factor (mpz_t t, struct mp_factors *factors)
2231 {
2232   mp_factor_init (factors);
2233
2234   if (mpz_sgn (t) != 0)
2235     {
2236       mp_factor_using_division (t, factors);
2237
2238       if (mpz_cmp_ui (t, 1) != 0)
2239         {
2240           devmsg ("[is number prime?] ");
2241           if (mp_prime_p (t))
2242             mp_factor_insert (factors, t);
2243           else
2244             mp_factor_using_pollard_rho (t, 1, factors);
2245         }
2246     }
2247 }
2248 #endif
2249
2250 static strtol_error
2251 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2252 {
2253   unsigned int lo_carry;
2254   uintmax_t hi = 0, lo = 0;
2255
2256   strtol_error err = LONGINT_INVALID;
2257
2258   /* Skip initial spaces and '+'.  */
2259   for (;;)
2260     {
2261       char c = *s;
2262       if (c == ' ')
2263         s++;
2264       else if (c == '+')
2265         {
2266           s++;
2267           break;
2268         }
2269       else
2270         break;
2271     }
2272
2273   /* Initial scan for invalid digits.  */
2274   const char *p = s;
2275   for (;;)
2276     {
2277       unsigned int c = *p++;
2278       if (c == 0)
2279         break;
2280
2281       if (UNLIKELY (!ISDIGIT (c)))
2282         {
2283           err = LONGINT_INVALID;
2284           break;
2285         }
2286
2287       err = LONGINT_OK;           /* we've seen at least one valid digit */
2288     }
2289
2290   for (;err == LONGINT_OK;)
2291     {
2292       unsigned int c = *s++;
2293       if (c == 0)
2294         break;
2295
2296       c -= '0';
2297
2298       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2299         {
2300           err = LONGINT_OVERFLOW;
2301           break;
2302         }
2303       hi = 10 * hi;
2304
2305       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2306       lo_carry += 10 * lo < 2 * lo;
2307
2308       lo = 10 * lo;
2309       lo += c;
2310
2311       lo_carry += lo < c;
2312       hi += lo_carry;
2313       if (UNLIKELY (hi < lo_carry))
2314         {
2315           err = LONGINT_OVERFLOW;
2316           break;
2317         }
2318     }
2319
2320   *hip = hi;
2321   *lop = lo;
2322
2323   return err;
2324 }
2325
2326 static size_t n_out; /* How much data we've written to stdout.  */
2327
2328 static void
2329 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2330 {
2331   uintmax_t q, r;
2332
2333   if (t1 == 0)
2334     {
2335       /* n_out's value is inconsequential on error.  */
2336       n_out += (size_t) printf ("%"PRIuMAX, t0);
2337     }
2338   else
2339     {
2340       /* Use very plain code here since it seems hard to write fast code
2341          without assuming a specific word size.  */
2342       q = t1 / 1000000000;
2343       r = t1 % 1000000000;
2344       udiv_qrnnd (t0, r, r, t0, 1000000000);
2345       print_uintmaxes (q, t0);
2346       n_out += (size_t) printf ("%09u", (unsigned int) r);
2347     }
2348 }
2349
2350 /* Single-precision factoring */
2351 static void
2352 print_factors_single (uintmax_t t1, uintmax_t t0)
2353 {
2354   struct factors factors;
2355
2356   print_uintmaxes (t1, t0);
2357   putchar (':');
2358   n_out++;
2359
2360   factor (t1, t0, &factors);
2361
2362   for (unsigned int j = 0; j < factors.nfactors; j++)
2363     for (unsigned int k = 0; k < factors.e[j]; k++)
2364       {
2365         char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2366         putchar (' ');
2367         char *umaxstr = umaxtostr (factors.p[j], buf);
2368         fputs (umaxstr, stdout);
2369         n_out += strlen (umaxstr) + 1;
2370       }
2371
2372   if (factors.plarge[1])
2373     {
2374       putchar (' ');
2375       n_out++;
2376       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2377     }
2378   putchar ('\n');
2379   n_out++;
2380
2381   /* Assume the stdout buffer is > this value.
2382      Flush here to avoid partial lines being output.
2383      Flushing every line has too much overhead.
2384      TODO: Buffer internally and avoid stdio.  */
2385   if (n_out >= 512)
2386     {
2387       fflush (stdout);
2388       n_out = 0;
2389     }
2390 }
2391
2392 /* Emit the factors of the indicated number.  If we have the option of using
2393    either algorithm, we select on the basis of the length of the number.
2394    For longer numbers, we prefer the MP algorithm even if the native algorithm
2395    has enough digits, because the algorithm is better.  The turnover point
2396    depends on the value.  */
2397 static bool
2398 print_factors (const char *input)
2399 {
2400   uintmax_t t1, t0;
2401
2402   /* Try converting the number to one or two words.  If it fails, use GMP or
2403      print an error message.  The 2nd condition checks that the most
2404      significant bit of the two-word number is clear, in a typesize neutral
2405      way.  */
2406   strtol_error err = strto2uintmax (&t1, &t0, input);
2407
2408   switch (err)
2409     {
2410     case LONGINT_OK:
2411       if (((t1 << 1) >> 1) == t1)
2412         {
2413           devmsg ("[using single-precision arithmetic] ");
2414           print_factors_single (t1, t0);
2415           return true;
2416         }
2417       break;
2418
2419     case LONGINT_OVERFLOW:
2420       /* Try GMP.  */
2421       break;
2422
2423     default:
2424       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2425       return false;
2426     }
2427
2428 #if HAVE_GMP
2429   devmsg ("[using arbitrary-precision arithmetic] ");
2430   mpz_t t;
2431   struct mp_factors factors;
2432
2433   mpz_init_set_str (t, input, 10);
2434
2435   gmp_printf ("%Zd:", t);
2436   mp_factor (t, &factors);
2437
2438   for (unsigned int j = 0; j < factors.nfactors; j++)
2439     for (unsigned int k = 0; k < factors.e[j]; k++)
2440       gmp_printf (" %Zd", factors.p[j]);
2441
2442   mp_factor_clear (&factors);
2443   mpz_clear (t);
2444   putchar ('\n');
2445   fflush (stdout);
2446   return true;
2447 #else
2448   error (0, 0, _("%s is too large"), quote (input));
2449   return false;
2450 #endif
2451 }
2452
2453 void
2454 usage (int status)
2455 {
2456   if (status != EXIT_SUCCESS)
2457     emit_try_help ();
2458   else
2459     {
2460       printf (_("\
2461 Usage: %s [NUMBER]...\n\
2462   or:  %s OPTION\n\
2463 "),
2464               program_name, program_name);
2465       fputs (_("\
2466 Print the prime factors of each specified integer NUMBER.  If none\n\
2467 are specified on the command line, read them from standard input.\n\
2468 \n\
2469 "), stdout);
2470       fputs (HELP_OPTION_DESCRIPTION, stdout);
2471       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2472       emit_ancillary_info (PROGRAM_NAME);
2473     }
2474   exit (status);
2475 }
2476
2477 static bool
2478 do_stdin (void)
2479 {
2480   bool ok = true;
2481   token_buffer tokenbuffer;
2482
2483   init_tokenbuffer (&tokenbuffer);
2484
2485   while (true)
2486     {
2487       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2488                                        &tokenbuffer);
2489       if (token_length == (size_t) -1)
2490         break;
2491       ok &= print_factors (tokenbuffer.buffer);
2492     }
2493   free (tokenbuffer.buffer);
2494
2495   return ok;
2496 }
2497
2498 int
2499 main (int argc, char **argv)
2500 {
2501   initialize_main (&argc, &argv);
2502   set_program_name (argv[0]);
2503   setlocale (LC_ALL, "");
2504   bindtextdomain (PACKAGE, LOCALEDIR);
2505   textdomain (PACKAGE);
2506
2507   atexit (close_stdout);
2508
2509   alg = ALG_POLLARD_RHO;        /* Default to Pollard rho */
2510
2511   int c;
2512   while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2513     {
2514       switch (c)
2515         {
2516         case DEV_DEBUG_OPTION:
2517           dev_debug = true;
2518           break;
2519
2520         case 's':
2521           alg = ALG_SQUFOF;
2522           break;
2523
2524         case 'w':
2525           flag_prove_primality = false;
2526           break;
2527
2528         case_GETOPT_HELP_CHAR;
2529
2530         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2531
2532         default:
2533           usage (EXIT_FAILURE);
2534         }
2535     }
2536
2537 #if STAT_SQUFOF
2538   if (alg == ALG_SQUFOF)
2539     memset (q_freq, 0, sizeof (q_freq));
2540 #endif
2541
2542   bool ok;
2543   if (argc <= optind)
2544     ok = do_stdin ();
2545   else
2546     {
2547       ok = true;
2548       for (int i = optind; i < argc; i++)
2549         if (! print_factors (argv[i]))
2550           ok = false;
2551     }
2552
2553 #if STAT_SQUFOF
2554   if (alg == ALG_SQUFOF && q_freq[0] > 0)
2555     {
2556       double acc_f;
2557       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2558       for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2559         {
2560           double f = (double) q_freq[i] / q_freq[0];
2561           acc_f += f;
2562           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2563                   100.0 * f, 100.0 * acc_f);
2564         }
2565     }
2566 #endif
2567
2568   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2569 }