src/io.c

   1 /* File I/O for GNU DIFF.
   2
   3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
   4    2015-2025 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    This program is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  20
  21 #include "diff.h"
  22 #include <binary-io.h>
  23 #include <cmpbuf.h>
  24 #include <file-type.h>
  25 #include <ialloc.h>
  26 #include <mcel.h>
  27 #include <xalloc.h>
  28
  29 #include <ctype.h>
  30 #include <uchar.h>
  31 \f
  32 /* The type of a hash value.  */
  33 typedef size_t hash_value;
  34 enum { HASH_VALUE_WIDTH = SIZE_WIDTH };
  35 static_assert (! TYPE_SIGNED (hash_value));
  36
  37 /* Rotate a hash value to the left.  */
  38 static hash_value
  39 rol (hash_value v, int n)
  40 {
  41   return v << n | v >> (HASH_VALUE_WIDTH - n);
  42 }
  43
  44 /* Given a hash value and a new character, return a new hash value.  */
  45 static hash_value
  46 hash (hash_value h, hash_value c)
  47 {
  48   return rol (h, 7) + c;
  49 }
  50
  51 /* Lines are put into equivalence classes of lines that match in lines_differ.
  52    Each equivalence class is represented by one of these structures,
  53    but only while the classes are being computed.
  54    Afterward, each class is represented by a number.  */
  55 struct equivclass
  56 {
  57   lin next;             /* Next item in this bucket.  */
  58   hash_value hash;      /* Hash of lines in this class.  */
  59   char const *line;     /* A line that fits this class.  */
  60   idx_t length;         /* That line's length, counting its newline.  */
  61 };
  62
  63 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  64    buckets[-1] is reserved for incomplete lines.  */
  65 static lin *buckets;
  66
  67 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  68 static idx_t nbuckets;
  69
  70 /* Array in which the equivalence classes are allocated.
  71    The bucket-chains go through the elements in this array.
  72    The number of an equivalence class is its index in this array.  */
  73 static struct equivclass *equivs;
  74
  75 /* Index of first free element in the array 'equivs'.  */
  76 static lin equivs_index;
  77
  78 /* Number of elements allocated in the array 'equivs'.  */
  79 static idx_t equivs_alloc;
  80 \f
  81 /* The file buffer, considered as an array of bytes rather than
  82    as an array of words.  */
  83
  84 static char *
  85 file_buffer (struct file_data const *f)
  86 {
  87   return (char *) f->buffer;
  88 }
  89
  90 /* Read a block of data into a file buffer, checking for EOF and error.  */
  91
  92 void
  93 file_block_read (struct file_data *current, idx_t size)
  94 {
  95   if (size && ! current->eof)
  96     {
  97       ptrdiff_t s = block_read (current->desc,
  98                                 file_buffer (current) + current->buffered,
  99                                 size);
 100       if (s < 0)
 101         pfatal_with_name (current->name);
 102       current->buffered += s;
 103       current->eof = s < size;
 104     }
 105 }
 106 \f
 107 /* Check for binary files and compare them for exact identity.  */
 108
 109 /* Return true if BUF contains a non text character.
 110    SIZE is the number of characters in BUF.  */
 111
 112 #define binary_file_p(buf, size) (!! memchr (buf, 0, size))
 113
 114 /* Get ready to read the current file.
 115    Return nonzero if SKIP_TEST is zero,
 116    and if it appears to be a binary file.  */
 117
 118 static bool
 119 sip (struct file_data *current, bool skip_test)
 120 {
 121   /* If we have a nonexistent file at this stage, treat it as empty.  */
 122   if (current->desc < 0)
 123     {
 124       /* Leave room for a sentinel.  */
 125       current->bufsize = sizeof (word);
 126       current->buffer = ximalloc (current->bufsize);
 127     }
 128   else
 129     {
 130       idx_t blksize;
 131       if (ST_BLKSIZE (current->stat) < 0
 132           || ckd_add (&blksize, ST_BLKSIZE (current->stat), 0))
 133         blksize = 0;
 134       current->bufsize = buffer_lcm (sizeof (word), blksize, IDX_MAX);
 135       current->buffer = ximalloc (current->bufsize);
 136
 137 #ifdef __KLIBC__
 138       /* Skip test if seek is not possible */
 139       skip_test = skip_test
 140                   || (lseek (current->desc, 0, SEEK_CUR) < 0
 141                       && errno == ESPIPE);
 142 #endif
 143
 144       if (! skip_test)
 145         {
 146           /* Check first part of file to see if it's a binary file.  */
 147
 148           int prev_mode = set_binary_mode (current->desc, O_BINARY);
 149           file_block_read (current, current->bufsize);
 150           off_t buffered = current->buffered;
 151
 152           if (prev_mode != O_BINARY)
 153             {
 154               /* Revert to text mode and seek back to the start to reread
 155                  the file.  Use relative seek, since file descriptors
 156                  like stdin might not start at offset zero.  */
 157               if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
 158                 pfatal_with_name (current->name);
 159               set_binary_mode (current->desc, prev_mode);
 160               current->buffered = 0;
 161               current->eof = false;
 162             }
 163
 164           return binary_file_p (current->buffer, buffered);
 165         }
 166     }
 167
 168   current->buffered = 0;
 169   current->eof = false;
 170   return false;
 171 }
 172
 173 /* Slurp the rest of the current file completely into memory.  */
 174
 175 static void
 176 slurp (struct file_data *current)
 177 {
 178   if (current->desc < 0)
 179     {
 180       /* The file is nonexistent.  */
 181       return;
 182     }
 183
 184   /* Upper bound on the room needed for an appended newline, word
 185      sentinel, and worst-case word alignment.  */
 186   enum { extra_room = 2 * sizeof (word) };
 187
 188   if (S_ISREG (current->stat.st_mode))
 189     {
 190       /* It's a regular file; try to allocate a big enough buffer so
 191          that it can be slurped in all at once, along with appended
 192          newline plus word sentinel plus word-alignment.  */
 193       off_t file_size = current->stat.st_size;
 194       idx_t cc;
 195       if (0 <= file_size
 196           && !ckd_add (&cc, file_size - file_size % sizeof (word), extra_room)
 197           && current->bufsize < cc)
 198         {
 199           #if __GNUC__ == 13 && __GNUC_MINOR__ < 3
 200             /* Work around GCC bug 110014.  */
 201             #pragma GCC diagnostic push
 202             #pragma GCC diagnostic ignored "-Wanalyzer-allocation-size"
 203           #endif
 204
 205           word *buffer = irealloc (current->buffer, cc);
 206
 207           if (buffer)
 208             {
 209               current->buffer = buffer;
 210               current->bufsize = cc;
 211             }
 212
 213           #if __GNUC__ == 13
 214             #pragma GCC diagnostic pop
 215           #endif
 216         }
 217     }
 218
 219   /* Read the file, growing the buffer as needed.  */
 220
 221   while (file_block_read (current, current->bufsize - current->buffered),
 222          !current->eof)
 223     current->buffer = xpalloc (current->buffer, &current->bufsize,
 224                                extra_room, -1, 1);
 225
 226   if (current->bufsize - current->buffered < extra_room)
 227     {
 228       if (ckd_add (&current->bufsize, current->buffered, extra_room))
 229         xalloc_die ();
 230       current->buffer = xirealloc (current->buffer, current->bufsize);
 231     }
 232 }
 233 \f
 234 /* Return true if CH1 and ERR1 stand for the same character or
 235    encoding error as CH2 and ERR2.  */
 236 static bool
 237 same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
 238 {
 239   return ! ((ch1 ^ ch2) | (err1 ^ err2));
 240 }
 241
 242 /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
 243    one line from each input file) according to the command line options.
 244    Line lengths include the trailing newline.
 245    For efficiency, this is invoked only when the lines do not match exactly
 246    but an option like -i might cause us to ignore the difference.
 247    Return nonzero if the lines differ.
 248    Line lengths do not include the trailing newline.  */
 249
 250 static bool
 251 lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 252 {
 253   char const *t1 = s1;
 254   char const *t2 = s2;
 255   intmax_t tab = 0, column = 0;
 256
 257   if (MB_CUR_MAX == 1)
 258     while (true)
 259       {
 260         unsigned char c1 = *t1++;
 261         unsigned char c2 = *t2++;
 262
 263         /* Test for exact char equality first, since it's a common case.  */
 264         if (c1 != c2)
 265           {
 266             switch (ignore_white_space)
 267               {
 268               case IGNORE_ALL_SPACE:
 269                 /* For -w, just skip past any white space.  */
 270                 while (isspace (c1) && c1 != '\n') c1 = *t1++;
 271                 while (isspace (c2) && c2 != '\n') c2 = *t2++;
 272                 break;
 273
 274               case IGNORE_SPACE_CHANGE:
 275                 /* For -b, advance past any sequence of white space in
 276                    line 1 and consider it just one space, or nothing at
 277                    all if it is at the end of the line.  */
 278                 if (isspace (c1))
 279                   while (c1 != '\n')
 280                     {
 281                       c1 = *t1++;
 282                       if (! isspace (c1))
 283                         {
 284                           --t1;
 285                           c1 = ' ';
 286                           break;
 287                         }
 288                     }
 289
 290                 /* Likewise for line 2.  */
 291                 if (isspace (c2))
 292                   while (c2 != '\n')
 293                     {
 294                       c2 = *t2++;
 295                       if (! isspace (c2))
 296                         {
 297                           --t2;
 298                           c2 = ' ';
 299                           break;
 300                         }
 301                     }
 302
 303                 if (c1 != c2)
 304                   {
 305                     /* If we went too far when doing the simple test
 306                        for equality, go back to the first non-white-space
 307                        character in both sides and try again.  */
 308                     if (c2 == ' ' && c1 != '\n'
 309                         && s1 + 1 < t1
 310                         && isspace ((unsigned char) t1[-2]))
 311                       {
 312                         --t1;
 313                         continue;
 314                       }
 315                     if (c1 == ' ' && c2 != '\n'
 316                         && s2 + 1 < t2
 317                         && isspace ((unsigned char) t2[-2]))
 318                       {
 319                         --t2;
 320                         continue;
 321                       }
 322                   }
 323
 324                 break;
 325
 326               case IGNORE_TRAILING_SPACE:
 327               case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
 328                 if (isspace (c1) && isspace (c2))
 329                   {
 330                     if (c1 != '\n')
 331                       {
 332                         char const *p = t1;
 333                         unsigned char c;
 334                         while ((c = *p) != '\n' && isspace (c))
 335                           ++p;
 336                         if (c != '\n')
 337                           break;
 338                       }
 339                     if (c2 != '\n')
 340                       {
 341                         char const *p = t2;
 342                         unsigned char c;
 343                         while ((c = *p) != '\n' && isspace (c))
 344                           ++p;
 345                         if (c != '\n')
 346                           break;
 347                       }
 348                     /* Both lines have nothing but whitespace left.  */
 349                     return false;
 350                   }
 351                 if (ignore_white_space == IGNORE_TRAILING_SPACE)
 352                   break;
 353                 FALLTHROUGH;
 354               case IGNORE_TAB_EXPANSION:
 355                 if ((c1 == ' ' && c2 == '\t')
 356                     || (c1 == '\t' && c2 == ' '))
 357                   {
 358                     intmax_t tab2 = tab, column2 = column;
 359                     for (;; c1 = *t1++)
 360                       {
 361                         if (c1 == '\t' || (c1 == ' ' && column == tabsize - 1))
 362                           {
 363                             tab++;
 364                             column = 0;
 365                           }
 366                         else if (c1 == ' ')
 367                           column++;
 368                         else
 369                           break;
 370                       }
 371                     for (;; c2 = *t2++)
 372                       {
 373                         if (c2 == '\t' || (c2 == ' ' && column2 == tabsize - 1))
 374                           {
 375                             tab2++;
 376                             column2 = 0;
 377                           }
 378                         else if (c2 == ' ')
 379                           column2++;
 380                         else
 381                           break;
 382                       }
 383                     if (tab != tab2 || column != column2)
 384                       return true;
 385                   }
 386                 break;
 387
 388               case IGNORE_NO_WHITE_SPACE:
 389                 break;
 390               }
 391
 392             if (ignore_case)
 393               {
 394                 c1 = tolower (c1);
 395                 c2 = tolower (c2);
 396               }
 397
 398             if (c1 != c2)
 399               break;
 400           }
 401
 402         switch (c1)
 403           {
 404           case '\n':
 405             return false;
 406
 407           case '\r':
 408             tab = column = 0;
 409             break;
 410
 411           case '\b':
 412             if (0 < column)
 413               column--;
 414             else if (0 < tab)
 415               {
 416                 tab--;
 417                 column = tabsize - 1;
 418               }
 419             break;
 420
 421           case '\0': case '\a': case '\f': case '\v':
 422             break;
 423
 424           default:
 425             column += !! isprint (c1);
 426             if (column < tabsize)
 427               break;
 428             FALLTHROUGH;
 429           case '\t':
 430             tab++;
 431             column = 0;
 432             break;
 433           }
 434       }
 435   else
 436     {
 437       char const *lim1 = s1 + s1len;
 438       char const *lim2 = s2 + s2len;
 439       char32_t ch1prev = 0;
 440
 441       while (true)
 442         {
 443           mcel_t g1 = mcel_scan (t1, lim1);
 444           mcel_t g2 = mcel_scan (t2, lim2);
 445           t1 += g1.len;
 446           t2 += g2.len;
 447           char32_t ch1 = g1.ch;
 448           char32_t ch2 = g2.ch;
 449
 450           /* Test for exact equality first, since it's a common case.  */
 451           if (! same_ch_err (ch1, g1.err, ch2, g2.err))
 452             {
 453               switch (ignore_white_space)
 454                 {
 455                 case IGNORE_ALL_SPACE:
 456                   /* For -w, just skip past any white space.  */
 457                   while (ch1 != '\n' && c32isspace (ch1))
 458                     {
 459                       g1 = mcel_scan (t1, lim1);
 460                       t1 += g1.len;
 461                       ch1 = g1.ch;
 462                     }
 463                   while (ch2 != '\n' && c32isspace (ch2))
 464                     {
 465                       g2 = mcel_scan (t2, lim2);
 466                       t2 += g2.len;
 467                       ch2 = g2.ch;
 468                     }
 469                   break;
 470
 471                 case IGNORE_SPACE_CHANGE:
 472                   /* For -b, advance past any sequence of white space in
 473                      line 1 and consider it just one space, or nothing at
 474                      all if it is at the end of the line.  */
 475                   if (c32isspace (ch1))
 476                     while (ch1 != '\n')
 477                       {
 478                         g1 = mcel_scan (t1, lim1);
 479                         t1 += g1.len;
 480                         ch1 = g1.ch;
 481                         if (! c32isspace (ch1))
 482                           {
 483                             t1 -= g1.len;
 484                             ch1 = ' ';
 485                             break;
 486                           }
 487                       }
 488
 489                   /* Likewise for line 2.  */
 490                   if (c32isspace (ch2))
 491                     while (ch2 != '\n')
 492                       {
 493                         g2 = mcel_scan (t2, lim2);
 494                         t2 += g2.len;
 495                         ch2 = g2.ch;
 496                         if (! c32isspace (ch2))
 497                           {
 498                             t2 -= g2.len;
 499                             ch2 = ' ';
 500                             break;
 501                           }
 502                       }
 503
 504                   if (ch1 != ch2)
 505                     {
 506                       /* If we went too far when doing the simple test
 507                          for equality, go back to the first non-white-space
 508                          character in both sides and try again.  */
 509                       if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
 510                         {
 511                           t1 -= g1.len;
 512                           continue;
 513                         }
 514                       if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
 515                         {
 516                           t2 -= g2.len;
 517                           continue;
 518                         }
 519                     }
 520
 521                   break;
 522
 523                 case IGNORE_TRAILING_SPACE:
 524                 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
 525                   if (c32isspace (ch1) && c32isspace (ch2))
 526                     {
 527                       if (ch1 != '\n')
 528                         {
 529                           char const *p = t1;
 530                           while (*p != '\n')
 531                             {
 532                               mcel_t g = mcel_scan (p, lim1);
 533                               if (c32isspace (g.ch))
 534                                 break;
 535                               p += g.len;
 536                             }
 537                           if (*p != '\n')
 538                             break;
 539                         }
 540                       if (ch2 != '\n')
 541                         {
 542                           char const *p = t2;
 543                           while (*p != '\n')
 544                             {
 545                               mcel_t g = mcel_scan (p, lim2);
 546                               if (! c32isspace (g.ch))
 547                                 break;
 548                               p += g.len;
 549                             }
 550                           if (*p != '\n')
 551                             break;
 552                         }
 553                       /* Both lines have nothing but whitespace left.  */
 554                       return false;
 555                     }
 556                   if (ignore_white_space == IGNORE_TRAILING_SPACE)
 557                     break;
 558                   FALLTHROUGH;
 559                 case IGNORE_TAB_EXPANSION:
 560                   if ((ch1 == ' ' && ch2 == '\t')
 561                       || (ch1 == '\t' && ch2 == ' '))
 562                     {
 563                       intmax_t tab2 = tab, column2 = column;
 564
 565                       while (true)
 566                         {
 567                           if (ch1 == '\t'
 568                               || (ch1 == ' ' && column == tabsize - 1))
 569                             {
 570                               tab++;
 571                               column = 0;
 572                             }
 573                           else if (ch1 == ' ')
 574                             column++;
 575                           else
 576                             break;
 577
 578                           g1 = mcel_scan (t1, lim1);
 579                           t1 += g1.len;
 580                           ch1 = g1.ch;
 581                         }
 582
 583                       while (true)
 584                         {
 585                           if (ch2 == '\t'
 586                               || (ch2 == ' ' && column2 == tabsize - 1))
 587                             {
 588                               tab2++;
 589                               column2 = 0;
 590                             }
 591                           else if (ch2 == ' ')
 592                             column2++;
 593                           else
 594                             break;
 595
 596                           g2 = mcel_scan (t2, lim2);
 597                           t2 += g2.len;
 598                           ch2 = g2.ch;
 599                         }
 600
 601                       if (tab != tab2 || column != column2)
 602                         return true;
 603                     }
 604                   break;
 605
 606                 case IGNORE_NO_WHITE_SPACE:
 607                   break;
 608                 }
 609
 610               if (ignore_case)
 611                 {
 612                   ch1 = c32tolower (ch1);
 613                   ch2 = c32tolower (ch2);
 614                 }
 615
 616               if (! same_ch_err (ch1, g1.err, ch2, g2.err))
 617                 break;
 618             }
 619
 620           switch (ch1)
 621             {
 622             case '\n':
 623               return false;
 624
 625             case '\r':
 626               tab = column = 0;
 627               break;
 628
 629             case '\b':
 630               if (0 < column)
 631                 column--;
 632               else if (0 < tab)
 633                 {
 634                   tab--;
 635                   column = tabsize - 1;
 636                 }
 637               break;
 638
 639             case '\a': case '\f': case '\v':
 640               break;
 641
 642             default:
 643               /* Assume that downcasing does not change print width.  */
 644               column += g1.err ? 1 : c32width (ch1);
 645               if (column < tabsize)
 646                 break;
 647               FALLTHROUGH;
 648             case '\t':
 649               tab++;
 650               column = 0;
 651               break;
 652             }
 653
 654           ch1prev = ch1;
 655         }
 656     }
 657
 658   return true;
 659 }
 660 \f
 661 /* Split the file into lines, simultaneously computing the equivalence
 662    class for each line.  If two lines hash differently, lines_differ
 663    must return false.  */
 664
 665 static void
 666 find_and_hash_each_line (struct file_data *current)
 667 {
 668   char const *p = current->prefix_end;
 669
 670   /* Cache often-used quantities in local variables to help the compiler.  */
 671   char const **linbuf = current->linbuf;
 672   lin alloc_lines = current->alloc_lines;
 673   lin line = 0;
 674   lin linbuf_base = current->linbuf_base;
 675   lin *cureqs = xinmalloc (alloc_lines, sizeof *cureqs);
 676   struct equivclass *eqs = equivs;
 677   lin eqs_index = equivs_index;
 678   idx_t eqs_alloc = equivs_alloc;
 679   char const *suffix_begin = current->suffix_begin;
 680   char const *bufend = file_buffer (current) + current->buffered;
 681   bool ig_case = ignore_case;
 682   enum DIFF_white_space ig_white_space = ignore_white_space;
 683   bool unibyte = MB_CUR_MAX == 1;
 684   bool diff_length_compare_anyway =
 685     (ig_white_space != IGNORE_NO_WHITE_SPACE) | (!unibyte & ig_case);
 686   bool same_length_diff_contents_compare_anyway =
 687     diff_length_compare_anyway | ig_case;
 688
 689   while (p < suffix_begin)
 690     {
 691       char const *ip = p;
 692       hash_value h = 0;
 693
 694       /* Hash this line until we find a newline.  */
 695       switch (ig_white_space)
 696         {
 697         case IGNORE_ALL_SPACE:
 698           if (unibyte)
 699             for (unsigned char c; (c = *p) != '\n'; p++)
 700               {
 701                 if (! isspace (c))
 702                   h = hash (h, ig_case ? tolower (c) : c);
 703               }
 704           else
 705             for (mcel_t g; *p != '\n'; p += g.len)
 706               {
 707                 g = mcel_scan (p, suffix_begin);
 708                 if (! c32isspace (g.ch))
 709                   h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
 710               }
 711           break;
 712
 713         case IGNORE_SPACE_CHANGE:
 714           if (unibyte)
 715             for (unsigned char c; (c = *p) != '\n'; p++)
 716               {
 717                 if (isspace (c))
 718                   {
 719                     do
 720                       {
 721                         c = *++p;
 722                         if (c == '\n')
 723                           goto hashing_done;
 724                       }
 725                     while (isspace (c));
 726
 727                     h = hash (h, ' ');
 728                   }
 729
 730                 /* C is now the first non-space.  */
 731                 h = hash (h, ig_case ? tolower (c) : c);
 732               }
 733           else
 734             for (mcel_t g; *p != '\n'; p += g.len)
 735               {
 736                 g = mcel_scan (p, suffix_begin);
 737                 if (c32isspace (g.ch))
 738                   {
 739                     do
 740                       {
 741                         p += g.len;
 742                         if (*p == '\n')
 743                           goto hashing_done;
 744                         g = mcel_scan (p, suffix_begin);
 745                       }
 746                     while (c32isspace (g.ch));
 747
 748                     h = hash (h, ' ');
 749                   }
 750
 751                 /* G is now the first non-space.  */
 752                 h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
 753               }
 754           break;
 755
 756         case IGNORE_TAB_EXPANSION:
 757         case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
 758         case IGNORE_TRAILING_SPACE:
 759           {
 760             intmax_t tab = 0, column = 0;
 761             if (unibyte)
 762               for (unsigned char c; (c = *p) != '\n'; p++)
 763                 {
 764                   intmax_t repetitions = 1;
 765
 766                   if (ig_white_space & IGNORE_TRAILING_SPACE
 767                       && isspace (c))
 768                     {
 769                       char const *p1 = p;
 770                       unsigned char c1;
 771                       do
 772                         {
 773                           c1 = *++p1;
 774                           if (c1 == '\n')
 775                             {
 776                               p = p1;
 777                               goto hashing_done;
 778                             }
 779                         }
 780                       while (isspace (c1));
 781                     }
 782
 783                   if (ig_white_space & IGNORE_TAB_EXPANSION)
 784                     switch (c)
 785                       {
 786                       case '\b':
 787                         if (0 < column)
 788                           column--;
 789                         else if (0 < tab)
 790                           {
 791                             tab--;
 792                             column = tabsize - 1;
 793                           }
 794                         break;
 795
 796                       case '\t':
 797                         c = ' ';
 798                         repetitions = tabsize - column % tabsize;
 799                         tab += column / tabsize + 1;
 800                         column = 0;
 801                         break;
 802
 803                       case '\r':
 804                         tab = column = 0;
 805                         break;
 806
 807                       case '\0': case '\a': case '\f': case '\v':
 808                         break;
 809
 810                       default:
 811                         column++;
 812                         break;
 813                       }
 814
 815                   if (ig_case)
 816                     c = tolower (c);
 817
 818                   do
 819                     h = hash (h, c);
 820                   while (--repetitions != 0);
 821                 }
 822             else
 823               for (mcel_t g; *p != '\n'; p += g.len)
 824                 {
 825                   intmax_t repetitions = 1;
 826
 827                   g = mcel_scan (p, suffix_begin);
 828                   char32_t ch;
 829                   if (g.err)
 830                     {
 831                       ch = -g.err;
 832                       column++;
 833                     }
 834                   else
 835                     {
 836                       ch = g.ch;
 837                       if (ig_white_space & IGNORE_TRAILING_SPACE
 838                           && c32isspace (ch))
 839                         {
 840                           char const *p1 = p + g.len;
 841                           for (mcel_t g1; ; p1 += g1.len)
 842                             {
 843                               if (*p1 == '\n')
 844                                 {
 845                                   p = p1;
 846                                   goto hashing_done;
 847                                 }
 848                               g1 = mcel_scan (p1, suffix_begin);
 849                               if (! c32isspace (g1.ch))
 850                                 break;
 851                             }
 852                         }
 853
 854                       if (ig_white_space & IGNORE_TAB_EXPANSION)
 855                         switch (ch)
 856                           {
 857                           case '\b':
 858                             if (0 < column)
 859                               column--;
 860                             else if (0 < tab)
 861                               {
 862                                 tab--;
 863                                 column = tabsize - 1;
 864                               }
 865                             break;
 866
 867                           case '\t':
 868                             ch = ' ';
 869                             repetitions = tabsize - column % tabsize;
 870                             tab += column / tabsize + 1;
 871                             column = 0;
 872                             break;
 873
 874                           case '\r':
 875                             tab = column = 0;
 876                             break;
 877
 878                           case '\0': case '\a': case '\f': case '\v':
 879                             break;
 880
 881                           default:
 882                             column += c32width (ch);
 883                             break;
 884                           }
 885
 886                       if (ig_case)
 887                         ch = c32tolower (ch);
 888                     }
 889
 890                   do
 891                     h = hash (h, ch);
 892                   while (--repetitions != 0);
 893                 }
 894           }
 895           break;
 896
 897         default:
 898           if (unibyte)
 899             {
 900               if (ig_case)
 901                 for (unsigned char c; (c = *p) != '\n'; p++)
 902                   h = hash (h, tolower (c));
 903               else
 904                 for (unsigned char c; (c = *p) != '\n'; p++)
 905                   h = hash (h, c);
 906             }
 907           else
 908             {
 909               if (ig_case)
 910                 for (mcel_t g; *p != '\n'; p += g.len)
 911                   {
 912                     g = mcel_scan (p, suffix_begin);
 913                     h = hash (h, c32tolower (g.ch) - g.err);
 914                   }
 915               else
 916                 for (mcel_t g; *p != '\n'; p += g.len)
 917                   {
 918                     g = mcel_scan (p, suffix_begin);
 919                     h = hash (h, g.ch - g.err);
 920                   }
 921             }
 922           break;
 923         }
 924
 925    hashing_done:;
 926
 927       lin *bucket = &buckets[h % nbuckets];
 928
 929       /* Advance past the line's trailing newline.  */
 930       p++;
 931       idx_t length = p - ip;
 932
 933       if (p == bufend
 934           && current->missing_newline
 935           && robust_output_style (output_style))
 936         {
 937           /* The last line is incomplete and we do not silently
 938              complete lines.  If the line cannot compare equal to any
 939              complete line, put it into buckets[-1] so that it can
 940              compare equal only to the other file's incomplete line
 941              (if one exists).  */
 942           if (ig_white_space < IGNORE_TRAILING_SPACE)
 943             bucket = &buckets[-1];
 944         }
 945
 946       lin i;
 947       for (i = *bucket;  ;  i = eqs[i].next)
 948         if (!i)
 949           {
 950             /* Create a new equivalence class in this bucket.  */
 951             i = eqs_index++;
 952             if (i == eqs_alloc)
 953               eqs = xpalloc (eqs, &eqs_alloc, 1, -1, sizeof *eqs);
 954             eqs[i].next = *bucket;
 955             eqs[i].hash = h;
 956             eqs[i].line = ip;
 957             eqs[i].length = length;
 958             *bucket = i;
 959             break;
 960           }
 961         else if (eqs[i].hash == h)
 962           {
 963             char const *eqline = eqs[i].line;
 964             idx_t eqlinelen = eqs[i].length;
 965
 966             /* Reuse existing class if lines_differ reports the lines
 967                equal.  */
 968             if (eqlinelen == length)
 969               {
 970                 /* Reuse existing equivalence class if the lines are identical.
 971                    This detects the common case of exact identity
 972                    faster than lines_differ would.  */
 973                 if (memcmp (eqline, ip, length - 1) == 0)
 974                   break;
 975                 if (!same_length_diff_contents_compare_anyway)
 976                   continue;
 977               }
 978             else if (!diff_length_compare_anyway)
 979               continue;
 980
 981             if (! lines_differ (eqline, eqlinelen, ip, length))
 982               break;
 983           }
 984
 985       /* Maybe increase the size of the line table.  */
 986       if (line == alloc_lines)
 987         {
 988           /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 989           idx_t n = alloc_lines - linbuf_base;
 990           linbuf += linbuf_base;
 991           linbuf = xpalloc (linbuf, &n, 1, -1, sizeof *linbuf);
 992           linbuf -= linbuf_base;
 993           alloc_lines = linbuf_base + n;
 994           cureqs = xirealloc (cureqs, alloc_lines * sizeof *cureqs);
 995         }
 996       linbuf[line] = ip;
 997       cureqs[line] = i;
 998       ++line;
 999     }
1000
1001   current->buffered_lines = line;
1002
1003   for (lin i = 0;  ;  i++)
1004     {
1005       /* Record the line start for lines in the suffix that we care about.
1006          Record one more line start than lines,
1007          so that we can compute the length of any buffered line.  */
1008       if (line == alloc_lines)
1009         {
1010           /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines.  */
1011           idx_t n = alloc_lines - linbuf_base;
1012           linbuf += linbuf_base;
1013           linbuf = xpalloc (linbuf, &n, 1, -1, sizeof *linbuf);
1014           linbuf -= linbuf_base;
1015           alloc_lines = n - linbuf_base;
1016         }
1017       linbuf[line] = p;
1018
1019       if (p == bufend)
1020         {
1021           /* If the last line is incomplete and we do not silently
1022              complete lines, don't count its appended newline.  */
1023           if (current->missing_newline && robust_output_style (output_style))
1024             linbuf[line]--;
1025           break;
1026         }
1027
1028       if (context <= i && no_diff_means_no_output)
1029         break;
1030
1031       line++;
1032
1033       while (*p++ != '\n')
1034         continue;
1035     }
1036
1037   /* Done with cache in local variables.  */
1038   current->linbuf = linbuf;
1039   current->valid_lines = line;
1040   current->alloc_lines = alloc_lines;
1041   current->equivs = cureqs;
1042   equivs = eqs;
1043   equivs_alloc = eqs_alloc;
1044   equivs_index = eqs_index;
1045 }
1046 \f
1047 /* Prepare the text.  Make sure the text end is initialized.
1048    Make sure text ends in a newline,
1049    but remember that we had to add one.
1050    Strip trailing CRs, if that was requested.  */
1051
1052 static void
1053 prepare_text (struct file_data *current)
1054 {
1055   idx_t buffered = current->buffered;
1056   char *p = file_buffer (current);
1057   if (!p)
1058     return;
1059
1060   if (strip_trailing_cr)
1061     {
1062       char *srclim = p + buffered;
1063       *srclim = '\r';
1064       char *dst = rawmemchr (p, '\r');
1065
1066       for (char const *src = dst; src != srclim; src++)
1067         {
1068           src += *src == '\r' && src[1] == '\n';
1069           *dst++ = *src;
1070         }
1071
1072       buffered -= srclim - dst;
1073     }
1074
1075   if (buffered != 0 && p[buffered - 1] != '\n')
1076     {
1077       p[buffered++] = '\n';
1078       current->missing_newline = true;
1079     }
1080
1081   /* Don't use uninitialized storage when planting or using sentinels.  */
1082   memset (p + buffered, 0, sizeof (word));
1083
1084   current->buffered = buffered;
1085 }
1086 \f
1087 /* We have found N lines in a buffer of size S; guess the
1088    proportionate number of lines that will be found in a buffer of
1089    size T.  However, do not guess a number of lines so large that the
1090    resulting line table might cause overflow in size calculations.  */
1091 static lin
1092 guess_lines (lin n, idx_t s, idx_t t)
1093 {
1094   idx_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
1095   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
1096   return MIN (guessed_lines, LIN_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
1097 }
1098
1099 /* Given a vector of two file_data objects, find the identical
1100    prefixes and suffixes of each object.  */
1101
1102 static void
1103 find_identical_ends (struct file_data filevec[])
1104 {
1105   slurp (&filevec[0]);
1106   prepare_text (&filevec[0]);
1107   if (filevec[0].desc != filevec[1].desc)
1108     {
1109       slurp (&filevec[1]);
1110       prepare_text (&filevec[1]);
1111     }
1112   else
1113     {
1114       filevec[1].buffer = filevec[0].buffer;
1115       filevec[1].bufsize = filevec[0].bufsize;
1116       filevec[1].buffered = filevec[0].buffered;
1117       filevec[1].missing_newline = filevec[0].missing_newline;
1118     }
1119
1120   /* Find identical prefix.  */
1121
1122   word *w0 = filevec[0].buffer;
1123   word *w1 = filevec[1].buffer;
1124   char *buffer0 = (char *) w0;
1125   char *buffer1 = (char *) w1;
1126   char *p0 = buffer0;
1127   char *p1 = buffer1;
1128   idx_t n0 = filevec[0].buffered;
1129   idx_t n1 = filevec[1].buffered;
1130
1131   if (p0 == p1)
1132     /* The buffers are the same; sentinels won't work.  */
1133     p0 = p1 += n1;
1134   else
1135     {
1136       /* Insert end sentinels, in this case characters that are guaranteed
1137          to make the equality test false, and thus terminate the loop.  */
1138
1139       if (n0 < n1)
1140         p0[n0] = ~p1[n0];
1141       else
1142         p1[n1] = ~p0[n1];
1143
1144       /* Loop until first mismatch, or to the sentinel characters.  */
1145
1146       /* Compare a word at a time for speed.  */
1147       while (*w0 == *w1)
1148         w0++, w1++;
1149
1150       /* Do the last few bytes of comparison a byte at a time.  */
1151       p0 = (char *) w0;
1152       p1 = (char *) w1;
1153       while (*p0 == *p1)
1154         p0++, p1++;
1155
1156       /* Don't mistakenly count missing newline as part of prefix.  */
1157       if (robust_output_style (output_style)
1158           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
1159               !=
1160               (buffer1 + n1 - filevec[1].missing_newline < p1)))
1161         p0--, p1--;
1162     }
1163
1164   /* Now P0 and P1 point at the first nonmatching characters.  */
1165
1166   /* Skip back to last line-beginning in the prefix,
1167      and then discard up to HORIZON_LINES lines from the prefix.  */
1168   lin hor = horizon_lines;
1169   while (p0 != buffer0 && (p0[-1] != '\n' || hor--))
1170     p0--, p1--;
1171
1172   /* Record the prefix.  */
1173   filevec[0].prefix_end = p0;
1174   filevec[1].prefix_end = p1;
1175
1176   /* Find identical suffix.  */
1177
1178   /* P0 and P1 point beyond the last chars not yet compared.  */
1179   p0 = buffer0 + n0;
1180   p1 = buffer1 + n1;
1181
1182   if (! robust_output_style (output_style)
1183       || filevec[0].missing_newline == filevec[1].missing_newline)
1184     {
1185       char const *end0 = p0;    /* Addr of last char in file 0.  */
1186
1187       /* Get value of P0 at which we should stop scanning backward:
1188          this is when either P0 or P1 points just past the last char
1189          of the identical prefix.  */
1190       char const *beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
1191
1192       /* Scan back until chars don't match or we reach that point.  */
1193       while (p0 != beg0)
1194         if (*--p0 != *--p1)
1195           {
1196             /* Point at the first char of the matching suffix.  */
1197             ++p0, ++p1;
1198             beg0 = p0;
1199             break;
1200           }
1201
1202       /* Are we at a line-beginning in both files?  If not, add the rest of
1203          this line to the main body.  Discard up to HORIZON_LINES lines from
1204          the identical suffix.  Also, discard one extra line,
1205          because shift_boundaries may need it.  */
1206       lin i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
1207                                 &&
1208                                 (buffer1 == p1 || p1[-1] == '\n'));
1209       while (i-- && p0 != end0)
1210         while (*p0++ != '\n')
1211           continue;
1212
1213       p1 += p0 - beg0;
1214     }
1215
1216   /* Record the suffix.  */
1217   filevec[0].suffix_begin = p0;
1218   filevec[1].suffix_begin = p1;
1219
1220   /* Calculate number of lines of prefix to save.
1221
1222      prefix_count == 0 means save the whole prefix;
1223      we need this for options like -D that output the whole file,
1224      or for enormous contexts (to avoid worrying about arithmetic overflow).
1225      We also need it for options like -F that output some preceding line;
1226      at least we will need to find the last few lines,
1227      but since we don't know how many, it's easiest to find them all.
1228
1229      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
1230      of the line buffer; they'll be moved to the proper location later.
1231      Handle 1 more line than the context says (because we count 1 too many),
1232      rounded up to the next power of 2 to speed index computation.  */
1233
1234   lin alloc_lines0, prefix_count, middle_guess;
1235   if (no_diff_means_no_output && ! function_regexp.fastmap
1236       && context < LIN_MAX / 4 && context < n0)
1237     {
1238       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
1239       lin suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
1240       prefix_count = (lin) 1 << (floor_log2 (context) + 1);
1241       alloc_lines0 = (prefix_count + middle_guess
1242                       + MIN (context, suffix_guess));
1243     }
1244   else
1245     {
1246       prefix_count = 0;
1247       alloc_lines0 = guess_lines (0, 0, n0);
1248     }
1249
1250   lin prefix_mask = prefix_count - 1;
1251   lin lines = 0;
1252   char const **linbuf0 = xinmalloc (alloc_lines0, sizeof *linbuf0);
1253   bool prefix_needed = ! (no_diff_means_no_output
1254                           && filevec[0].prefix_end == p0
1255                           && filevec[1].prefix_end == p1);
1256   p0 = buffer0;
1257
1258   /* If the prefix is needed, find the prefix lines.  */
1259   if (prefix_needed)
1260     {
1261       char const *end0 = filevec[0].prefix_end;
1262       while (p0 != end0)
1263         {
1264           lin l = lines++ & prefix_mask;
1265           if (l == alloc_lines0)
1266             linbuf0 = xpalloc (linbuf0, &alloc_lines0, 1, -1, sizeof *linbuf0);
1267           linbuf0[l] = p0;
1268           while (*p0++ != '\n')
1269             continue;
1270         }
1271     }
1272   lin buffered_prefix = prefix_count && context < lines ? context : lines;
1273
1274   /* Allocate line buffer 1.  */
1275
1276   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
1277   lin suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
1278   lin alloc_lines1;
1279   if (ckd_add (&alloc_lines1, buffered_prefix,
1280                middle_guess + MIN (context, suffix_guess)))
1281     xalloc_die ();
1282   char const **linbuf1 = xnmalloc (alloc_lines1, sizeof *linbuf1);
1283
1284   if (buffered_prefix != lines)
1285     {
1286       /* Rotate prefix lines to proper location.  */
1287       for (lin i = 0;  i < buffered_prefix;  i++)
1288         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
1289       for (lin i = 0;  i < buffered_prefix;  i++)
1290         linbuf0[i] = linbuf1[i];
1291     }
1292
1293   /* Initialize line buffer 1 from line buffer 0.  */
1294   for (lin i = 0; i < buffered_prefix; i++)
1295     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
1296
1297   /* Record the line buffer, adjusted so that
1298      linbuf[0] points at the first differing line.  */
1299   filevec[0].linbuf = linbuf0 + buffered_prefix;
1300   filevec[1].linbuf = linbuf1 + buffered_prefix;
1301   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
1302   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
1303   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
1304   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
1305 }
1306 \f
1307 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
1308    than 2**k.  This table is derived from Chris K. Caldwell's list
1309    <http://www.utm.edu/research/primes/lists/2small/>.  */
1310
1311 static unsigned char const prime_offset[] =
1312 {
1313   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
1314   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
1315   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
1316   55, 93, 1, 57, 25
1317 };
1318
1319 /* Verify that this host's idx_t is not too wide for the above table.  */
1320
1321 static_assert (PTRDIFF_WIDTH - 1 <= sizeof prime_offset);
1322
1323 /* Given a vector of two file_data objects, read the file associated
1324    with each one, and build the table of equivalence classes.
1325    Return nonzero if either file appears to be a binary file.
1326    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
1327
1328 bool
1329 read_files (struct file_data filevec[], bool pretend_binary)
1330 {
1331   bool skip_test = text | pretend_binary;
1332   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
1333
1334   if (filevec[0].desc != filevec[1].desc)
1335     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
1336   else
1337     {
1338       filevec[1].buffer = filevec[0].buffer;
1339       filevec[1].bufsize = filevec[0].bufsize;
1340       filevec[1].buffered = filevec[0].buffered;
1341     }
1342   if (appears_binary)
1343     {
1344       set_binary_mode (filevec[0].desc, O_BINARY);
1345       set_binary_mode (filevec[1].desc, O_BINARY);
1346       return true;
1347     }
1348
1349   find_identical_ends (filevec);
1350
1351   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
1352   equivs = xnmalloc (equivs_alloc, sizeof *equivs);
1353   /* Equivalence class 0 is permanently safe for lines that were not
1354      hashed.  Real equivalence classes start at 1.  */
1355   equivs_index = 1;
1356
1357   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
1358      number between 1/3 and 2/3 of the value of equiv_allocs,
1359      approximately.  */
1360   int p = equivs_alloc <= 256 * 3 ? 9 : floor_log2 (equivs_alloc / 3) + 1;
1361   nbuckets = ((idx_t) 1 << p) - prime_offset[p];
1362   buckets = xicalloc (nbuckets + 1, sizeof *buckets);
1363   buckets++;
1364
1365   for (int i = 0; i < 2; i++)
1366     find_and_hash_each_line (&filevec[i]);
1367
1368   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
1369
1370   free (equivs);
1371   free (buckets - 1);
1372
1373   return false;
1374 }