gnu/dist/diffutils/src/io.c

   1 /*      $NetBSD$        */
   2
   3 /* File I/O for GNU DIFF.
   4
   5    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
   6    Free Software Foundation, Inc.
   7
   8    This file is part of GNU DIFF.
   9
  10    GNU DIFF is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    GNU DIFF is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; see the file COPYING.
  22    If not, write to the Free Software Foundation,
  23    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  24
  25 #include "diff.h"
  26 #include <cmpbuf.h>
  27 #include <regex.h>
  28 #include <setmode.h>
  29 #include <xalloc.h>
  30
  31 /* Rotate an unsigned value to the left.  */
  32 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
  33
  34 /* Given a hash value and a new character, return a new hash value.  */
  35 #define HASH(h, c) ((c) + ROL (h, 7))
  36 \f
  37 /* The type of a hash value.  */
  38 typedef size_t hash_value;
  39 verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
  40
  41 /* Lines are put into equivalence classes of lines that match in lines_differ.
  42    Each equivalence class is represented by one of these structures,
  43    but only while the classes are being computed.
  44    Afterward, each class is represented by a number.  */
  45 struct equivclass
  46 {
  47   lin next;             /* Next item in this bucket.  */
  48   hash_value hash;      /* Hash of lines in this class.  */
  49   char const *line;     /* A line that fits this class.  */
  50   size_t length;        /* That line's length, not counting its newline.  */
  51 };
  52
  53 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  54    buckets[-1] is reserved for incomplete lines.  */
  55 static lin *buckets;
  56
  57 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  58 static size_t nbuckets;
  59
  60 /* Array in which the equivalence classes are allocated.
  61    The bucket-chains go through the elements in this array.
  62    The number of an equivalence class is its index in this array.  */
  63 static struct equivclass *equivs;
  64
  65 /* Index of first free element in the array `equivs'.  */
  66 static lin equivs_index;
  67
  68 /* Number of elements allocated in the array `equivs'.  */
  69 static lin equivs_alloc;
  70 \f
  71 /* Read a block of data into a file buffer, checking for EOF and error.  */
  72
  73 void
  74 file_block_read (struct file_data *current, size_t size)
  75 {
  76   if (size && ! current->eof)
  77     {
  78       size_t s = block_read (current->desc,
  79                              FILE_BUFFER (current) + current->buffered, size);
  80       if (s == SIZE_MAX)
  81         pfatal_with_name (current->name);
  82       current->buffered += s;
  83       current->eof = s < size;
  84     }
  85 }
  86 \f
  87 /* Check for binary files and compare them for exact identity.  */
  88
  89 /* Return 1 if BUF contains a non text character.
  90    SIZE is the number of characters in BUF.  */
  91
  92 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
  93
  94 /* Get ready to read the current file.
  95    Return nonzero if SKIP_TEST is zero,
  96    and if it appears to be a binary file.  */
  97
  98 static bool
  99 sip (struct file_data *current, bool skip_test)
 100 {
 101   /* If we have a nonexistent file at this stage, treat it as empty.  */
 102   if (current->desc < 0)
 103     {
 104       /* Leave room for a sentinel.  */
 105       current->bufsize = sizeof (word);
 106       current->buffer = xmalloc (current->bufsize);
 107     }
 108   else
 109     {
 110       current->bufsize = buffer_lcm (sizeof (word),
 111                                      STAT_BLOCKSIZE (current->stat),
 112                                      PTRDIFF_MAX - 2 * sizeof (word));
 113       current->buffer = xmalloc (current->bufsize);
 114
 115       if (! skip_test)
 116         {
 117           /* Check first part of file to see if it's a binary file.  */
 118
 119           bool was_binary = set_binary_mode (current->desc, 1);
 120           off_t buffered;
 121           file_block_read (current, current->bufsize);
 122           buffered = current->buffered;
 123
 124           if (! was_binary)
 125             {
 126               /* Revert to text mode and seek back to the beginning to
 127                  reread the file.  Use relative seek, since file
 128                  descriptors like stdin might not start at offset
 129                  zero.  */
 130
 131               if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
 132                 pfatal_with_name (current->name);
 133               set_binary_mode (current->desc, 0);
 134               current->buffered = 0;
 135               current->eof = 0;
 136             }
 137
 138           return binary_file_p (current->buffer, buffered);
 139         }
 140     }
 141
 142   current->buffered = 0;
 143   current->eof = 0;
 144   return 0;
 145 }
 146
 147 /* Slurp the rest of the current file completely into memory.  */
 148
 149 static void
 150 slurp (struct file_data *current)
 151 {
 152   size_t cc;
 153
 154   if (current->desc < 0)
 155     {
 156       /* The file is nonexistent.  */
 157       return;
 158     }
 159
 160   if (S_ISREG (current->stat.st_mode))
 161     {
 162       /* It's a regular file; slurp in the rest all at once.  */
 163
 164       /* Get the size out of the stat block.
 165          Allocate just enough room for appended newline plus word sentinel,
 166          plus word-alignment since we want the buffer word-aligned.  */
 167       size_t file_size = current->stat.st_size;
 168       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
 169       if (file_size != current->stat.st_size || cc < file_size
 170           || PTRDIFF_MAX <= cc)
 171         xalloc_die ();
 172
 173       if (current->bufsize < cc)
 174         {
 175           current->bufsize = cc;
 176           current->buffer = xrealloc (current->buffer, cc);
 177         }
 178
 179       /* Try to read at least 1 more byte than the size indicates, to
 180          detect whether the file is growing.  This is a nicety for
 181          users who run 'diff' on files while they are changing.  */
 182
 183       if (current->buffered <= file_size)
 184         {
 185           file_block_read (current, file_size + 1 - current->buffered);
 186           if (current->buffered <= file_size)
 187             return;
 188         }
 189     }
 190
 191   /* It's not a regular file, or it's a growing regular file; read it,
 192      growing the buffer as needed.  */
 193
 194   file_block_read (current, current->bufsize - current->buffered);
 195
 196   if (current->buffered)
 197     {
 198       while (current->buffered == current->bufsize)
 199         {
 200           if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
 201             xalloc_die ();
 202           current->bufsize *= 2;
 203           current->buffer = xrealloc (current->buffer, current->bufsize);
 204           file_block_read (current, current->bufsize - current->buffered);
 205         }
 206
 207       /* Allocate just enough room for appended newline plus word
 208          sentinel, plus word-alignment.  */
 209       cc = current->buffered + 2 * sizeof (word);
 210       current->bufsize = cc - cc % sizeof (word);
 211       current->buffer = xrealloc (current->buffer, current->bufsize);
 212     }
 213 }
 214 \f
 215 /* Split the file into lines, simultaneously computing the equivalence
 216    class for each line.  */
 217
 218 static void
 219 find_and_hash_each_line (struct file_data *current)
 220 {
 221   hash_value h;
 222   unsigned char const *p = (unsigned char const *) current->prefix_end;
 223   unsigned char c;
 224   lin i, *bucket;
 225   size_t length;
 226
 227   /* Cache often-used quantities in local variables to help the compiler.  */
 228   char const **linbuf = current->linbuf;
 229   lin alloc_lines = current->alloc_lines;
 230   lin line = 0;
 231   lin linbuf_base = current->linbuf_base;
 232   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
 233   struct equivclass *eqs = equivs;
 234   lin eqs_index = equivs_index;
 235   lin eqs_alloc = equivs_alloc;
 236   char const *suffix_begin = current->suffix_begin;
 237   char const *bufend = FILE_BUFFER (current) + current->buffered;
 238   bool diff_length_compare_anyway =
 239     ignore_white_space != IGNORE_NO_WHITE_SPACE;
 240   bool same_length_diff_contents_compare_anyway =
 241     diff_length_compare_anyway | ignore_case;
 242
 243   while ((char const *) p < suffix_begin)
 244     {
 245       char const *ip = (char const *) p;
 246
 247       h = 0;
 248
 249       /* Hash this line until we find a newline.  */
 250       if (ignore_case)
 251         switch (ignore_white_space)
 252           {
 253           case IGNORE_ALL_SPACE:
 254             while ((c = *p++) != '\n')
 255               if (! ISSPACE (c))
 256                 h = HASH (h, TOLOWER (c));
 257             break;
 258
 259           case IGNORE_SPACE_CHANGE:
 260             while ((c = *p++) != '\n')
 261               {
 262                 if (ISSPACE (c))
 263                   {
 264                     do
 265                       if ((c = *p++) == '\n')
 266                         goto hashing_done;
 267                     while (ISSPACE (c));
 268
 269                     h = HASH (h, ' ');
 270                   }
 271
 272                 /* C is now the first non-space.  */
 273                 h = HASH (h, TOLOWER (c));
 274               }
 275             break;
 276
 277           case IGNORE_TAB_EXPANSION:
 278             {
 279               size_t column = 0;
 280               while ((c = *p++) != '\n')
 281                 {
 282                   int repetitions = 1;
 283
 284                   switch (c)
 285                     {
 286                     case '\b':
 287                       column -= 0 < column;
 288                       break;
 289
 290                     case '\t':
 291                       c = ' ';
 292                       repetitions = TAB_WIDTH - column % TAB_WIDTH;
 293                       column += repetitions;
 294                       break;
 295
 296                     case '\r':
 297                       column = 0;
 298                       break;
 299
 300                     default:
 301                       c = TOLOWER (c);
 302                       column++;
 303                       break;
 304                     }
 305
 306                   do
 307                     h = HASH (h, c);
 308                   while (--repetitions != 0);
 309                 }
 310             }
 311             break;
 312
 313           default:
 314             while ((c = *p++) != '\n')
 315               h = HASH (h, TOLOWER (c));
 316             break;
 317           }
 318       else
 319         switch (ignore_white_space)
 320           {
 321           case IGNORE_ALL_SPACE:
 322             while ((c = *p++) != '\n')
 323               if (! ISSPACE (c))
 324                 h = HASH (h, c);
 325             break;
 326
 327           case IGNORE_SPACE_CHANGE:
 328             while ((c = *p++) != '\n')
 329               {
 330                 if (ISSPACE (c))
 331                   {
 332                     do
 333                       if ((c = *p++) == '\n')
 334                         goto hashing_done;
 335                     while (ISSPACE (c));
 336
 337                     h = HASH (h, ' ');
 338                   }
 339
 340                 /* C is now the first non-space.  */
 341                 h = HASH (h, c);
 342               }
 343             break;
 344
 345           case IGNORE_TAB_EXPANSION:
 346             {
 347               size_t column = 0;
 348               while ((c = *p++) != '\n')
 349                 {
 350                   int repetitions = 1;
 351
 352                   switch (c)
 353                     {
 354                     case '\b':
 355                       column -= 0 < column;
 356                       break;
 357
 358                     case '\t':
 359                       c = ' ';
 360                       repetitions = TAB_WIDTH - column % TAB_WIDTH;
 361                       column += repetitions;
 362                       break;
 363
 364                     case '\r':
 365                       column = 0;
 366                       break;
 367
 368                     default:
 369                       column++;
 370                       break;
 371                     }
 372
 373                   do
 374                     h = HASH (h, c);
 375                   while (--repetitions != 0);
 376                 }
 377             }
 378             break;
 379
 380           default:
 381             while ((c = *p++) != '\n')
 382               h = HASH (h, c);
 383             break;
 384           }
 385
 386    hashing_done:;
 387
 388       bucket = &buckets[h % nbuckets];
 389       length = (char const *) p - ip - 1;
 390
 391       if ((char const *) p == bufend
 392           && current->missing_newline
 393           && ROBUST_OUTPUT_STYLE (output_style))
 394         {
 395           /* This line is incomplete.  If this is significant,
 396              put the line into buckets[-1].  */
 397           if (ignore_white_space < IGNORE_SPACE_CHANGE)
 398             bucket = &buckets[-1];
 399
 400           /* Omit the inserted newline when computing linbuf later.  */
 401           p--;
 402           bufend = suffix_begin = (char const *) p;
 403         }
 404
 405       for (i = *bucket;  ;  i = eqs[i].next)
 406         if (!i)
 407           {
 408             /* Create a new equivalence class in this bucket.  */
 409             i = eqs_index++;
 410             if (i == eqs_alloc)
 411               {
 412                 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
 413                   xalloc_die ();
 414                 eqs_alloc *= 2;
 415                 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
 416               }
 417             eqs[i].next = *bucket;
 418             eqs[i].hash = h;
 419             eqs[i].line = ip;
 420             eqs[i].length = length;
 421             *bucket = i;
 422             break;
 423           }
 424         else if (eqs[i].hash == h)
 425           {
 426             char const *eqline = eqs[i].line;
 427
 428             /* Reuse existing class if lines_differ reports the lines
 429                equal.  */
 430             if (eqs[i].length == length)
 431               {
 432                 /* Reuse existing equivalence class if the lines are identical.
 433                    This detects the common case of exact identity
 434                    faster than lines_differ would.  */
 435                 if (memcmp (eqline, ip, length) == 0)
 436                   break;
 437                 if (!same_length_diff_contents_compare_anyway)
 438                   continue;
 439               }
 440             else if (!diff_length_compare_anyway)
 441               continue;
 442
 443             if (! lines_differ (eqline, ip))
 444               break;
 445           }
 446
 447       /* Maybe increase the size of the line table.  */
 448       if (line == alloc_lines)
 449         {
 450           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 451           if (PTRDIFF_MAX / 3 <= alloc_lines
 452               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 453               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 454             xalloc_die ();
 455           alloc_lines = 2 * alloc_lines - linbuf_base;
 456           cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
 457           linbuf += linbuf_base;
 458           linbuf = xrealloc (linbuf,
 459                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 460           linbuf -= linbuf_base;
 461         }
 462       linbuf[line] = ip;
 463       cureqs[line] = i;
 464       ++line;
 465     }
 466
 467   current->buffered_lines = line;
 468
 469   for (i = 0;  ;  i++)
 470     {
 471       /* Record the line start for lines in the suffix that we care about.
 472          Record one more line start than lines,
 473          so that we can compute the length of any buffered line.  */
 474       if (line == alloc_lines)
 475         {
 476           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 477           if (PTRDIFF_MAX / 3 <= alloc_lines
 478               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 479               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 480             xalloc_die ();
 481           alloc_lines = 2 * alloc_lines - linbuf_base;
 482           linbuf += linbuf_base;
 483           linbuf = xrealloc (linbuf,
 484                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 485           linbuf -= linbuf_base;
 486         }
 487       linbuf[line] = (char const *) p;
 488
 489       if ((char const *) p == bufend)
 490         break;
 491
 492       if (context <= i && no_diff_means_no_output)
 493         break;
 494
 495       line++;
 496
 497       while (*p++ != '\n')
 498         continue;
 499     }
 500
 501   /* Done with cache in local variables.  */
 502   current->linbuf = linbuf;
 503   current->valid_lines = line;
 504   current->alloc_lines = alloc_lines;
 505   current->equivs = cureqs;
 506   equivs = eqs;
 507   equivs_alloc = eqs_alloc;
 508   equivs_index = eqs_index;
 509 }
 510 \f
 511 /* Prepare the text.  Make sure the text end is initialized.
 512    Make sure text ends in a newline,
 513    but remember that we had to add one.
 514    Strip trailing CRs, if that was requested.  */
 515
 516 static void
 517 prepare_text (struct file_data *current)
 518 {
 519   size_t buffered = current->buffered;
 520   char *p = FILE_BUFFER (current);
 521   char *dst;
 522
 523   if (buffered == 0 || p[buffered - 1] == '\n')
 524     current->missing_newline = 0;
 525   else
 526     {
 527       p[buffered++] = '\n';
 528       current->missing_newline = 1;
 529     }
 530
 531   if (!p)
 532     return;
 533
 534   /* Don't use uninitialized storage when planting or using sentinels.  */
 535   memset (p + buffered, 0, sizeof (word));
 536
 537   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
 538     {
 539       char const *src = dst;
 540       char const *srclim = p + buffered;
 541
 542       do
 543         dst += ! ((*dst = *src++) == '\r' && *src == '\n');
 544       while (src < srclim);
 545
 546       buffered -= src - dst;
 547     }
 548
 549   current->buffered = buffered;
 550 }
 551 \f
 552 /* We have found N lines in a buffer of size S; guess the
 553    proportionate number of lines that will be found in a buffer of
 554    size T.  However, do not guess a number of lines so large that the
 555    resulting line table might cause overflow in size calculations.  */
 556 static lin
 557 guess_lines (lin n, size_t s, size_t t)
 558 {
 559   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
 560   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
 561   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
 562 }
 563
 564 /* Given a vector of two file_data objects, find the identical
 565    prefixes and suffixes of each object.  */
 566
 567 static void
 568 find_identical_ends (struct file_data filevec[])
 569 {
 570   word *w0, *w1;
 571   char *p0, *p1, *buffer0, *buffer1;
 572   char const *end0, *beg0;
 573   char const **linbuf0, **linbuf1;
 574   lin i, lines;
 575   size_t n0, n1;
 576   lin alloc_lines0, alloc_lines1;
 577   lin buffered_prefix, prefix_count, prefix_mask;
 578   lin middle_guess, suffix_guess;
 579
 580   slurp (&filevec[0]);
 581   prepare_text (&filevec[0]);
 582   if (filevec[0].desc != filevec[1].desc)
 583     {
 584       slurp (&filevec[1]);
 585       prepare_text (&filevec[1]);
 586     }
 587   else
 588     {
 589       filevec[1].buffer = filevec[0].buffer;
 590       filevec[1].bufsize = filevec[0].bufsize;
 591       filevec[1].buffered = filevec[0].buffered;
 592       filevec[1].missing_newline = filevec[0].missing_newline;
 593     }
 594
 595   /* Find identical prefix.  */
 596
 597   w0 = filevec[0].buffer;
 598   w1 = filevec[1].buffer;
 599   p0 = buffer0 = (char *) w0;
 600   p1 = buffer1 = (char *) w1;
 601   n0 = filevec[0].buffered;
 602   n1 = filevec[1].buffered;
 603
 604   if (p0 == p1)
 605     /* The buffers are the same; sentinels won't work.  */
 606     p0 = p1 += n1;
 607   else
 608     {
 609       /* Insert end sentinels, in this case characters that are guaranteed
 610          to make the equality test false, and thus terminate the loop.  */
 611
 612       if (n0 < n1)
 613         p0[n0] = ~p1[n0];
 614       else
 615         p1[n1] = ~p0[n1];
 616
 617       /* Loop until first mismatch, or to the sentinel characters.  */
 618
 619       /* Compare a word at a time for speed.  */
 620       while (*w0 == *w1)
 621         w0++, w1++;
 622
 623       /* Do the last few bytes of comparison a byte at a time.  */
 624       p0 = (char *) w0;
 625       p1 = (char *) w1;
 626       while (*p0 == *p1)
 627         p0++, p1++;
 628
 629       /* Don't mistakenly count missing newline as part of prefix.  */
 630       if (ROBUST_OUTPUT_STYLE (output_style)
 631           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
 632               !=
 633               (buffer1 + n1 - filevec[1].missing_newline < p1)))
 634         p0--, p1--;
 635     }
 636
 637   /* Now P0 and P1 point at the first nonmatching characters.  */
 638
 639   /* Skip back to last line-beginning in the prefix,
 640      and then discard up to HORIZON_LINES lines from the prefix.  */
 641   i = horizon_lines;
 642   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
 643     p0--, p1--;
 644
 645   /* Record the prefix.  */
 646   filevec[0].prefix_end = p0;
 647   filevec[1].prefix_end = p1;
 648
 649   /* Find identical suffix.  */
 650
 651   /* P0 and P1 point beyond the last chars not yet compared.  */
 652   p0 = buffer0 + n0;
 653   p1 = buffer1 + n1;
 654
 655   if (! ROBUST_OUTPUT_STYLE (output_style)
 656       || filevec[0].missing_newline == filevec[1].missing_newline)
 657     {
 658       end0 = p0;        /* Addr of last char in file 0.  */
 659
 660       /* Get value of P0 at which we should stop scanning backward:
 661          this is when either P0 or P1 points just past the last char
 662          of the identical prefix.  */
 663       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
 664
 665       /* Scan back until chars don't match or we reach that point.  */
 666       for (; p0 != beg0; p0--, p1--)
 667         if (*p0 != *p1)
 668           {
 669             /* Point at the first char of the matching suffix.  */
 670             beg0 = p0;
 671             break;
 672           }
 673
 674       /* Are we at a line-beginning in both files?  If not, add the rest of
 675          this line to the main body.  Discard up to HORIZON_LINES lines from
 676          the identical suffix.  Also, discard one extra line,
 677          because shift_boundaries may need it.  */
 678       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
 679                             &&
 680                             (buffer1 == p1 || p1[-1] == '\n'));
 681       while (i-- && p0 != end0)
 682         while (*p0++ != '\n')
 683           continue;
 684
 685       p1 += p0 - beg0;
 686     }
 687
 688   /* Record the suffix.  */
 689   filevec[0].suffix_begin = p0;
 690   filevec[1].suffix_begin = p1;
 691
 692   /* Calculate number of lines of prefix to save.
 693
 694      prefix_count == 0 means save the whole prefix;
 695      we need this for options like -D that output the whole file,
 696      or for enormous contexts (to avoid worrying about arithmetic overflow).
 697      We also need it for options like -F that output some preceding line;
 698      at least we will need to find the last few lines,
 699      but since we don't know how many, it's easiest to find them all.
 700
 701      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
 702      of the line buffer; they'll be moved to the proper location later.
 703      Handle 1 more line than the context says (because we count 1 too many),
 704      rounded up to the next power of 2 to speed index computation.  */
 705
 706   if (no_diff_means_no_output && ! function_regexp.fastmap
 707       && context < LIN_MAX / 4 && context < n0)
 708     {
 709       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
 710       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
 711       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
 712         continue;
 713       alloc_lines0 = (prefix_count + middle_guess
 714                       + MIN (context, suffix_guess));
 715     }
 716   else
 717     {
 718       prefix_count = 0;
 719       alloc_lines0 = guess_lines (0, 0, n0);
 720     }
 721
 722   prefix_mask = prefix_count - 1;
 723   lines = 0;
 724   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
 725   p0 = buffer0;
 726
 727   /* If the prefix is needed, find the prefix lines.  */
 728   if (! (no_diff_means_no_output
 729          && filevec[0].prefix_end == p0
 730          && filevec[1].prefix_end == p1))
 731     {
 732       end0 = filevec[0].prefix_end;
 733       while (p0 != end0)
 734         {
 735           lin l = lines++ & prefix_mask;
 736           if (l == alloc_lines0)
 737             {
 738               if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
 739                 xalloc_die ();
 740               alloc_lines0 *= 2;
 741               linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
 742             }
 743           linbuf0[l] = p0;
 744           while (*p0++ != '\n')
 745             continue;
 746         }
 747     }
 748   buffered_prefix = prefix_count && context < lines ? context : lines;
 749
 750   /* Allocate line buffer 1.  */
 751
 752   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
 753   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
 754   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
 755   if (alloc_lines1 < buffered_prefix
 756       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
 757     xalloc_die ();
 758   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
 759
 760   if (buffered_prefix != lines)
 761     {
 762       /* Rotate prefix lines to proper location.  */
 763       for (i = 0;  i < buffered_prefix;  i++)
 764         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
 765       for (i = 0;  i < buffered_prefix;  i++)
 766         linbuf0[i] = linbuf1[i];
 767     }
 768
 769   /* Initialize line buffer 1 from line buffer 0.  */
 770   for (i = 0; i < buffered_prefix; i++)
 771     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
 772
 773   /* Record the line buffer, adjusted so that
 774      linbuf[0] points at the first differing line.  */
 775   filevec[0].linbuf = linbuf0 + buffered_prefix;
 776   filevec[1].linbuf = linbuf1 + buffered_prefix;
 777   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
 778   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
 779   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
 780   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
 781 }
 782 \f
 783 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
 784    than 2**k.  This table is derived from Chris K. Caldwell's list
 785    <http://www.utm.edu/research/primes/lists/2small/>.  */
 786
 787 static unsigned char const prime_offset[] =
 788 {
 789   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
 790   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
 791   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
 792   55, 93, 1, 57, 25
 793 };
 794
 795 /* Verify that this host's size_t is not too wide for the above table.  */
 796
 797 verify (enough_prime_offsets,
 798         sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
 799
 800 /* Given a vector of two file_data objects, read the file associated
 801    with each one, and build the table of equivalence classes.
 802    Return nonzero if either file appears to be a binary file.
 803    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
 804
 805 bool
 806 read_files (struct file_data filevec[], bool pretend_binary)
 807 {
 808   int i;
 809   bool skip_test = text | pretend_binary;
 810   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
 811
 812   if (filevec[0].desc != filevec[1].desc)
 813     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
 814   else
 815     {
 816       filevec[1].buffer = filevec[0].buffer;
 817       filevec[1].bufsize = filevec[0].bufsize;
 818       filevec[1].buffered = filevec[0].buffered;
 819     }
 820   if (appears_binary)
 821     {
 822       set_binary_mode (filevec[0].desc, 1);
 823       set_binary_mode (filevec[1].desc, 1);
 824       return 1;
 825     }
 826
 827   find_identical_ends (filevec);
 828
 829   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
 830   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
 831     xalloc_die ();
 832   equivs = xmalloc (equivs_alloc * sizeof *equivs);
 833   /* Equivalence class 0 is permanently safe for lines that were not
 834      hashed.  Real equivalence classes start at 1.  */
 835   equivs_index = 1;
 836
 837   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
 838      number between 1/3 and 2/3 of the value of equiv_allocs,
 839      approximately.  */
 840   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
 841     continue;
 842   nbuckets = ((size_t) 1 << i) - prime_offset[i];
 843   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
 844     xalloc_die ();
 845   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
 846   buckets++;
 847
 848   for (i = 0; i < 2; i++)
 849     find_and_hash_each_line (&filevec[i]);
 850
 851   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
 852
 853   free (equivs);
 854   free (buckets - 1);
 855
 856   return 0;
 857 }