src/analyze.c

   1 /* Analyze file differences for GNU DIFF.
   2
   3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006-2007,
   4    2009-2013, 2015-2025 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    This program is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  20
  21 #include "diff.h"
  22
  23 #include <cmpbuf.h>
  24 #include <diagnose.h>
  25 #include <error.h>
  26 #include <file-type.h>
  27 #include <xalloc.h>
  28
  29 /* The core of the Diff algorithm.
        The definitions below precede the inclusion of <diffseq.h>;
        presumably that header reads them as its configuration and
        supplies 'struct context' and 'compareseq', both of which
        diff_2_files uses -- confirm against the gnulib diffseq.h
        documentation.  */
     /* Sequence elements and offsets both have type lin, and two
        elements are considered equal exactly when they compare ==.  */
  30 #define ELEMENT lin
  31 #define EQUAL(x,y) ((x) == (y))
  32 #define OFFSET lin
  33 #define OFFSET_MAX LIN_MAX
  34 #define EXTRA_CONTEXT_FIELDS /* none */
     /* NOTE_DELETE and NOTE_INSERT ignore their first argument C.
        X (resp. Y) is an index among the undiscarded lines of file 0
        (resp. file 1); 'realindexes' turns it into a real line number,
        and that line's 'changed' flag is set in CURR.
        CURR is declared outside the part of the file shown here;
        diff_2_files assigns *CMP to it just before calling compareseq.  */
  35 #define NOTE_DELETE(c, x) \
  36   (curr.file[0].changed[curr.file[0].realindexes[x]] = true)
  37 #define NOTE_INSERT(c, y) \
  38   (curr.file[1].changed[curr.file[1].realindexes[y]] = true)
     /* NOTE(review): presumably enables the code path selected by
        ctxt.heuristic, which diff_2_files sets from speed_large_files;
        confirm in diffseq.h.  */
  39 #define USE_HEURISTIC
  40 #include <diffseq.h>
  41
  42 /* Discard lines from one file that have no matches in the other file.
  43
  44    A line which is discarded will not be considered by the actual
  45    comparison algorithm; it will be as if that line were not in the file.
  46    The file's 'realindexes' table maps virtual line numbers
  47    (which don't count the discarded lines) into real line numbers;
  48    this is how the actual comparison algorithm produces results
  49    that are comprehensible when the discarded lines are counted.
  50
  51    When we discard a line, we also mark it as a deletion or insertion
  52    so that it will be printed in the output.

        FILEVEC[0] and FILEVEC[1] describe the two files.  For each file
        this reads 'buffered_lines' and 'equivs' (plus FILEVEC[0].equiv_max)
        and sets 'undiscarded', 'realindexes' and 'nondiscarded_lines';
        it also sets 'changed[I]' for each real line I that is discarded,
        so 'changed' must already be allocated.
        All four result tables live in a single allocation that starts at
        FILEVEC[0].undiscarded.  When the global 'minimal' is nonzero,
        every line is kept.  */
  53
  54 static void
  55 discard_confusing_lines (struct file_data filevec[])
  56 {
  57   /* Allocate our results.  */
       /* One block holds, in order: file 0's undiscarded and realindexes
          tables, then file 1's.  Each table has room for every buffered
          line of its file.  */
  58   lin *p = xinmalloc (filevec[0].buffered_lines + filevec[1].buffered_lines,
  59                       2 * sizeof *p);
  60   for (int f = 0; f < 2; f++)
  61     {
  62       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
  63       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
  64     }
  65
  66   /* Set up equiv_count[F][I] as the number of lines in file F
  67      that fall in equivalence class I.  */
  68
       /* Both halves are sized by FILEVEC[0].equiv_max.  This assumes every
          class number appearing in either file's 'equivs' is below that
          value -- TODO confirm where equiv_max is set.  */
  69   p = xicalloc (filevec[0].equiv_max, 2 * sizeof *p);
  70   lin *equiv_count[2];
  71   equiv_count[0] = p;
  72   equiv_count[1] = p + filevec[0].equiv_max;
  73
  74   for (lin i = 0; i < filevec[0].buffered_lines; i++)
  75     ++equiv_count[0][filevec[0].equivs[i]];
  76   for (lin i = 0; i < filevec[1].buffered_lines; i++)
  77     ++equiv_count[1][filevec[1].equivs[i]];
  78
  79   /* Set up tables of which lines are going to be discarded.  */
       /* Each entry is 0 (keep the line), 1 (discard it) or
          2 (provisionally discardable; settled by the second pass below).
          Both tables share one zero-initialized allocation.  */
  80
  81   char *discarded[2];
  82   discarded[0] = xizalloc (filevec[0].buffered_lines
  83                            + filevec[1].buffered_lines);
  84   discarded[1] = discarded[0] + filevec[0].buffered_lines;
  85
  86   /* Mark to be discarded each line that matches no line of the other file.
  87      If a line matches many lines, mark it as provisionally discardable.  */
  88
  89   for (int f = 0; f < 2; f++)
  90     {
  91       lin end = filevec[f].buffered_lines;
  92       char *discards = discarded[f];
           /* COUNTS is the other file's table, so counts[C] is the number
              of lines in the other file that belong to class C.  */
  93       lin *counts = equiv_count[1 - f];
  94       lin *equivs = filevec[f].equivs;
  95       lin many = 5;
  96
  97       /* Multiply MANY by approximate square root of number of lines.
  98          That is the threshold for provisionally discardable lines.  */
           /* Files shorter than 64 lines keep the threshold at 5, which
              also avoids a negative shift count.  */
  99       many <<= end < 64 ? 0 : (floor_log2 (end) >> 1) - 3;
 100
 101       for (lin i = 0; i < end; i++)
 102         {
               /* Lines in equivalence class 0 are never discarded here.
                  NOTE(review): presumably class 0 is reserved by the code
                  that assigns classes; confirm there.  */
 103           if (equivs[i] == 0)
 104             continue;
 105           lin nmatch = counts[equivs[i]];
 106           if (nmatch == 0)
 107             discards[i] = 1;
 108           else if (nmatch > many)
 109             discards[i] = 2;
 110         }
 111     }
 112
 113   /* Don't really discard the provisional lines except when they occur
 114      in a run of discardables, with nonprovisionals at the beginning
 115      and end.  */
 116
 117   for (int f = 0; f < 2; f++)
 118     {
 119       lin end = filevec[f].buffered_lines;
 120       char *discards = discarded[f];
 121
 122       for (lin i = 0; i < end; i++)
 123         {
 124           /* Cancel provisional discards not in middle of run of discards.  */
 125           if (discards[i] == 2)
 126             discards[i] = 0;
 127           else if (discards[i] != 0)
 128             {
 129               /* We have found a nonprovisional discard.  */
 130               lin provisional = 0, j;
 131
 132               /* Find end of this run of discardable lines.
 133                  Count how many are provisionally discardable.  */
 134               for (j = i; j < end; j++)
 135                 {
 136                   if (discards[j] == 0)
 137                     break;
 138                   if (discards[j] == 2)
 139                     ++provisional;
 140                 }
 141
 142               /* Cancel provisional discards at end, and shrink the run.  */
 143               while (j > i && discards[j - 1] == 2)
 144                 discards[--j] = 0, --provisional;
 145
 146               /* Now we have the length of a run of discardable lines
 147                  whose first and last are not provisional.  */
 148               lin length = j - i;
 149
 150               /* If more than 1/4 of the lines in the run are provisional,
 151                  cancel discarding of all provisional lines in the run.  */
                   /* In this branch I is not advanced past the run; the outer
                      loop revisits its remaining lines, which by then hold
                      only 0 or 1.  */
 152               if (length >> 2 < provisional)
 153                 {
 154                   while (j > i)
 155                     if (discards[--j] == 2)
 156                       discards[j] = 0;
 157                 }
 158               else
 159                 {
 160                   /* MINIMUM is approximate square root of LENGTH/4.
 161                      A subrun of two or more provisionals can stand
 162                      when LENGTH is at least 16.
 163                      A subrun of 4 or more can stand when LENGTH >= 64.  */
                       /* The LENGTH < 4 case is handled separately so that the
                          shift count below is never negative.  */
 164                   lin minimum =
 165                     (length < 4 ? 2
 166                      : ((lin) 1 << ((floor_log2 (length) >> 1) - 1)) + 1);
 167
 168                   /* Cancel any subrun of MINIMUM or more provisionals
 169                      within the larger run.  */
                       /* CONSEC is deliberately not reset after backing up, so
                          the rescan of the subrun takes the MINIMUM < CONSEC
                          branch and cancels each of its lines.  */
 170                   lin consec = 0;
 171                   for (j = 0; j < length; j++)
 172                     if (discards[i + j] != 2)
 173                       consec = 0;
 174                     else if (minimum == ++consec)
 175                       /* Back up to start of subrun, to cancel it all.  */
 176                       j -= consec;
 177                     else if (minimum < consec)
 178                       discards[i + j] = 0;
 179
 180                   /* Scan from beginning of run
 181                      until we find 3 or more nonprovisionals in a row
 182                      or until the first nonprovisional at least 8 lines in.
 183                      Until that point, cancel any provisionals.  */
 184                   for (j = 0, consec = 0; j < length; j++)
 185                     {
 186                       if (j >= 8 && discards[i + j] == 1)
 187                         break;
 188                       if (discards[i + j] == 2)
 189                         consec = 0, discards[i + j] = 0;
 190                       else if (discards[i + j] == 0)
 191                         consec = 0;
 192                       else
 193                         consec++;
 194                       if (consec == 3)
 195                         break;
 196                     }
 197
 198                   /* I advances to the last line of the run.  */
 199                   i += length - 1;
 200
 201                   /* Same thing, from end.  */
 202                   for (j = 0, consec = 0; j < length; j++)
 203                     {
 204                       if (j >= 8 && discards[i - j] == 1)
 205                         break;
 206                       if (discards[i - j] == 2)
 207                         consec = 0, discards[i - j] = 0;
 208                       else if (discards[i - j] == 0)
 209                         consec = 0;
 210                       else
 211                         consec++;
 212                       if (consec == 3)
 213                         break;
 214                     }
 215                 }
 216             }
 217         }
 218     }
 219
 220   /* Actually discard the lines.  */
       /* Kept lines are copied to 'undiscarded' (as equivalence classes)
          with their real index recorded in 'realindexes'; every other line
          is flagged in 'changed'.  When 'minimal' is nonzero all lines are
          kept, whatever the discard tables say.  */
 221   for (int f = 0; f < 2; f++)
 222     {
 223       char *discards = discarded[f];
 224       lin end = filevec[f].buffered_lines;
 225       lin j = 0;
 226       for (lin i = 0; i < end; i++)
 227         if (minimal || discards[i] == 0)
 228           {
 229             filevec[f].undiscarded[j] = filevec[f].equivs[i];
 230             filevec[f].realindexes[j++] = i;
 231           }
 232         else
 233           filevec[f].changed[i] = true;
 234       filevec[f].nondiscarded_lines = j;
 235     }
 236
       /* Each of these pointers is the start of an allocation shared by
          both files' tables, so two frees release all scratch space.  */
 237   free (discarded[0]);
 238   free (equiv_count[0]);
 239 }
 240 \f
 241 /* Adjust inserts/deletes of identical lines to join changes
 242    as much as possible.
 243
 244    We do something when a run of changed lines includes a
 245    line at one end and has an excluded, identical line at the other.
 246    We are free to choose which identical line is included.
 247    'compareseq' usually chooses the one at the beginning,
 248    but usually it is cleaner to consider the following identical line
 249    to be the "change".

        FILEVEC[0] and FILEVEC[1] describe the two files; each file's
        'changed' flags are rewritten in place, using its 'equivs' and
        'buffered_lines'.  The scans below read one element before and
        one element after each 'changed' vector and rely on those
        elements being false; diff_2_files allocates them.  */
 250
 251 static void
 252 shift_boundaries (struct file_data filevec[])
 253 {
 254   for (int f = 0; f < 2; f++)
 255     {
           /* Only file F's flags are modified in this pass; the other
              file's flags are read to keep J in step.  */
 256       bool *changed = filevec[f].changed;
 257       bool *other_changed = filevec[1 - f].changed;
 258       lin const *equivs = filevec[f].equivs;
           /* I is a line number in file F and J is the corresponding
              line number in the other file.  */
 259       lin i = 0;
 260       lin j = 0;
 261       lin i_end = filevec[f].buffered_lines;
 262
 263       while (true)
 264         {
 265           /* Scan forwards to find beginning of another run of changes.
 266              Also keep track of the corresponding point in the other file.  */
 267
 268           while (i < i_end && !changed[i])
 269             {
                   /* Skip the other file's changed lines, then step over
                      the one unchanged line that pairs with line I.  */
 270               while (other_changed[j++])
 271                 continue;
 272               i++;
 273             }
 274
 275           if (i == i_end)
 276             break;
 277
 278           lin start = i;
 279
 280           /* Find the end of this run of changes.  */
 281
               /* This stops at the latest on the false element that follows
                  the last line.  Afterwards [START, I) is the run.  */
 282           while (changed[++i])
 283             continue;
 284           while (other_changed[j])
 285             j++;
 286
 287           lin runlength, corresponding;
 288
 289           do
 290             {
 291               /* Record the length of this run of changes, so that
 292                  we can later determine whether the run has grown.  */
 293               runlength = i - start;
 294
 295               /* Move the changed region back, so long as the
 296                  previous unchanged line matches the last changed one.
 297                  This merges with previous changed regions.  */
 298
 299               while (start && equivs[start - 1] == equivs[i - 1])
 300                 {
 301                   changed[--start] = true;
 302                   changed[--i] = false;
                       /* Absorb any changed run now adjacent on the left;
                          the false element before line 0 ends this scan.  */
 303                   while (changed[start - 1])
 304                     start--;
 305                   while (other_changed[--j])
 306                     continue;
 307                 }
 308
 309               /* Set CORRESPONDING to the end of the changed run, at the last
 310                  point where it corresponds to a changed run in the other file.
 311                  CORRESPONDING == I_END means no such point has been found.  */
 312               corresponding = other_changed[j - 1] ? i : i_end;
 313
 314               /* Move the changed region forward, so long as the
 315                  first changed line matches the following unchanged one.
 316                  This merges with following changed regions.
 317                  Do this second, so that if there are no merges,
 318                  the changed region is moved forward as far as possible.  */
 319
 320               while (i != i_end && equivs[start] == equivs[i])
 321                 {
 322                   changed[start++] = false;
 323                   changed[i++] = true;
 324                   while (changed[i])
 325                     i++;
 326                   while (other_changed[++j])
 327                     corresponding = i;
 328                 }
 329             }
               /* Repeat while the last pass merged in a neighboring run,
                  since the longer run may be able to slide further.  */
 330           while (runlength != i - start);
 331
 332           /* If possible, move the fully-merged run of changes
 333              back to a corresponding run in the other file.  */
 334
 335           while (corresponding < i)
 336             {
 337               changed[--start] = true;
 338               changed[--i] = false;
 339               while (other_changed[--j])
 340                 continue;
 341             }
 342         }
 343     }
 344 }
 345 \f
 346 /* Cons an additional entry onto the front of an edit script OLD.
 347    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
 348    DELETED is the number of lines deleted here from file 0.
 349    INSERTED is the number of lines inserted here in file 1.
 350
 351    If DELETED is 0 then LINE0 is the number of the line before
 352    which the insertion was done; vice versa for INSERTED and LINE1.  */
 353
 354 static struct change *
 355 add_change (lin line0, lin line1, lin deleted, lin inserted,
 356             struct change *old)
 357 {
 358   struct change *new = xmalloc (sizeof *new);
 359
 360   new->line0 = line0;
 361   new->line1 = line1;
 362   new->inserted = inserted;
 363   new->deleted = deleted;
 364   new->link = old;
 365   return new;
 366 }
 367
 368 /* Scan the tables of which lines are inserted and deleted,
 369    producing an edit script in reverse order.  */
 370
 371 static struct change *
 372 build_reverse_script (struct file_data const filevec[])
 373 {
 374   struct change *script = nullptr;
 375   bool *changed0 = filevec[0].changed;
 376   bool *changed1 = filevec[1].changed;
 377   lin len0 = filevec[0].buffered_lines;
 378   lin len1 = filevec[1].buffered_lines;
 379
 380   /* Note that changedN[lenN] does exist, and is 0.  */
 381
 382   lin i0 = 0, i1 = 0;
 383
 384   while (i0 < len0 || i1 < len1)
 385     {
 386       if (changed0[i0] | changed1[i1])
 387         {
 388           lin line0 = i0, line1 = i1;
 389
 390           /* Find # lines changed here in each file.  */
 391           while (changed0[i0]) ++i0;
 392           while (changed1[i1]) ++i1;
 393
 394           /* Record this change.  */
 395           script = add_change (line0, line1, i0 - line0, i1 - line1, script);
 396         }
 397
 398       /* We have reached lines in the two files that match each other.  */
 399       i0++, i1++;
 400     }
 401
 402   return script;
 403 }
 404
 405 /* Scan the tables of which lines are inserted and deleted,
 406    producing an edit script in forward order.  */
 407
 408 static struct change *
 409 build_script (struct file_data const filevec[])
 410 {
 411   struct change *script = nullptr;
 412   bool *changed0 = filevec[0].changed;
 413   bool *changed1 = filevec[1].changed;
 414   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
 415
 416   /* Note that changedN[-1] does exist, and is 0.  */
 417
 418   while (i0 >= 0 || i1 >= 0)
 419     {
 420       if (changed0[i0 - 1] | changed1[i1 - 1])
 421         {
 422           lin line0 = i0, line1 = i1;
 423
 424           /* Find # lines changed here in each file.  */
 425           while (changed0[i0 - 1]) --i0;
 426           while (changed1[i1 - 1]) --i1;
 427
 428           /* Record this change.  */
 429           script = add_change (i0, i1, line0 - i0, line1 - i1, script);
 430         }
 431
 432       /* We have reached lines in the two files that match each other.  */
 433       i0--, i1--;
 434     }
 435
 436   return script;
 437 }
 438 \f
 439 /* If CHANGES, briefly report that two files differed.  */
 440 static void
 441 briefly_report (int changes, struct file_data const filevec[])
 442 {
 443   if (changes)
 444     message ((brief
 445               ? N_("Files %s and %s differ\n")
 446               : N_("Binary files %s and %s differ\n")),
 447              file_label[0] ? file_label[0] : squote (0, filevec[0].name),
 448              file_label[1] ? file_label[1] : squote (1, filevec[1].name));
 449 }
 450
 451 /* Report the differences of two files.
        CMP->file[0] and CMP->file[1] describe the files to compare.

        If read_files reports that the files can be compared as binary,
        compare them by size and then block by block, and report only
        whether they differ.  Otherwise compare them line by line, build
        an edit script, and print it in the selected output style (or
        report briefly if 'brief' is set).

        Return 0 if no differences were found and 1 if some were.
        On the line-by-line path, return 2 instead if the output style is
        not robust and either file lacks a newline at end of file.

        Before returning, free both files' buffers and, on the
        line-by-line path, their 'equivs' and 'linbuf' tables too.  */
 452 int
 453 diff_2_files (struct comparison *cmp)
 454 {
 455   int changes;
 456
 457   /* If we have detected that either file is binary,
 458      compare the two files as binary.  This can happen
 459      only when the first chunk is read.
 460      Also, --brief without any --ignore-* options means
 461      we can speed things up by treating the files as binary.  */
 462
 463   if (read_files (cmp->file, files_can_be_treated_as_binary))
 464     {
 465       /* Files with different lengths must be different.  */
           /* The size test is trusted only when both sizes are nonnegative
              and each file either has a negative descriptor or is a
              regular file.  */
 466       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
 467           && 0 <= cmp->file[0].stat.st_size
 468           && 0 <= cmp->file[1].stat.st_size
 469           && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
 470           && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
 471         changes = 1;
 472
 473       /* Standard input equals itself.  */
 474       else if (cmp->file[0].desc == cmp->file[1].desc)
 475         changes = 0;
 476
 477       else
 478         /* Scan both files, a buffer at a time, looking for a difference.  */
 479         {
 480           /* Allocate same-sized buffers for both files.  */
               /* A block size that is negative or does not fit in idx_t is
                  replaced by 0.  NOTE(review): buffer_lcm presumably returns
                  a common multiple of its first two arguments capped by its
                  third; confirm in cmpbuf.h.  */
 481           idx_t lcm_max = IDX_MAX - 1, blksize[2];
 482           for (int f = 0; f < 2; f++)
 483             if (ST_BLKSIZE (cmp->file[f].stat) < 0
 484                 || ckd_add (&blksize[f], ST_BLKSIZE (cmp->file[f].stat), 0))
 485               blksize[f] = 0;
 486           idx_t buffer_size =
 487             buffer_lcm (sizeof (word),
 488                         buffer_lcm (blksize[0], blksize[1], lcm_max),
 489                         lcm_max);
 490           for (int f = 0; f < 2; f++)
 491             cmp->file[f].buffer = xirealloc (cmp->file[f].buffer, buffer_size);
 492
               /* The first pass tops up whatever is already buffered
                  ('buffered' is not reset beforehand); every later pass
                  starts from empty buffers.  */
 493           for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
 494             {
 495               /* Read a buffer's worth from both files.  */
 496               for (int f = 0; f < 2; f++)
 497                 if (0 <= cmp->file[f].desc)
 498                   file_block_read (&cmp->file[f],
 499                                    buffer_size - cmp->file[f].buffered);
 500
 501               /* If the buffers differ, the files differ.  */
 502               if (cmp->file[0].buffered != cmp->file[1].buffered
 503                   || memcmp (cmp->file[0].buffer,
 504                              cmp->file[1].buffer,
 505                              cmp->file[0].buffered))
 506                 {
 507                   changes = 1;
 508                   break;
 509                 }
 510
 511               /* If we reach end of file, the files are the same.  */
                   /* Both buffers hold the same amount here, so a short
                      buffer in file 0 means both files ended together.  */
 512               if (cmp->file[0].buffered != buffer_size)
 513                 {
 514                   changes = 0;
 515                   break;
 516                 }
 517             }
 518         }
 519
 520       briefly_report (changes, cmp->file);
 521     }
 522   else
 523     {
 524       /* Allocate vectors for the results of comparison:
 525          a flag for each line of each file, saying whether that line
 526          is an insertion or deletion.
 527          Allocate an extra false element at each end of each vector.  */
 528
           /* Layout, with N0 and N1 the two line counts:
              [0] guard, [1 .. N0] file 0, [N0+1] guard,
              [N0+2] guard, [N0+3 .. N0+N1+2] file 1, [N0+N1+3] guard.
              xizalloc zero-fills, so every flag and guard starts false.  */
 529       bool *flag_space = xizalloc (cmp->file[0].buffered_lines
 530                                    + cmp->file[1].buffered_lines + 4);
 531       cmp->file[0].changed = flag_space + 1;
 532       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
 533
 534       /* Some lines are obviously insertions or deletions
 535          because they don't match anything.  Detect them now, and
 536          avoid even thinking about them in the main comparison algorithm.  */
 537
 538       discard_confusing_lines (cmp->file);
 539
 540       /* Now do the main comparison algorithm, considering just the
 541          undiscarded lines.  */
 542
           /* fdiag and bdiag are the two halves of one allocation of
              2 * DIAGS elements.  Each pointer is advanced by
              (file 1's nondiscarded line count + 1), so negative indexes
              down to minus that amount stay inside its half.  */
 543       struct context ctxt;
 544       ctxt.xvec = cmp->file[0].undiscarded;
 545       ctxt.yvec = cmp->file[1].undiscarded;
 546       lin diags = (cmp->file[0].nondiscarded_lines
 547                    + cmp->file[1].nondiscarded_lines + 3);
 548       ctxt.fdiag = xinmalloc (diags, 2 * sizeof *ctxt.fdiag);
 549       ctxt.bdiag = ctxt.fdiag + diags;
 550       ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
 551       ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
 552
 553       ctxt.heuristic = speed_large_files;
 554
 555       /* Set TOO_EXPENSIVE to be the approximate square root of the
 556          input size, bounded below by 4096.  4096 seems to be good for
 557          circa-2016 CPUs; see Bug#16848 and Bug#24715.  */
 558       lin too_expensive = (lin) 1 << ((floor_log2 (diags) >> 1) + 1);
 559       ctxt.too_expensive = MAX (4096, too_expensive);
 560
           /* NOTE_DELETE and NOTE_INSERT record their results through CURR,
              so it must describe this comparison before compareseq runs.
              The copy shares the 'changed' and 'realindexes' tables set
              up above, because only the pointers are copied.  */
 561       curr = *cmp;
 562
 563       compareseq (0, cmp->file[0].nondiscarded_lines,
 564                   0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
 565
           /* Undo the offset applied to fdiag to recover the pointer that
              xinmalloc returned; this releases bdiag's half as well.  */
 566       free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
 567
 568       /* Modify the results slightly to make them prettier
 569          in cases where that can validly be done.  */
 570
 571       shift_boundaries (cmp->file);
 572
 573       /* Get the results of comparison in the form of a chain
 574          of 'struct change's -- an edit script.  */
           /* Only OUTPUT_ED gets the script in reverse order.  */
 575       struct change *script = ((output_style == OUTPUT_ED
 576                                 ? build_reverse_script
 577                                 : build_script)
 578                                (cmp->file));
 579
 580       /* Set CHANGES if we had any diffs.
 581          If some changes are ignored, we must scan the script to decide.  */
 582       if (ignore_blank_lines || ignore_regexp.fastmap)
 583         {
 584           changes = 0;
 585
               /* Stop at the first hunk that analyze_hunk reports as a
                  real difference.  */
 586           for (struct change *next = script; next && changes == 0; )
 587             {
 588               /* Find a set of changes that belong together.  */
 589               struct change *this = next;
 590               struct change *end = find_change (next);
 591
 592               /* Disconnect them from the rest of the changes, making them
 593                  a hunk, and remember the rest for next iteration.  */
 594               next = end->link;
 595               end->link = 0;
 596
 597               /* Determine whether this hunk is really a difference.  */
                   /* The four line numbers are outputs that are not used
                      here; only the return value matters.  */
 598               lin first0, last0, first1, last1;
 599               if (analyze_hunk (this, &first0, &last0, &first1, &last1))
 600                 changes = 1;
 601
 602               /* Reconnect the script so it will all be freed properly.  */
 603               end->link = next;
 604             }
 605         }
 606       else
 607         changes = (script != 0);
 608
 609       if (brief)
 610         briefly_report (changes, cmp->file);
 611       else
 612         {
 613           if (changes || !no_diff_means_no_output)
 614             {
 615               /* Record info for starting up output,
 616                  to be used if and when we have some output to print.  */
 617               setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
 618                             file_label[1] ? file_label[1] : cmp->file[1].name,
 619                             cmp->parent != &noparent);
 620
 621               switch (output_style)
 622                 {
 623                 case OUTPUT_CONTEXT:
 624                   print_context_script (script, false);
 625                   break;
 626
 627                 case OUTPUT_UNIFIED:
 628                   print_context_script (script, true);
 629                   break;
 630
 631                 case OUTPUT_ED:
 632                   print_ed_script (script);
 633                   break;
 634
 635                 case OUTPUT_FORWARD_ED:
 636                   pr_forward_ed_script (script);
 637                   break;
 638
 639                 case OUTPUT_RCS:
 640                   print_rcs_script (script);
 641                   break;
 642
 643                 case OUTPUT_NORMAL:
 644                   print_normal_script (script);
 645                   break;
 646
 647                 case OUTPUT_IFDEF:
 648                   print_ifdef_script (script);
 649                   break;
 650
 651                 case OUTPUT_SDIFF:
 652                   print_sdiff_script (script);
 653                   break;
 654
 655                 default:
 656                   unreachable ();
 657                 }
 658
 659               finish_output ();
 660             }
 661         }
 662
           /* discard_confusing_lines put both files' 'undiscarded' and
              'realindexes' tables in one allocation that starts at file 0's
              'undiscarded', so this single free releases all four.  */
 663       free (cmp->file[0].undiscarded);
 664
 665       free (flag_space);
 666
 667       for (int f = 0; f < 2; f++)
 668         {
 669           free (cmp->file[f].equivs);
               /* NOTE(review): adding linbuf_base presumably recovers the
                  pointer originally allocated for linbuf; confirm where
                  linbuf is set up.  */
 670           free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
 671         }
 672
 673       for (struct change *e = script; e; )
 674         {
 675           struct change *p = e->link;
 676           free (e);
 677           e = p;
 678         }
 679
           /* For output styles that are not robust, a missing final
              newline in either file is diagnosed and overrides the
              result with 2.  */
 680       if (! robust_output_style (output_style))
 681         for (int f = 0; f < 2; f++)
 682           if (cmp->file[f].missing_newline)
 683             {
 684               error (0, 0, "%s: %s\n",
 685                      (file_label[f] ? file_label[f]
 686                       : squote (0, cmp->file[f].name)),
 687                      _("No newline at end of file"));
 688               changes = 2;
 689             }
 690     }
 691
       /* The two 'buffer' pointers may be equal, in which case the buffer
          is freed only once.  */
 692   if (cmp->file[0].buffer != cmp->file[1].buffer)
 693     free (cmp->file[0].buffer);
 694   free (cmp->file[1].buffer);
 695
 696   return changes;
 697 }