diff.c

   1 /*
   2  * wiggle - apply rejected patches
   3  *
   4  * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
   5  * Copyright (C) 2011 Neil Brown <neilb@suse.de>
   6  *
   7  *
   8  *    This program is free software; you can redistribute it and/or modify
   9  *    it under the terms of the GNU General Public License as published by
  10  *    the Free Software Foundation; either version 2 of the License, or
  11  *    (at your option) any later version.
  12  *
  13  *    This program is distributed in the hope that it will be useful,
  14  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  *    GNU General Public License for more details.
  17  *
  18  *    You should have received a copy of the GNU General Public License
  19  *    along with this program; if not, write to the Free Software Foundation, Inc.,
  20  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  *    Author: Neil Brown
  23  *    Email: <neilb@suse.de>
  24  */
  25
  26 /*
  27  * Calculate longest common subsequence between two sequences
  28  *
  29  * Each sequence contains strings with
  30  *   hash start length
  31  * We produce a list of tripples: a b len
  32  * where A and B point to elements in the two sequences, and len is the number
  33  * of common elements there.  The list is terminated by an entry with len==0.
  34  *
  35  * This is roughly based on
  36  *   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  37  *   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  38  * http://xmailserver.org/diff2.pdf
  39  *
  40  * However we don't run the basic algorithm both forward and backward until
  41  * we find an overlap as Myers suggests.  Rather we always run forwards, but
  42  * we record the location of the (possibly empty) snake that crosses the
  43  * midline.  When we finish, this recorded location for the best path shows
  44  * us where to divide and find further midpoints.
  45  *
  46  * In brief, the algorithm is as follows.
  47  *
  48  * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
  49  * the first sequence (A, length a) and y co-ordinates correspond to symbols
  50  * in the second sequence (B, length b).  At the origin we have the first
  51  * sequence.
  52  * Movement in the x direction represents deleting the symbol as that point,
  53  * so from x=i-1 to x=i deletes symbol i from A.
  54
  55  * Movement in the y direction represents adding the corresponding symbol
  56  * from B.  So to move from the origin 'a' spaces along X and then 'b' spaces
  57  * up Y will remove all of the first sequence and then add all of the second
  58  * sequence.  Similarly moving firstly up the Y axis, then along the X
  59  * direction will add the new sequence, then remove the old sequence.  Thus
  60  * the point a,b represents the second sequence and a part from 0,0 to a,b
  61  * represent an sequence of edits to change A into B.
  62  *
  63  * There are clearly many paths from 0,0 to a,b going through different
  64  * points in the matrix in different orders.  At some points in the matrix
  65  * the next symbol to be added from B is the same as the next symbol to be
  66  * removed from A.  At these points we can take a diagonal step to a new
  67  * point in the matrix without actually changing any symbol.  A sequence of
  68  * these diagonal steps is called a 'snake'.  The goal then is to find a path
  69  * of x-steps (removals), y-steps (additions) and diagonals (steps and
  70  * snakes) where the number of (non-diagonal) steps is minimal.
  71  *
  72  * i.e. we aim for as many long snakes as possible.
  73  * If the total number of 'steps' is called the 'cost', we aim to minimise
  74  * the cost.
  75  *
  76  * As storing the whole matrix in memory would be prohibitive with large
  77  * sequences we limit ourselves to linear storage proportional to a+b and
  78  * repeat the search at most log2(a+b) times building up the path as we go.
  79  * Specifically we perform a search on the full matrix and record where each
  80  * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid).  This
  81  * tells us the mid point of the best path.  We then perform two searches,
  82  * one on each of the two halves and find the 1/4 and 3/4 way points.  This
  83  * continues recursively until we have all points.
  84  *
  85  * The storage is an array v of 'struct v'.  This is indexed by the
  86  * diagonal-number k = x-y.  Thus k can be negative and the array is
  87  * allocated to allow for that.  During the search there is an implicit value
  88  * 'c' which is the cost (length in steps) of all the paths currently under
  89  * consideration.
  90  * v[k] stores details of the longest reaching path of cost c that finishes
  91  *      on diagonal k.  "longest reaching" means "finishes closest to a,b".
  92  * Details are:
  93  *   The location of the end point.  'x' is stored. y = x - k.
  94  *   The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
  95  *                                                        y = (mid - md)/2
  96  *                                                          = x - md
  97  *   (note: md is a diagonal so md = x-y.  mid is an anti-diagonal: mid = x+y)
  98  *   The number of 'snakes' in the path (l).  This is used to allocate the
  99  *   array which will record the snakes and to terminate recursion.
 100  *
 101  * A path with an even cost (c is even) must end on an even diagonal (k is
 102  * even) and when c is odd, k must be odd.  So the v[] array is treated as
 103  * two sub arrays, the even part and the odd part.  One represents paths of
 104  * cost 'c', the other paths of cost c-1.
 105  *
 106  * Initially only v[0] is meaningful and there are no snakes.  We firstly
 107  * extend all paths under consideration with the longest possible snake on
 108  * that diagonal.
 109  *
 110  * Then we increment 'c' and calculate for each suitable 'k' whether the best
 111  * path to diagonal k of cost c comes from taking an x-step from the c-1 path
 112  * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
 113  * k+1.  Obviously we need to avoid stepping out of the matrix.  Finally we
 114  * check if the 'v' array can be extended or reduced at the boundaries.  If
 115  * we hit a border we must reduce.  If the best we could possibly do on that
 116  * diagonal is less than the worst result from the current leading path, then
 117  * we also reduce.  Otherwise we extend the range of 'k's we consider.
 118  *
 119  * We continue until we find a path has reached a,b.  This must be a minimal
 120  * cost path (cost==c).  At this point re-check the end of the snake at the
 121  * midpoint and report that.
 122  *
 123  * This all happens recursively for smaller and smaller subranges stopping
 124  * when we examine a submatrix and find that it contains no snakes.  As we
 125  * are usually dealing with sub-matrixes we are not walking from 0,0 to a,b
 126  * from alo,blo to ahi,bhi - low point to high point.  So the initial k is
 127  * alo-blo, not 0.
 128  *
 129  */
 130
 131 #include        "wiggle.h"
 132 #include        <stdlib.h>
 133
 134 struct v {
 135         int x;  /* x location of furthest reaching path of current cost */
 136         int md; /* diagonal location of midline crossing */
 137         int l;  /* number of continuous common sequences found so far */
 138 };
 139
 140 static int find_common(struct file *a, struct file *b,
 141                             int *alop, int *ahip,
 142                             int *blop, int *bhip,
 143                             struct v *v)
 144 {
 145         /* Examine matrix from alo to ahi and blo to bhi.
 146          * i.e. including alo and blo, but less than ahi and bhi.
 147          * Finding longest subsequence and
 148          * return new {a,b}{lo,hi} either side of midline.
 149          * i.e. mid = ( (ahi-alo) + (bhi-blo) ) / 2
 150          *      alo+blo <= mid <= ahi+bhi
 151          *  and alo,blo to ahi,bhi is a common (possibly empty)
 152          *  subseq - a snake.
 153          *
 154          * v is scratch space which is indexable from
 155          * alo-bhi to ahi-blo inclusive.
 156          * i.e. even though there is no symbol at ahi or bhi, we do
 157          * consider paths that reach there as they simply cannot
 158          * go further in that direction.
 159          *
 160          * Return the number of snakes found.
 161          */
 162
 163         int klo, khi;
 164         int alo = *alop;
 165         int ahi = *ahip;
 166         int blo = *blop;
 167         int bhi = *bhip;
 168
 169         int mid = (ahi+bhi+alo+blo)/2;
 170
 171         /* 'worst' is the worst-case extra cost that we need
 172          * to pay before reaching our destination.  It assumes
 173          * no more snakes in the furthest-reaching path so far.
 174          * We use this to know when we can trim the extreme
 175          * diagonals - when their best case does not improve on
 176          * the current worst case.
 177          */
 178         int worst = (ahi-alo)+(bhi-blo);
 179
 180         klo = khi = alo-blo;
 181         v[klo].x = alo;
 182         v[klo].l = 0;
 183
 184         while (1) {
 185                 int x, y;
 186                 int cost;
 187                 int k;
 188
 189                 /* Find the longest snake extending on each current
 190                  * diagonal, and record if it crosses the midline.
 191                  * If we reach the end, return.
 192                  */
 193                 for (k = klo ; k <= khi ; k += 2) {
 194                         int snake = 0;
 195
 196                         x = v[k].x;
 197                         y = x-k;
 198                         if (y > bhi)
 199                                 abort();
 200
 201                         /* Follow any snake that is here */
 202                         while (x < ahi && y < bhi &&
 203                                match(&a->list[x], &b->list[y])
 204                                 ) {
 205                                 x++;
 206                                 y++;
 207                                 snake = 1;
 208                         }
 209
 210                         /* Refine the worst-case remaining cost */
 211                         cost = (ahi-x)+(bhi-y);
 212                         if (cost < worst)
 213                                 worst = cost;
 214
 215                         /* Check for midline crossing */
 216                         if (x+y >= mid &&
 217                              v[k].x + v[k].x-k <= mid)
 218                                 v[k].md = k;
 219
 220                         v[k].x = x;
 221                         v[k].l += snake;
 222
 223                         if (cost == 0) {
 224                                 /* OK! We have arrived.
 225                                  * We crossed the midpoint on diagonal v[k].md
 226                                  */
 227                                 if (x != ahi)
 228                                         abort();
 229
 230                                 /* The snake could start earlier than the
 231                                  * midline.  We cannot just search backwards
 232                                  * as that might find the wrong path - the
 233                                  * greediness of the diff algorithm is
 234                                  * asymmetric.
 235                                  * We could record the start of the snake in
 236                                  * 'v', but we will find the actual snake when
 237                                  * we recurse so there is no need.
 238                                  */
 239                                 x = (v[k].md+mid)/2;
 240                                 y = x-v[k].md;
 241
 242                                 *alop = x;
 243                                 *blop = y;
 244
 245                                 /* Find the end of the snake using the same
 246                                  * greedy approach as when we first found the
 247                                  * snake
 248                                  */
 249                                 while (x < ahi && y < bhi &&
 250                                        match(&a->list[x], &b->list[y])
 251                                         ) {
 252                                         x++;
 253                                         y++;
 254                                 }
 255                                 *ahip = x;
 256                                 *bhip = y;
 257
 258                                 return v[k].l;
 259                         }
 260                 }
 261
 262                 /* No success with previous cost, so increment cost (c) by 1
 263                  * and for each other diagonal, set from the end point of the
 264                  * diagonal on one side of it or the other.
 265                  */
 266                 for (k = klo+1; k <= khi-1 ; k += 2) {
 267                         if (v[k-1].x+1 > ahi) {
 268                                 /* cannot step to the right from previous
 269                                  * diagonal as there is no room.
 270                                  * So step up from next diagonal.
 271                                  */
 272                                 v[k] = v[k+1];
 273                         } else if (v[k+1].x - k > bhi || v[k-1].x+1 >= v[k+1].x) {
 274                                 /* Cannot step up from next diagonal as either
 275                                  * there is no room, or doing so wouldn't get us
 276                                  * as close to the endpoint.
 277                                  * So step to the right.
 278                                  */
 279                                 v[k] = v[k-1];
 280                                 v[k].x++;
 281                         } else {
 282                                 /* There is room in both directions, but
 283                                  * stepping up from the next diagonal gets us
 284                                  * closer
 285                                  */
 286                                 v[k] = v[k+1];
 287                         }
 288                 }
 289
 290                 /* Now we need to either extend or contract klo and khi
 291                  * so they both change parity (odd vs even).
 292                  * If we extend we need to step up (for klo) or to the
 293                  * right (khi) from the adjacent diagonal.  This is
 294                  * not possible if we have hit the edge of the matrix, and
 295                  * not sensible if the new point has a best case remaining
 296                  * cost that is worse than our current worst case remaining
 297                  * cost.
 298                  * The best-case remaining cost is the absolute difference
 299                  * between the remaining number of additions and the remaining
 300                  * number of deletions - and assumes lots of snakes.
 301                  */
 302                 /* new location if we step up from klo to klo-1*/
 303                 x = v[klo].x; y = x - (klo-1);
 304                 cost = abs((ahi-x)-(bhi-y));
 305                 if (y <= bhi && cost <= worst) {
 306                         /* Looks acceptable - step up. */
 307                         v[klo-1] = v[klo];
 308                         klo--;
 309                 } else
 310                         klo++;
 311
 312                 /* new location if we step to the right from khi to khi+1 */
 313                 x = v[khi].x+1; y = x - (khi+1);
 314                 cost = abs((ahi-x)-(bhi-y));
 315                 if (x <= ahi && cost <= worst) {
 316                         /* Looks acceptable - step to the right */
 317                         v[khi+1] = v[khi];
 318                         v[khi+1].x++;
 319                         khi++;
 320                 } else
 321                         khi--;
 322         }
 323 }
 324
 325 static struct csl *lcsl(struct file *a, int alo, int ahi,
 326                         struct file *b, int blo, int bhi,
 327                         struct csl *csl,
 328                         struct v *v)
 329 {
 330         /* lcsl == longest common sub-list.
 331          * This calls itself recursively as it finds the midpoint
 332          * of the best path.
 333          * On first call, 'csl' is NULL and will need to be allocated and
 334          * is returned.
 335          * On subsequence calls when 'csl' is not NULL, we add all the
 336          * snakes we find to csl, and return a pointer to the next
 337          * location where future snakes can be stored.
 338          */
 339         int len;
 340         int alo1 = alo;
 341         int ahi1 = ahi;
 342         int blo1 = blo;
 343         int bhi1 = bhi;
 344         struct csl *rv = NULL;
 345
 346         if (ahi <= alo || bhi <= blo)
 347                 return csl;
 348
 349         len = find_common(a, b,
 350                           &alo1, &ahi1,
 351                           &blo1, &bhi1,
 352                           v);
 353
 354         if (csl == NULL) {
 355                 /* 'len+1' to hold a sentinel */
 356                 rv = csl = xmalloc((len+1)*sizeof(*csl));
 357                 csl->len = 0;
 358         }
 359         if (len) {
 360                 /* There are more snakes to find - keep looking. */
 361
 362                 /* With depth-first recursion, this adds all the snakes
 363                  * before 'alo1' to 'csl'
 364                  */
 365                 csl = lcsl(a, alo, alo1,
 366                            b, blo, blo1,
 367                            csl, v);
 368
 369                 if (ahi1 > alo1) {
 370                         /* need to add this common seq, possibly attach
 371                          * to last
 372                          */
 373                         if (csl->len &&
 374                             csl->a+csl->len == alo1 &&
 375                             csl->b+csl->len == blo1) {
 376                                 csl->len += ahi1-alo1;
 377                         } else {
 378                                 if (csl->len)
 379                                         csl++;
 380                                 csl->len = ahi1-alo1;
 381                                 csl->a = alo1;
 382                                 csl->b = blo1;
 383                                 csl[1].len = 0;
 384                         }
 385                 }
 386                 /* Now recurse to add all the snakes after ahi1 to csl */
 387                 csl = lcsl(a, ahi1, ahi,
 388                            b, bhi1, bhi,
 389                            csl, v);
 390         }
 391         if (rv) {
 392                 /* This was the first call.  Record the endpoint
 393                  * as a snake of length 0.  This might be extended.
 394                  * by 'fixup()' below.
 395                  */
 396                 if (csl->len)
 397                         csl++;
 398                 csl->a = ahi;
 399                 csl->b = bhi;
 400 #if 1
 401                 if (rv+len != csl || csl->len != 0)
 402                         abort(); /* number of runs was wrong */
 403 #endif
 404                 return rv;
 405         } else
 406                 /* intermediate call - return where we are up to */
 407                 return csl;
 408 }
 409
 410 /* If two common sequences are separated by only an add or remove,
 411  * and the first sequence ends the same as the middle text,
 412  * extend the second and contract the first in the hope that the
 413  * first might become empty.  This ameliorates against the greediness
 414  * of the 'diff' algorithm.
 415  * i.e. if we have:
 416  *   [ foo X ] X [ bar ]
 417  *   [ foo X ] [ bar ]
 418  * Then change it to:
 419  *   [ foo ] X [ X bar ]
 420  *   [ foo ] [ X bar ]
 421  * We treat the final zero-length 'csl' as a common sequence which
 422  * can be extended so we must make sure to add a new zero-length csl
 423  * to the end.
 424  * If this doesn't make the first sequence disappear, and (one of the)
 425  * X(s) was a newline, then move back so the newline is at the end
 426  * of the first sequence.  This encourages common sequences
 427  * to be whole-line units where possible.
 428  */
 429 static void fixup(struct file *a, struct file *b, struct csl *list)
 430 {
 431         struct csl *list1, *orig;
 432         int lasteol = -1;
 433         int found_end = 0;
 434
 435         if (!list)
 436                 return;
 437
 438         /* 'list' and 'list1' are adjacent pointers into the csl.
 439          * If a match gets deleted, they might not be physically
 440          * adjacent any more.  Once we get to the end of the list
 441          * this will cease to matter - the list will be a bit
 442          * shorter is all.
 443          */
 444         orig = list;
 445         list1 = list+1;
 446         while (list->len) {
 447                 if (list1->len == 0)
 448                         found_end = 1;
 449
 450                 /* If a single token is either inserted or deleted
 451                  * immediately after a matching token...
 452                  */
 453                 if ((list->a+list->len == list1->a &&
 454                      list->b+list->len != list1->b &&
 455                      /* text at b inserted */
 456                      match(&b->list[list->b+list->len-1],
 457                            &b->list[list1->b-1])
 458                             )
 459                     ||
 460                     (list->b+list->len == list1->b &&
 461                      list->a+list->len != list1->a &&
 462                      /* text at a deleted */
 463                      match(&a->list[list->a+list->len-1],
 464                            &a->list[list1->a-1])
 465                             )
 466                         ) {
 467                         /* If the last common token is a simple end-of-line
 468                          * record where it is.  For a word-wise diff, this is
 469                          * any EOL.  For a line-wise diff this is a blank line.
 470                          * If we are looking at a deletion it must be deleting
 471                          * the eol, so record that deleted eol.
 472                          */
 473                         if (ends_line(a->list[list->a+list->len-1])
 474                             && a->list[list->a+list->len-1].len == 1
 475                             && lasteol == -1
 476                                 ) {
 477                                 lasteol = list1->a-1;
 478                         }
 479                         /* Expand the second match, shrink the first */
 480                         list1->a--;
 481                         list1->b--;
 482                         list1->len++;
 483                         list->len--;
 484
 485                         /* If the first match has become empty, make it
 486                          * disappear.. (and forget about the eol).
 487                          */
 488                         if (list->len == 0) {
 489                                 lasteol = -1;
 490                                 if (found_end) {
 491                                         /* Deleting just before the last
 492                                          * entry */
 493                                         *list = *list1;
 494                                         list1->a += list1->len;
 495                                         list1->b += list1->len;
 496                                         list1->len = 0;
 497                                 } else if (list > orig)
 498                                         /* Deleting in the  middle */
 499                                         list--;
 500                                 else {
 501                                         /* deleting the first entry */
 502                                         *list = *list1++;
 503                                 }
 504                         }
 505                 } else {
 506                         /* Nothing interesting here, though if we
 507                          * shuffled back past an eol, shuffle
 508                          * forward to line up with that eol.
 509                          * This causes an eol to bind more strongly
 510                          * with the preceding line than the following.
 511                          */
 512                         if (lasteol >= 0) {
 513                                 while (list1->a <= lasteol
 514                                        && (list1->len > 1 ||
 515                                            (found_end && list1->len > 0))) {
 516                                         list1->a++;
 517                                         list1->b++;
 518                                         list1->len--;
 519                                         list->len++;
 520                                 }
 521                                 lasteol = -1;
 522                         }
 523                         *++list = *list1;
 524                         if (found_end) {
 525                                 list1->a += list1->len;
 526                                 list1->b += list1->len;
 527                                 list1->len = 0;
 528                         } else
 529                                 list1++;
 530                 }
 531                 if (list->len && list1 == list)
 532                         abort();
 533         }
 534 }
 535
 536 /* Main entry point - find the common-sub-list of files 'a' and 'b'.
 537  * The final element in the list will have 'len' == 0 and will point
 538  * beyond the end of the files.
 539  */
 540 struct csl *diff(struct file a, struct file b)
 541 {
 542         struct v *v;
 543         struct csl *csl;
 544         v = xmalloc(sizeof(struct v)*(a.elcnt+b.elcnt+2));
 545         v += b.elcnt+1;
 546
 547         csl = lcsl(&a, 0, a.elcnt,
 548                    &b, 0, b.elcnt,
 549                    NULL, v);
 550         free(v-(b.elcnt+1));
 551         fixup(&a, &b, csl);
 552         if (!csl) {
 553                 csl = xmalloc(sizeof(*csl));
 554                 csl->len = 0;
 555                 csl->a = a.elcnt;
 556                 csl->b = b.elcnt;
 557         }
 558         return csl;
 559 }
 560
 561 /* Alternate entry point - find the common-sub-list in two
 562  * subranges of files.
 563  */
 564 struct csl *diff_partial(struct file a, struct file b,
 565                          int alo, int ahi, int blo, int bhi)
 566 {
 567         struct v *v;
 568         struct csl *csl;
 569         v = xmalloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
 570         v += bhi-alo+1;
 571
 572         csl = lcsl(&a, alo, ahi,
 573                    &b, blo, bhi,
 574                    NULL, v);
 575         free(v-(bhi-alo+1));
 576         fixup(&a, &b, csl);
 577         return csl;
 578 }
 579
 580 #ifdef MAIN
 581
 582 main(int argc, char *argv[])
 583 {
 584         struct file a, b;
 585         struct csl *csl;
 586         struct elmnt *lst = xmalloc(argc*sizeof(*lst));
 587         int arg;
 588         struct v *v;
 589         int ln;
 590
 591         arg = 1;
 592         a.elcnt = 0;
 593         a.list = lst;
 594         while (argv[arg] && strcmp(argv[arg], "--")) {
 595                 lst->hash = 0;
 596                 lst->start = argv[arg];
 597                 lst->len = strlen(argv[arg]);
 598                 a.elcnt++;
 599                 lst++;
 600                 arg++;
 601         }
 602         if (!argv[arg]) {
 603                 printf("AARGH\n");
 604                 exit(1);
 605         }
 606         arg++;
 607         b.elcnt = 0;
 608         b.list = lst;
 609         while (argv[arg] && strcmp(argv[arg], "--")) {
 610                 lst->hash = 0;
 611                 lst->start = argv[arg];
 612                 lst->len = strlen(argv[arg]);
 613                 b.elcnt++;
 614                 lst++;
 615                 arg++;
 616         }
 617
 618         csl = diff(a, b);
 619         fixup(&a, &b, csl);
 620         while (csl && csl->len) {
 621                 int i;
 622                 printf("%d,%d for %d:\n", csl->a, csl->b, csl->len);
 623                 for (i = 0; i < csl->len; i++) {
 624                         printf("  %.*s (%.*s)\n",
 625                                a.list[csl->a+i].len, a.list[csl->a+i].start,
 626                                b.list[csl->b+i].len, b.list[csl->b+i].start);
 627                 }
 628                 csl++;
 629         }
 630
 631         exit(0);
 632 }
 633
 634 #endif
 635