diff.c

   1 /*
   2  * wiggle - apply rejected patches
   3  *
   4  * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
   5  * Copyright (C) 2011-2013 Neil Brown <neilb@suse.de>
   6  *
   7  *
   8  *    This program is free software; you can redistribute it and/or modify
   9  *    it under the terms of the GNU General Public License as published by
  10  *    the Free Software Foundation; either version 2 of the License, or
  11  *    (at your option) any later version.
  12  *
  13  *    This program is distributed in the hope that it will be useful,
  14  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  *    GNU General Public License for more details.
  17  *
  18  *    You should have received a copy of the GNU General Public License
  19  *    along with this program.
  20  *
  21  *    Author: Neil Brown
  22  *    Email: <neilb@suse.de>
  23  */
  24
  25 /*
  26  * Calculate longest common subsequence between two sequences
  27  *
  28  * Each sequence contains strings with
  29  *   hash start length
  30  * We produce a list of triples: a b len
  31  * where A and B point to elements in the two sequences, and len is the number
  32  * of common elements there.  The list is terminated by an entry with len==0.
  33  *
  34  * This is roughly based on
  35  *   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  36  *   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  37  * http://xmailserver.org/diff2.pdf
  38  *
  39  * However we don't run the basic algorithm both forward and backward until
  40  * we find an overlap as Myers suggests.  Rather we always run forwards, but
  41  * we record the location of the (possibly empty) snake that crosses the
  42  * midline.  When we finish, this recorded location for the best path shows
  43  * us where to divide and find further midpoints.
  44  *
  45  * In brief, the algorithm is as follows.
  46  *
  47  * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
  48  * the first sequence (A, length a) and y co-ordinates correspond to symbols
  49  * in the second sequence (B, length b).  At the origin we have the first
  50  * sequence.
  51  * Movement in the x direction represents deleting the symbol at that point,
  52  * so from x=i-1 to x=i deletes symbol i from A.
  53  *
  54  * Movement in the y direction represents adding the corresponding symbol
  55  * from B.  So to move from the origin 'a' spaces along X and then 'b' spaces
  56  * up Y will remove all of the first sequence and then add all of the second
  57  * sequence.  Similarly moving firstly up the Y axis, then along the X
  58  * direction will add the new sequence, then remove the old sequence.  Thus
  59  * the point a,b represents the second sequence and a path from 0,0 to a,b
  60  * represents a sequence of edits to change A into B.
  61  *
  62  * There are clearly many paths from 0,0 to a,b going through different
  63  * points in the matrix in different orders.  At some points in the matrix
  64  * the next symbol to be added from B is the same as the next symbol to be
  65  * removed from A.  At these points we can take a diagonal step to a new
  66  * point in the matrix without actually changing any symbol.  A sequence of
  67  * these diagonal steps is called a 'snake'.  The goal then is to find a path
  68  * of x-steps (removals), y-steps (additions) and diagonals (steps and
  69  * snakes) where the number of (non-diagonal) steps is minimal.
  70  *
  71  * i.e. we aim for as many long snakes as possible.
  72  * If the total number of 'steps' is called the 'cost', we aim to minimise
  73  * the cost.
  74  *
  75  * As storing the whole matrix in memory would be prohibitive with large
  76  * sequences we limit ourselves to linear storage proportional to a+b and
  77  * repeat the search at most log2(a+b) times building up the path as we go.
  78  * Specifically we perform a search on the full matrix and record where each
  79  * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid).  This
  80  * tells us the mid point of the best path.  We then perform two searches,
  81  * one on each of the two halves and find the 1/4 and 3/4 way points.  This
  82  * continues recursively until we have all points.
  83  *
  84  * The storage is an array v of 'struct v'.  This is indexed by the
  85  * diagonal-number k = x-y.  Thus k can be negative and the array is
  86  * allocated to allow for that.  During the search there is an implicit value
  87  * 'c' which is the cost (length in steps) of all the paths currently under
  88  * consideration.
  89  * v[k] stores details of the longest reaching path of cost c that finishes
  90  *      on diagonal k.  "longest reaching" means "finishes closest to a,b".
  91  * Details are:
  92  *   The location of the end point.  'x' is stored. y = x - k.
  93  *   The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
  94  *                                                        y = (mid - md)/2
  95  *                                                          = x - md
  96  *   (note: md is a diagonal so md = x-y.  mid is an anti-diagonal: mid = x+y)
  97  *   The number of 'snakes' in the path (l).  This is used to allocate the
  98  *   array which will record the snakes and to terminate recursion.
  99  *
 100  * A path with an even cost (c is even) must end on an even diagonal (k is
 101  * even) and when c is odd, k must be odd.  So the v[] array is treated as
 102  * two sub arrays, the even part and the odd part.  One represents paths of
 103  * cost 'c', the other paths of cost c-1.
 104  *
 105  * Initially only v[0] is meaningful and there are no snakes.  We firstly
 106  * extend all paths under consideration with the longest possible snake on
 107  * that diagonal.
 108  *
 109  * Then we increment 'c' and calculate for each suitable 'k' whether the best
 110  * path to diagonal k of cost c comes from taking an x-step from the c-1 path
 111  * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
 112  * k+1.  Obviously we need to avoid stepping out of the matrix.  Finally we
 113  * check if the 'v' array can be extended or reduced at the boundaries.  If
 114  * we hit a border we must reduce.  If the best we could possibly do on that
 115  * diagonal is less than the worst result from the current leading path, then
 116  * we also reduce.  Otherwise we extend the range of 'k's we consider.
 117  *
 118  * We continue until we find a path has reached a,b.  This must be a minimal
 119  * cost path (cost==c).  At this point re-check the end of the snake at the
 120  * midpoint and report that.
 121  *
 122  * This all happens recursively for smaller and smaller subranges stopping
 123  * when we examine a submatrix and find that it contains no snakes.  As we
 124  * are usually dealing with sub-matrixes we are not walking from 0,0 to a,b
 125  * but from alo,blo to ahi,bhi - low point to high point.  So the initial k
 126  * is alo-blo, not 0.
 127  *
 128  */
 129
 130 #include        "wiggle.h"
 131 #include        <stdlib.h>
 132 #include        <sys/time.h>
 133
/* Per-diagonal search state used by find_common().  The scratch array of
 * these is indexed by the diagonal number k = x - y, so y is never stored:
 * it is recovered as x - k.
 */
struct v {
        int x;  /* x location of furthest reaching path of current cost */
        int md; /* diagonal location of midline crossing */
        int l;  /* number of continuous common sequences found so far */
};
 139
/* Search the sub-matrix [*alop,*ahip) x [*blop,*bhip) of 'a' and 'b' for a
 * minimal-cost path and report the (possibly empty) snake found where that
 * path crosses the midline.
 *
 * a, b:      the two element lists being compared.
 * alop/ahip, blop/bhip:
 *            in:  bounds of the sub-matrix to search.
 *            out: start (lo) and end (hi) of the reported snake.
 * v:         scratch space indexed by diagonal k = x-y.
 * shortcut:  non-zero permits giving up on the optimal path once the
 *            search has become wide and slow (see below).
 *
 * Returns the number of snakes on the path found, or 1 when the shortcut
 * was taken.
 */
static int find_common(struct file *a, struct file *b,
                       int *alop, int *ahip,
                       int *blop, int *bhip,
                       struct v *v, int shortcut)
{
        /* Examine matrix from alo to ahi and blo to bhi.
         * i.e. including alo and blo, but less than ahi and bhi.
         * Finding longest subsequence and
         * return new {a,b}{lo,hi} either side of midline.
         * i.e. mid = ( (alo+blo) + (ahi+bhi) ) / 2
         *      alo+blo <= mid <= ahi+bhi
         *  and on return alo,blo to ahi,bhi is a common (possibly empty)
         *  subseq - a snake.
         *
         * v is scratch space which is indexable from
         * alo-bhi to ahi-blo inclusive.
         * i.e. even though there is no symbol at ahi or bhi, we do
         * consider paths that reach there as they simply cannot
         * go further in that direction.
         *
         * Return the number of snakes found.
         */

        struct timeval start, stop;
        int klo, khi;
        int alo = *alop;
        int ahi = *ahip;
        int blo = *blop;
        int bhi = *bhip;

        /* The midline is the anti-diagonal x+y == mid */
        int mid = (ahi+bhi+alo+blo)/2;

        /* 'worst' is the worst-case extra cost that we need
         * to pay before reaching our destination.  It assumes
         * no more snakes in the furthest-reaching path so far.
         * We use this to know when we can trim the extreme
         * diagonals - when their best case does not improve on
         * the current worst case.
         */
        int worst = (ahi-alo)+(bhi-blo);

        /* Normalise to 0 or 1; the value 2 is used below to mean
         * "time is up, accept the next snake that makes progress".
         */
        shortcut = !!shortcut;
        if (shortcut)
                gettimeofday(&start, NULL);

        /* Start with the single diagonal through alo,blo and no snakes.
         * 'md' is not set here; it is assigned when a path is seen
         * crossing the midline.
         */
        klo = khi = alo-blo;
        v[klo].x = alo;
        v[klo].l = 0;

        while (1) {
                int x, y;
                int cost;
                int k;

                /* Only consult the clock once more than 5000 diagonals
                 * are being tracked.
                 */
                if (shortcut == 1 &&
                    khi - klo > 5000 &&
                    gettimeofday(&stop, NULL) == 0 &&
                    (stop.tv_sec - start.tv_sec) * 1000000 +
                    (stop.tv_usec - start.tv_usec) > 20000)
                        /* 20ms is a long time.  Time to take a shortcut
                         * Next snake wins
                         */
                        shortcut = 2;
                /* Find the longest snake extending on each current
                 * diagonal, and record if it crosses the midline.
                 * If we reach the end, return.
                 */
                for (k = klo ; k <= khi ; k += 2) {
                        int snake = 0;

                        x = v[k].x;
                        y = x-k;
                        /* A path must never have left the matrix */
                        if (y > bhi)
                                abort();

                        /* Follow any snake that is here */
                        while (x < ahi && y < bhi &&
                               match(&a->list[x], &b->list[y])
                                ) {
                                x++;
                                y++;
                                snake = 1;
                        }

                        /* Refine the worst-case remaining cost */
                        cost = (ahi-x)+(bhi-y);
                        if (cost < worst) {
                                worst = cost;
                                if (snake && shortcut == 2) {
                                        /* Shortcut: report the snake just
                                         * followed (v[k].x still holds its
                                         * start) rather than the midline
                                         * snake of an optimal path.
                                         */
                                        *alop = v[k].x;
                                        *blop = v[k].x - k;
                                        *ahip = x;
                                        *bhip = y;
                                        return 1;
                                }
                        }
                        /* Check for midline crossing.
                         * v[k].x + v[k].x-k is x+y at the start of this
                         * snake; x+y is its end.
                         */
                        if (x+y >= mid &&
                             v[k].x + v[k].x-k <= mid)
                                v[k].md = k;

                        v[k].x = x;
                        v[k].l += snake;

                        if (cost == 0) {
                                /* OK! We have arrived.
                                 * We crossed the midpoint on diagonal v[k].md
                                 */
                                if (x != ahi)
                                        abort();

                                /* The snake could start earlier than the
                                 * midline.  We cannot just search backwards
                                 * as that might find the wrong path - the
                                 * greediness of the diff algorithm is
                                 * asymmetric.
                                 * We could record the start of the snake in
                                 * 'v', but we will find the actual snake when
                                 * we recurse so there is no need.
                                 */
                                x = (v[k].md+mid)/2;
                                y = x-v[k].md;

                                *alop = x;
                                *blop = y;

                                /* Find the end of the snake using the same
                                 * greedy approach as when we first found the
                                 * snake
                                 */
                                while (x < ahi && y < bhi &&
                                       match(&a->list[x], &b->list[y])
                                        ) {
                                        x++;
                                        y++;
                                }
                                *ahip = x;
                                *bhip = y;

                                return v[k].l;
                        }
                }

                /* No success with previous cost, so increment cost (c) by 1
                 * and for each other diagonal, set from the end point of the
                 * diagonal on one side of it or the other.
                 */
                for (k = klo+1; k <= khi-1 ; k += 2) {
                        if (v[k-1].x+1 > ahi) {
                                /* cannot step to the right from previous
                                 * diagonal as there is no room.
                                 * So step up from next diagonal.
                                 */
                                v[k] = v[k+1];
                        } else if (v[k+1].x - k > bhi || v[k-1].x+1 >= v[k+1].x) {
                                /* Cannot step up from next diagonal as either
                                 * there is no room, or doing so wouldn't get us
                                 * as close to the endpoint.
                                 * So step to the right.
                                 */
                                v[k] = v[k-1];
                                v[k].x++;
                        } else {
                                /* There is room in both directions, but
                                 * stepping up from the next diagonal gets us
                                 * closer
                                 */
                                v[k] = v[k+1];
                        }
                }

                /* Now we need to either extend or contract klo and khi
                 * so they both change parity (odd vs even).
                 * If we extend we need to step up (for klo) or to the
                 * right (khi) from the adjacent diagonal.  This is
                 * not possible if we have hit the edge of the matrix, and
                 * not sensible if the new point has a best case remaining
                 * cost that is worse than our current worst case remaining
                 * cost.
                 * The best-case remaining cost is the absolute difference
                 * between the remaining number of additions and the remaining
                 * number of deletions - and assumes lots of snakes.
                 */
                /* new location if we step up from klo to klo-1*/
                x = v[klo].x; y = x - (klo-1);
                cost = abs((ahi-x)-(bhi-y));
                klo--;
                if (y <= bhi && cost <= worst) {
                        /* Looks acceptable - step up. */
                        v[klo] = v[klo+1];
                } else do {
                                /* Contract: move klo inwards two diagonals
                                 * at a time (keeping the new parity) until
                                 * the best case there is good enough.
                                 */
                                klo += 2;
                                x = v[klo].x; y = x - (klo-1);
                                cost = abs((ahi-x)-(bhi-y));
                        } while (cost > worst);

                /* new location if we step to the right from khi to khi+1 */
                x = v[khi].x+1; y = x - (khi+1);
                cost = abs((ahi-x)-(bhi-y));
                khi++;
                if (x <= ahi && cost <= worst) {
                        /* Looks acceptable - step to the right */
                        v[khi] = v[khi-1];
                        v[khi].x++;
                } else do {
                                /* Contract khi in the same way */
                                khi -= 2;
                                x = v[khi].x+1; y = x - (khi+1);
                                cost = abs((ahi-x)-(bhi-y));
                        } while (cost > worst);
        }
}
 351
/* Growable buffer of 'struct csl' entries, appended to by csl_add(). */
struct cslb {
        int             size;   /* How much is alloced */
        int             len;    /* How much is used */
        struct csl      *csl;   /* The entries; reallocated as it grows */
};
 357
 358 static void csl_add(struct cslb *buf, int a, int b, int len)
 359 {
 360         struct csl *csl;
 361         if (len && buf->len) {
 362                 csl = buf->csl + buf->len - 1;
 363                 if (csl->a + csl->len == a &&
 364                     csl->b + csl->len == b) {
 365                         csl->len += len;
 366                         return;
 367                 }
 368         }
 369         if (buf->size <= buf->len) {
 370                 if (buf->size < 64)
 371                         buf->size = 64;
 372                 else
 373                         buf->size += buf->size;
 374                 buf->csl = realloc(buf->csl, sizeof(buf->csl[0]) * buf->size);
 375         }
 376         csl = buf->csl + buf->len;
 377         csl->len = len;
 378         csl->a = a;
 379         csl->b = b;
 380         buf->len += 1;
 381 }
 382
 383 static void lcsl(struct file *a, int alo, int ahi,
 384                  struct file *b, int blo, int bhi,
 385                  struct cslb *cslb,
 386                  struct v *v, int shortcut)
 387 {
 388         /* lcsl == longest common sub-list.
 389          * This calls itself recursively as it finds the midpoint
 390          * of the best path.
 391          * On first call, 'csl' is NULL and will need to be allocated and
 392          * is returned.
 393          * On subsequence calls when 'csl' is not NULL, we add all the
 394          * snakes we find to csl, and return a pointer to the next
 395          * location where future snakes can be stored.
 396          */
 397         int alo1 = alo;
 398         int ahi1 = ahi;
 399         int blo1 = blo;
 400         int bhi1 = bhi;
 401
 402         if (ahi <= alo || bhi <= blo)
 403                 return;
 404
 405         if (!find_common(a, b,
 406                          &alo1, &ahi1,
 407                          &blo1, &bhi1,
 408                          v, shortcut))
 409                 return;
 410
 411         /* There are more snakes to find - keep looking. */
 412
 413         /* With depth-first recursion, this adds all the snakes
 414          * before 'alo1' to 'csl'
 415          */
 416         lcsl(a, alo, alo1,
 417              b, blo, blo1,
 418              cslb, v, 0);
 419
 420         if (ahi1 > alo1)
 421                 /* need to add this common seq, possibly attach
 422                  * to last
 423                  */
 424                 csl_add(cslb, alo1, blo1, ahi1 - alo1);
 425
 426         /* Now recurse to add all the snakes after ahi1 to csl */
 427         lcsl(a, ahi1, ahi,
 428              b, bhi1, bhi,
 429              cslb, v, shortcut);
 430 }
 431
/* If two common sequences are separated by only an add or remove,
 * and the first sequence ends the same as the middle text,
 * extend the second and contract the first in the hope that the
 * first might become empty.  This ameliorates against the greediness
 * of the 'diff' algorithm.
 * i.e. if we have:
 *   [ foo X ] X [ bar ]
 *   [ foo X ] [ bar ]
 * Then change it to:
 *   [ foo ] X [ X bar ]
 *   [ foo ] [ X bar ]
 * We treat the final zero-length 'csl' as a common sequence which
 * can be extended so we must make sure to add a new zero-length csl
 * to the end.
 * If this doesn't make the first sequence disappear, and (one of the)
 * X(s) was a newline, then move back so the newline is at the end
 * of the first sequence.  This encourages common sequences
 * to be whole-line units where possible.
 *
 * a, b:  the files that the entries in 'list' index into.
 * list:  the common-sub-list, terminated by an entry with len == 0.
 *        It is rewritten in place; a NULL list is ignored.
 */
static void fixup(struct file *a, struct file *b, struct csl *list)
{
        struct csl *list1, *orig;
        int lasteol = -1;       /* index in 'a' of a remembered eol, or -1 */
        int found_end = 0;      /* set once list1 has reached the terminator */

        if (!list)
                return;

        /* 'list' and 'list1' are adjacent pointers into the csl.
         * If a match gets deleted, they might not be physically
         * adjacent any more.  Once we get to the end of the list
         * this will cease to matter - the list will be a bit
         * shorter is all.
         */
        orig = list;
        list1 = list+1;
        while (list->len) {
                if (list1->len == 0)
                        found_end = 1;

                /* If a single token is either inserted or deleted
                 * immediately after a matching token...
                 */
                if ((list->a+list->len == list1->a &&
                     list->b+list->len != list1->b &&
                     /* text at b inserted */
                     match(&b->list[list->b+list->len-1],
                           &b->list[list1->b-1])
                            )
                    ||
                    (list->b+list->len == list1->b &&
                     list->a+list->len != list1->a &&
                     /* text at a deleted */
                     match(&a->list[list->a+list->len-1],
                           &a->list[list1->a-1])
                            )
                        ) {
                        /* If the last common token is a simple end-of-line
                         * record where it is.  For a word-wise diff, this is
                         * any EOL.  For a line-wise diff this is a blank line.
                         * If we are looking at a deletion it must be deleting
                         * the eol, so record that deleted eol.
                         */
                        if (ends_line(a->list[list->a+list->len-1])
                            && a->list[list->a+list->len-1].len == 1
                            && lasteol == -1
                                ) {
                                lasteol = list1->a-1;
                        }
                        /* Expand the second match, shrink the first */
                        list1->a--;
                        list1->b--;
                        list1->len++;
                        list->len--;

                        /* If the first match has become empty, make it
                         * disappear.. (and forget about the eol).
                         */
                        if (list->len == 0) {
                                lasteol = -1;
                                if (found_end) {
                                        /* Deleting just before the last
                                         * entry */
                                        *list = *list1;
                                        /* turn list1 back into a zero-length
                                         * terminator just past the moved run
                                         */
                                        list1->a += list1->len;
                                        list1->b += list1->len;
                                        list1->len = 0;
                                } else if (list > orig)
                                        /* Deleting in the  middle */
                                        list--;
                                else {
                                        /* deleting the first entry */
                                        *list = *list1++;
                                }
                        }
                } else {
                        /* Nothing interesting here, though if we
                         * shuffled back past an eol, shuffle
                         * forward to line up with that eol.
                         * This causes an eol to bind more strongly
                         * with the preceding line than the following.
                         */
                        if (lasteol >= 0) {
                                while (list1->a <= lasteol
                                       && (list1->len > 1 ||
                                           (found_end && list1->len > 0))) {
                                        list1->a++;
                                        list1->b++;
                                        list1->len--;
                                        list->len++;
                                }
                                lasteol = -1;
                        }
                        /* Advance: copy list1 into the slot after 'list' */
                        *++list = *list1;
                        if (found_end) {
                                list1->a += list1->len;
                                list1->b += list1->len;
                                list1->len = 0;
                        } else
                                list1++;
                }
                /* list1 must stay ahead of 'list' while entries remain */
                if (list->len && list1 == list)
                        abort();
        }
}
 557
 558 static int elcmp(const void *v1, const void *v2)
 559 {
 560         const struct elmnt *e1 = v1;
 561         const struct elmnt *e2 = v2;
 562
 563         if (e1->hash != e2->hash) {
 564                 if (e1->hash < e2->hash)
 565                         return -1;
 566                 return 1;
 567         }
 568         if (e1->start[0] == 0 && e2->start[0] == 0)
 569                 return 0;
 570         if (e1->len != e2->len)
 571                 return e1->len - e2->len;
 572         return strncmp(e1->start, e2->start, e1->len);
 573 }
 574
 575 #define BPL (sizeof(unsigned long) * 8)
 576 static struct file filter_unique(struct file f, struct file ref)
 577 {
 578         /* Use a bloom-filter to record all hashes in 'ref' and
 579          * then if there are consequtive entries in 'f' that are
 580          * not in 'ref', reduce each such run to 1 entry
 581          */
 582         struct file n;
 583         int fi, cnt;
 584         struct file sorted;
 585
 586         sorted.list = xmalloc(sizeof(sorted.list[0]) * ref.elcnt);
 587         sorted.elcnt = ref.elcnt;
 588         memcpy(sorted.list, ref.list, sizeof(sorted.list[0]) * sorted.elcnt);
 589         qsort(sorted.list, sorted.elcnt, sizeof(sorted.list[0]),
 590               elcmp);
 591
 592         n.list = xmalloc(sizeof(n.list[0]) * f.elcnt);
 593         n.elcnt = 0;
 594         cnt = 0;
 595         for (fi = 0; fi < f.elcnt; fi++) {
 596                 int lo = 0, hi = sorted.elcnt;
 597                 while (lo + 1 < hi) {
 598                         int mid = (lo + hi) / 2;
 599                         if (elcmp(&f.list[fi], &sorted.list[mid]) < 0)
 600                                 hi = mid;
 601                         else
 602                                 lo = mid;
 603                 }
 604                 if (match(&f.list[fi], &sorted.list[lo]))
 605                         cnt = 0;
 606                 else
 607                         cnt += 1;
 608                 if (cnt <= 1)
 609                         n.list[n.elcnt++] = f.list[fi];
 610         }
 611         free(sorted.list);
 612         return n;
 613 }
 614
 615 static void remap(struct csl *csl, int which, struct file from, struct file to)
 616 {
 617         /* The a,b pointer in csl points to 'from' we need to remap to 'to'.
 618          * 'to' has everything that 'from' has, plus more.
 619          * Each list[].start is unique
 620          */
 621         int ti = 0;
 622         while (csl->len) {
 623                 int fi = which ? csl->b : csl->a;
 624                 while (to.list[ti].start != from.list[fi].start) {
 625                         ti += 1;
 626                         if (ti > to.elcnt)
 627                                 abort();
 628                 }
 629                 if (which)
 630                         csl->b = ti;
 631                 else
 632                         csl->a = ti;
 633                 csl += 1;
 634         }
 635         if (which)
 636                 csl->b = to.elcnt;
 637         else
 638                 csl->a = to.elcnt;
 639 }
 640 /* Main entry point - find the common-sub-list of files 'a' and 'b'.
 641  * The final element in the list will have 'len' == 0 and will point
 642  * beyond the end of the files.
 643  */
 644 struct csl *diff(struct file a, struct file b, int shortest)
 645 {
 646         struct v *v;
 647         struct cslb cslb = {};
 648         struct file af, bf;
 649
 650         /* Remove runs of 2 or more elements in one file that don't
 651          * exist in the other file.  This often makes the number of
 652          * elements more manageable.
 653          */
 654         af = filter_unique(a, b);
 655         bf = filter_unique(b, a);
 656
 657         v = xmalloc(sizeof(struct v)*(af.elcnt+bf.elcnt+2));
 658         v += bf.elcnt+1;
 659
 660         lcsl(&af, 0, af.elcnt,
 661              &bf, 0, bf.elcnt,
 662              &cslb, v, !shortest);
 663         csl_add(&cslb, af.elcnt, bf.elcnt, 0);
 664         free(v-(bf.elcnt+1));
 665         remap(cslb.csl, 0, af, a);
 666         remap(cslb.csl, 1, bf, b);
 667         free(af.list);
 668         free(bf.list);
 669         fixup(&a, &b, cslb.csl);
 670         return cslb.csl;
 671 }
 672
 673 /* Alternate entry point - find the common-sub-list in two
 674  * subranges of files.
 675  */
 676 struct csl *diff_partial(struct file a, struct file b,
 677                          int alo, int ahi, int blo, int bhi)
 678 {
 679         struct v *v;
 680         struct cslb cslb = {};
 681         v = xmalloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
 682         v += bhi-alo+1;
 683
 684         lcsl(&a, alo, ahi,
 685              &b, blo, bhi,
 686              &cslb, v, 0);
 687         csl_add(&cslb, ahi, bhi, 0);
 688         free(v-(bhi-alo+1));
 689         fixup(&a, &b, cslb.csl);
 690         return cslb.csl;
 691 }
 692
 693 struct csl *csl_join(struct csl *c1, struct csl *c2)
 694 {
 695         int cnt1, cnt2;
 696         int offset = 0;
 697         if (c1 == NULL)
 698                 return c2;
 699         if (c2 == NULL)
 700                 return c1;
 701
 702         for (cnt1 = 0; c1[cnt1].len; cnt1++)
 703                 ;
 704         for (cnt2 = 0; c2[cnt2].len; cnt2++)
 705                 ;
 706         if (cnt1 && cnt2 &&
 707             c1[cnt1-1].a + c1[cnt1-1].len == c2[0].a &&
 708             c1[cnt1-1].b + c1[cnt1-1].len == c2[0].b) {
 709                 /* Merge these two */
 710                 c1[cnt1-1].len += c2[0].len;
 711                 offset = 1;
 712                 cnt2--;
 713         }
 714         c1 = realloc(c1, (cnt1+cnt2+1)*sizeof(*c1));
 715         while (cnt2 >= 0) {
 716                 c1[cnt1+cnt2] = c2[cnt2 + offset];
 717                 cnt2--;
 718         }
 719         free(c2);
 720         return c1;
 721 }
 722
 723 /* When rediffing a patch, we *must* make sure the hunk headers
 724  * line up.  So don't do a full diff, but rather find the hunk
 725  * headers and diff the bits between them.
 726  */
 727 struct csl *diff_patch(struct file a, struct file b, int shortest)
 728 {
 729         int ap, bp;
 730         struct csl *csl = NULL;
 731         if (a.elcnt == 0 || b.elcnt == 0 ||
 732             a.list[0].start[0] != '\0' ||
 733             b.list[0].start[0] != '\0')
 734                 /* this is not a patch */
 735                 return diff(a, b, shortest);
 736
 737         ap = 0; bp = 0;
 738         while (ap < a.elcnt && bp < b.elcnt) {
 739                 int alo = ap;
 740                 int blo = bp;
 741                 struct csl *cs;
 742
 743                 do
 744                         ap++;
 745                 while (ap < a.elcnt &&
 746                        a.list[ap].start[0] != '\0');
 747                 do
 748                         bp++;
 749                 while (bp < b.elcnt &&
 750                        b.list[bp].start[0] != '\0');
 751                 cs = diff_partial(a, b, alo, ap, blo, bp);
 752                 csl = csl_join(csl, cs);
 753         }
 754         return csl;
 755 }
 756
 757 #ifdef MAIN
 758
 759 main(int argc, char *argv[])
 760 {
 761         struct file a, b;
 762         struct csl *csl;
 763         struct elmnt *lst = xmalloc(argc*sizeof(*lst));
 764         int arg;
 765         struct v *v;
 766         int ln;
 767
 768         arg = 1;
 769         a.elcnt = 0;
 770         a.list = lst;
 771         while (argv[arg] && strcmp(argv[arg], "--")) {
 772                 lst->hash = 0;
 773                 lst->start = argv[arg];
 774                 lst->len = strlen(argv[arg]);
 775                 a.elcnt++;
 776                 lst++;
 777                 arg++;
 778         }
 779         if (!argv[arg]) {
 780                 printf("AARGH\n");
 781                 exit(1);
 782         }
 783         arg++;
 784         b.elcnt = 0;
 785         b.list = lst;
 786         while (argv[arg] && strcmp(argv[arg], "--")) {
 787                 lst->hash = 0;
 788                 lst->start = argv[arg];
 789                 lst->len = strlen(argv[arg]);
 790                 b.elcnt++;
 791                 lst++;
 792                 arg++;
 793         }
 794
 795         csl = diff(a, b, 1);
 796         fixup(&a, &b, csl);
 797         while (csl && csl->len) {
 798                 int i;
 799                 printf("%d,%d for %d:\n", csl->a, csl->b, csl->len);
 800                 for (i = 0; i < csl->len; i++) {
 801                         printf("  %.*s (%.*s)\n",
 802                                a.list[csl->a+i].len, a.list[csl->a+i].start,
 803                                b.list[csl->b+i].len, b.list[csl->b+i].start);
 804                 }
 805                 csl++;
 806         }
 807
 808         exit(0);
 809 }
 810
 811 #endif
 812