diff.c

   1 /*
   2  * wiggle - apply rejected patches
   3  *
   4  * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
   5  * Copyright (C) 2011-2013 Neil Brown <neilb@suse.de>
   6  *
   7  *
   8  *    This program is free software; you can redistribute it and/or modify
   9  *    it under the terms of the GNU General Public License as published by
  10  *    the Free Software Foundation; either version 2 of the License, or
  11  *    (at your option) any later version.
  12  *
  13  *    This program is distributed in the hope that it will be useful,
  14  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  *    GNU General Public License for more details.
  17  *
  18  *    You should have received a copy of the GNU General Public License
  19  *    along with this program.
  20  *
  21  *    Author: Neil Brown
  22  *    Email: <neilb@suse.de>
  23  */
  24
  25 /*
  26  * Calculate longest common subsequence between two sequences
  27  *
  28  * Each sequence contains strings with
  29  *   hash start length
  30  * We produce a list of tripples: a b len
  31  * where A and B point to elements in the two sequences, and len is the number
  32  * of common elements there.  The list is terminated by an entry with len==0.
  33  *
  34  * This is roughly based on
  35  *   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  36  *   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  37  * http://xmailserver.org/diff2.pdf
  38  *
  39  * However we don't run the basic algorithm both forward and backward until
  40  * we find an overlap as Myers suggests.  Rather we always run forwards, but
  41  * we record the location of the (possibly empty) snake that crosses the
  42  * midline.  When we finish, this recorded location for the best path shows
  43  * us where to divide and find further midpoints.
  44  *
  45  * In brief, the algorithm is as follows.
  46  *
  47  * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
  48  * the first sequence (A, length a) and y co-ordinates correspond to symbols
  49  * in the second sequence (B, length b).  At the origin we have the first
  50  * sequence.
  51  * Movement in the x direction represents deleting the symbol as that point,
  52  * so from x=i-1 to x=i deletes symbol i from A.
  53
  54  * Movement in the y direction represents adding the corresponding symbol
  55  * from B.  So to move from the origin 'a' spaces along X and then 'b' spaces
  56  * up Y will remove all of the first sequence and then add all of the second
  57  * sequence.  Similarly moving firstly up the Y axis, then along the X
  58  * direction will add the new sequence, then remove the old sequence.  Thus
  59  * the point a,b represents the second sequence and a part from 0,0 to a,b
  60  * represent an sequence of edits to change A into B.
  61  *
  62  * There are clearly many paths from 0,0 to a,b going through different
  63  * points in the matrix in different orders.  At some points in the matrix
  64  * the next symbol to be added from B is the same as the next symbol to be
  65  * removed from A.  At these points we can take a diagonal step to a new
  66  * point in the matrix without actually changing any symbol.  A sequence of
  67  * these diagonal steps is called a 'snake'.  The goal then is to find a path
  68  * of x-steps (removals), y-steps (additions) and diagonals (steps and
  69  * snakes) where the number of (non-diagonal) steps is minimal.
  70  *
  71  * i.e. we aim for as many long snakes as possible.
  72  * If the total number of 'steps' is called the 'cost', we aim to minimise
  73  * the cost.
  74  *
  75  * As storing the whole matrix in memory would be prohibitive with large
  76  * sequences we limit ourselves to linear storage proportional to a+b and
  77  * repeat the search at most log2(a+b) times building up the path as we go.
  78  * Specifically we perform a search on the full matrix and record where each
  79  * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid).  This
  80  * tells us the mid point of the best path.  We then perform two searches,
  81  * one on each of the two halves and find the 1/4 and 3/4 way points.  This
  82  * continues recursively until we have all points.
  83  *
  84  * The storage is an array v of 'struct v'.  This is indexed by the
  85  * diagonal-number k = x-y.  Thus k can be negative and the array is
  86  * allocated to allow for that.  During the search there is an implicit value
  87  * 'c' which is the cost (length in steps) of all the paths currently under
  88  * consideration.
  89  * v[k] stores details of the longest reaching path of cost c that finishes
  90  *      on diagonal k.  "longest reaching" means "finishes closest to a,b".
  91  * Details are:
  92  *   The location of the end point.  'x' is stored. y = x - k.
  93  *   The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
  94  *                                                        y = (mid - md)/2
  95  *                                                          = x - md
  96  *   (note: md is a diagonal so md = x-y.  mid is an anti-diagonal: mid = x+y)
  97  *   The number of 'snakes' in the path (l).  This is used to allocate the
  98  *   array which will record the snakes and to terminate recursion.
  99  *
 100  * A path with an even cost (c is even) must end on an even diagonal (k is
 101  * even) and when c is odd, k must be odd.  So the v[] array is treated as
 102  * two sub arrays, the even part and the odd part.  One represents paths of
 103  * cost 'c', the other paths of cost c-1.
 104  *
 105  * Initially only v[0] is meaningful and there are no snakes.  We firstly
 106  * extend all paths under consideration with the longest possible snake on
 107  * that diagonal.
 108  *
 109  * Then we increment 'c' and calculate for each suitable 'k' whether the best
 110  * path to diagonal k of cost c comes from taking an x-step from the c-1 path
 111  * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
 112  * k+1.  Obviously we need to avoid stepping out of the matrix.  Finally we
 113  * check if the 'v' array can be extended or reduced at the boundaries.  If
 114  * we hit a border we must reduce.  If the best we could possibly do on that
 115  * diagonal is less than the worst result from the current leading path, then
 116  * we also reduce.  Otherwise we extend the range of 'k's we consider.
 117  *
 118  * We continue until we find a path has reached a,b.  This must be a minimal
 119  * cost path (cost==c).  At this point re-check the end of the snake at the
 120  * midpoint and report that.
 121  *
 122  * This all happens recursively for smaller and smaller subranges stopping
 123  * when we examine a submatrix and find that it contains no snakes.  As we
 124  * are usually dealing with sub-matrixes we are not walking from 0,0 to a,b
 125  * from alo,blo to ahi,bhi - low point to high point.  So the initial k is
 126  * alo-blo, not 0.
 127  *
 128  */
 129
 130 #include        "wiggle.h"
 131 #include        <stdlib.h>
 132
 133 struct v {
 134         int x;  /* x location of furthest reaching path of current cost */
 135         int md; /* diagonal location of midline crossing */
 136         int l;  /* number of continuous common sequences found so far */
 137 };
 138
 139 static int find_common(struct file *a, struct file *b,
 140                             int *alop, int *ahip,
 141                             int *blop, int *bhip,
 142                             struct v *v)
 143 {
 144         /* Examine matrix from alo to ahi and blo to bhi.
 145          * i.e. including alo and blo, but less than ahi and bhi.
 146          * Finding longest subsequence and
 147          * return new {a,b}{lo,hi} either side of midline.
 148          * i.e. mid = ( (ahi-alo) + (bhi-blo) ) / 2
 149          *      alo+blo <= mid <= ahi+bhi
 150          *  and alo,blo to ahi,bhi is a common (possibly empty)
 151          *  subseq - a snake.
 152          *
 153          * v is scratch space which is indexable from
 154          * alo-bhi to ahi-blo inclusive.
 155          * i.e. even though there is no symbol at ahi or bhi, we do
 156          * consider paths that reach there as they simply cannot
 157          * go further in that direction.
 158          *
 159          * Return the number of snakes found.
 160          */
 161
 162         int klo, khi;
 163         int alo = *alop;
 164         int ahi = *ahip;
 165         int blo = *blop;
 166         int bhi = *bhip;
 167
 168         int mid = (ahi+bhi+alo+blo)/2;
 169
 170         /* 'worst' is the worst-case extra cost that we need
 171          * to pay before reaching our destination.  It assumes
 172          * no more snakes in the furthest-reaching path so far.
 173          * We use this to know when we can trim the extreme
 174          * diagonals - when their best case does not improve on
 175          * the current worst case.
 176          */
 177         int worst = (ahi-alo)+(bhi-blo);
 178
 179         klo = khi = alo-blo;
 180         v[klo].x = alo;
 181         v[klo].l = 0;
 182
 183         while (1) {
 184                 int x, y;
 185                 int cost;
 186                 int k;
 187
 188                 /* Find the longest snake extending on each current
 189                  * diagonal, and record if it crosses the midline.
 190                  * If we reach the end, return.
 191                  */
 192                 for (k = klo ; k <= khi ; k += 2) {
 193                         int snake = 0;
 194
 195                         x = v[k].x;
 196                         y = x-k;
 197                         if (y > bhi)
 198                                 abort();
 199
 200                         /* Follow any snake that is here */
 201                         while (x < ahi && y < bhi &&
 202                                match(&a->list[x], &b->list[y])
 203                                 ) {
 204                                 x++;
 205                                 y++;
 206                                 snake = 1;
 207                         }
 208
 209                         /* Refine the worst-case remaining cost */
 210                         cost = (ahi-x)+(bhi-y);
 211                         if (cost < worst)
 212                                 worst = cost;
 213
 214                         /* Check for midline crossing */
 215                         if (x+y >= mid &&
 216                              v[k].x + v[k].x-k <= mid)
 217                                 v[k].md = k;
 218
 219                         v[k].x = x;
 220                         v[k].l += snake;
 221
 222                         if (cost == 0) {
 223                                 /* OK! We have arrived.
 224                                  * We crossed the midpoint on diagonal v[k].md
 225                                  */
 226                                 if (x != ahi)
 227                                         abort();
 228
 229                                 /* The snake could start earlier than the
 230                                  * midline.  We cannot just search backwards
 231                                  * as that might find the wrong path - the
 232                                  * greediness of the diff algorithm is
 233                                  * asymmetric.
 234                                  * We could record the start of the snake in
 235                                  * 'v', but we will find the actual snake when
 236                                  * we recurse so there is no need.
 237                                  */
 238                                 x = (v[k].md+mid)/2;
 239                                 y = x-v[k].md;
 240
 241                                 *alop = x;
 242                                 *blop = y;
 243
 244                                 /* Find the end of the snake using the same
 245                                  * greedy approach as when we first found the
 246                                  * snake
 247                                  */
 248                                 while (x < ahi && y < bhi &&
 249                                        match(&a->list[x], &b->list[y])
 250                                         ) {
 251                                         x++;
 252                                         y++;
 253                                 }
 254                                 *ahip = x;
 255                                 *bhip = y;
 256
 257                                 return v[k].l;
 258                         }
 259                 }
 260
 261                 /* No success with previous cost, so increment cost (c) by 1
 262                  * and for each other diagonal, set from the end point of the
 263                  * diagonal on one side of it or the other.
 264                  */
 265                 for (k = klo+1; k <= khi-1 ; k += 2) {
 266                         if (v[k-1].x+1 > ahi) {
 267                                 /* cannot step to the right from previous
 268                                  * diagonal as there is no room.
 269                                  * So step up from next diagonal.
 270                                  */
 271                                 v[k] = v[k+1];
 272                         } else if (v[k+1].x - k > bhi || v[k-1].x+1 >= v[k+1].x) {
 273                                 /* Cannot step up from next diagonal as either
 274                                  * there is no room, or doing so wouldn't get us
 275                                  * as close to the endpoint.
 276                                  * So step to the right.
 277                                  */
 278                                 v[k] = v[k-1];
 279                                 v[k].x++;
 280                         } else {
 281                                 /* There is room in both directions, but
 282                                  * stepping up from the next diagonal gets us
 283                                  * closer
 284                                  */
 285                                 v[k] = v[k+1];
 286                         }
 287                 }
 288
 289                 /* Now we need to either extend or contract klo and khi
 290                  * so they both change parity (odd vs even).
 291                  * If we extend we need to step up (for klo) or to the
 292                  * right (khi) from the adjacent diagonal.  This is
 293                  * not possible if we have hit the edge of the matrix, and
 294                  * not sensible if the new point has a best case remaining
 295                  * cost that is worse than our current worst case remaining
 296                  * cost.
 297                  * The best-case remaining cost is the absolute difference
 298                  * between the remaining number of additions and the remaining
 299                  * number of deletions - and assumes lots of snakes.
 300                  */
 301                 /* new location if we step up from klo to klo-1*/
 302                 x = v[klo].x; y = x - (klo-1);
 303                 cost = abs((ahi-x)-(bhi-y));
 304                 if (y <= bhi && cost <= worst) {
 305                         /* Looks acceptable - step up. */
 306                         v[klo-1] = v[klo];
 307                         klo--;
 308                 } else
 309                         klo++;
 310
 311                 /* new location if we step to the right from khi to khi+1 */
 312                 x = v[khi].x+1; y = x - (khi+1);
 313                 cost = abs((ahi-x)-(bhi-y));
 314                 if (x <= ahi && cost <= worst) {
 315                         /* Looks acceptable - step to the right */
 316                         v[khi+1] = v[khi];
 317                         v[khi+1].x++;
 318                         khi++;
 319                 } else
 320                         khi--;
 321         }
 322 }
 323
 324 static struct csl *lcsl(struct file *a, int alo, int ahi,
 325                         struct file *b, int blo, int bhi,
 326                         struct csl *csl,
 327                         struct v *v)
 328 {
 329         /* lcsl == longest common sub-list.
 330          * This calls itself recursively as it finds the midpoint
 331          * of the best path.
 332          * On first call, 'csl' is NULL and will need to be allocated and
 333          * is returned.
 334          * On subsequence calls when 'csl' is not NULL, we add all the
 335          * snakes we find to csl, and return a pointer to the next
 336          * location where future snakes can be stored.
 337          */
 338         int len;
 339         int alo1 = alo;
 340         int ahi1 = ahi;
 341         int blo1 = blo;
 342         int bhi1 = bhi;
 343         struct csl *rv = NULL;
 344
 345         if (ahi <= alo || bhi <= blo)
 346                 return csl;
 347
 348         len = find_common(a, b,
 349                           &alo1, &ahi1,
 350                           &blo1, &bhi1,
 351                           v);
 352
 353         if (csl == NULL) {
 354                 /* 'len+1' to hold a sentinel */
 355                 rv = csl = xmalloc((len+1)*sizeof(*csl));
 356                 csl->len = 0;
 357         }
 358         if (len) {
 359                 /* There are more snakes to find - keep looking. */
 360
 361                 /* With depth-first recursion, this adds all the snakes
 362                  * before 'alo1' to 'csl'
 363                  */
 364                 csl = lcsl(a, alo, alo1,
 365                            b, blo, blo1,
 366                            csl, v);
 367
 368                 if (ahi1 > alo1) {
 369                         /* need to add this common seq, possibly attach
 370                          * to last
 371                          */
 372                         if (csl->len &&
 373                             csl->a+csl->len == alo1 &&
 374                             csl->b+csl->len == blo1) {
 375                                 csl->len += ahi1-alo1;
 376                         } else {
 377                                 if (csl->len)
 378                                         csl++;
 379                                 csl->len = ahi1-alo1;
 380                                 csl->a = alo1;
 381                                 csl->b = blo1;
 382                                 csl[1].len = 0;
 383                         }
 384                 }
 385                 /* Now recurse to add all the snakes after ahi1 to csl */
 386                 csl = lcsl(a, ahi1, ahi,
 387                            b, bhi1, bhi,
 388                            csl, v);
 389         }
 390         if (rv) {
 391                 /* This was the first call.  Record the endpoint
 392                  * as a snake of length 0.  This might be extended.
 393                  * by 'fixup()' below.
 394                  */
 395                 if (csl->len)
 396                         csl++;
 397                 csl->a = ahi;
 398                 csl->b = bhi;
 399 #if 1
 400                 if (rv+len != csl || csl->len != 0)
 401                         abort(); /* number of runs was wrong */
 402 #endif
 403                 return rv;
 404         } else
 405                 /* intermediate call - return where we are up to */
 406                 return csl;
 407 }
 408
 409 /* If two common sequences are separated by only an add or remove,
 410  * and the first sequence ends the same as the middle text,
 411  * extend the second and contract the first in the hope that the
 412  * first might become empty.  This ameliorates against the greediness
 413  * of the 'diff' algorithm.
 414  * i.e. if we have:
 415  *   [ foo X ] X [ bar ]
 416  *   [ foo X ] [ bar ]
 417  * Then change it to:
 418  *   [ foo ] X [ X bar ]
 419  *   [ foo ] [ X bar ]
 420  * We treat the final zero-length 'csl' as a common sequence which
 421  * can be extended so we must make sure to add a new zero-length csl
 422  * to the end.
 423  * If this doesn't make the first sequence disappear, and (one of the)
 424  * X(s) was a newline, then move back so the newline is at the end
 425  * of the first sequence.  This encourages common sequences
 426  * to be whole-line units where possible.
 427  */
 428 static void fixup(struct file *a, struct file *b, struct csl *list)
 429 {
 430         struct csl *list1, *orig;
 431         int lasteol = -1;
 432         int found_end = 0;
 433
 434         if (!list)
 435                 return;
 436
 437         /* 'list' and 'list1' are adjacent pointers into the csl.
 438          * If a match gets deleted, they might not be physically
 439          * adjacent any more.  Once we get to the end of the list
 440          * this will cease to matter - the list will be a bit
 441          * shorter is all.
 442          */
 443         orig = list;
 444         list1 = list+1;
 445         while (list->len) {
 446                 if (list1->len == 0)
 447                         found_end = 1;
 448
 449                 /* If a single token is either inserted or deleted
 450                  * immediately after a matching token...
 451                  */
 452                 if ((list->a+list->len == list1->a &&
 453                      list->b+list->len != list1->b &&
 454                      /* text at b inserted */
 455                      match(&b->list[list->b+list->len-1],
 456                            &b->list[list1->b-1])
 457                             )
 458                     ||
 459                     (list->b+list->len == list1->b &&
 460                      list->a+list->len != list1->a &&
 461                      /* text at a deleted */
 462                      match(&a->list[list->a+list->len-1],
 463                            &a->list[list1->a-1])
 464                             )
 465                         ) {
 466                         /* If the last common token is a simple end-of-line
 467                          * record where it is.  For a word-wise diff, this is
 468                          * any EOL.  For a line-wise diff this is a blank line.
 469                          * If we are looking at a deletion it must be deleting
 470                          * the eol, so record that deleted eol.
 471                          */
 472                         if (ends_line(a->list[list->a+list->len-1])
 473                             && a->list[list->a+list->len-1].len == 1
 474                             && lasteol == -1
 475                                 ) {
 476                                 lasteol = list1->a-1;
 477                         }
 478                         /* Expand the second match, shrink the first */
 479                         list1->a--;
 480                         list1->b--;
 481                         list1->len++;
 482                         list->len--;
 483
 484                         /* If the first match has become empty, make it
 485                          * disappear.. (and forget about the eol).
 486                          */
 487                         if (list->len == 0) {
 488                                 lasteol = -1;
 489                                 if (found_end) {
 490                                         /* Deleting just before the last
 491                                          * entry */
 492                                         *list = *list1;
 493                                         list1->a += list1->len;
 494                                         list1->b += list1->len;
 495                                         list1->len = 0;
 496                                 } else if (list > orig)
 497                                         /* Deleting in the  middle */
 498                                         list--;
 499                                 else {
 500                                         /* deleting the first entry */
 501                                         *list = *list1++;
 502                                 }
 503                         }
 504                 } else {
 505                         /* Nothing interesting here, though if we
 506                          * shuffled back past an eol, shuffle
 507                          * forward to line up with that eol.
 508                          * This causes an eol to bind more strongly
 509                          * with the preceding line than the following.
 510                          */
 511                         if (lasteol >= 0) {
 512                                 while (list1->a <= lasteol
 513                                        && (list1->len > 1 ||
 514                                            (found_end && list1->len > 0))) {
 515                                         list1->a++;
 516                                         list1->b++;
 517                                         list1->len--;
 518                                         list->len++;
 519                                 }
 520                                 lasteol = -1;
 521                         }
 522                         *++list = *list1;
 523                         if (found_end) {
 524                                 list1->a += list1->len;
 525                                 list1->b += list1->len;
 526                                 list1->len = 0;
 527                         } else
 528                                 list1++;
 529                 }
 530                 if (list->len && list1 == list)
 531                         abort();
 532         }
 533 }
 534
 535 static int elcmp(const void *v1, const void *v2)
 536 {
 537         const struct elmnt *e1 = v1;
 538         const struct elmnt *e2 = v2;
 539
 540         if (e1->hash != e2->hash) {
 541                 if (e1->hash < e2->hash)
 542                         return -1;
 543                 return 1;
 544         }
 545         if (e1->start[0] == 0 && e2->start[0] == 0)
 546                 return 0;
 547         if (e1->len != e2->len)
 548                 return e1->len - e2->len;
 549         return strncmp(e1->start, e2->start, e1->len);
 550 }
 551
 552 #define BPL (sizeof(unsigned long) * 8)
 553 static struct file filter_unique(struct file f, struct file ref)
 554 {
 555         /* Use a bloom-filter to record all hashes in 'ref' and
 556          * then if there are consequtive entries in 'f' that are
 557          * not in 'ref', reduce each such run to 1 entry
 558          */
 559         struct file n;
 560         int fi, cnt;
 561         struct file sorted;
 562
 563         sorted.list = xmalloc(sizeof(sorted.list[0]) * ref.elcnt);
 564         sorted.elcnt = ref.elcnt;
 565         memcpy(sorted.list, ref.list, sizeof(sorted.list[0]) * sorted.elcnt);
 566         qsort(sorted.list, sorted.elcnt, sizeof(sorted.list[0]),
 567               elcmp);
 568
 569         n.list = xmalloc(sizeof(n.list[0]) * f.elcnt);
 570         n.elcnt = 0;
 571         cnt = 0;
 572         for (fi = 0; fi < f.elcnt; fi++) {
 573                 int lo = 0, hi = sorted.elcnt;
 574                 while (lo + 1 < hi) {
 575                         int mid = (lo + hi) / 2;
 576                         if (elcmp(&f.list[fi], &sorted.list[mid]) < 0)
 577                                 hi = mid;
 578                         else
 579                                 lo = mid;
 580                 }
 581                 if (match(&f.list[fi], &sorted.list[lo]))
 582                         cnt = 0;
 583                 else
 584                         cnt += 1;
 585                 if (cnt <= 1)
 586                         n.list[n.elcnt++] = f.list[fi];
 587         }
 588         free(sorted.list);
 589         return n;
 590 }
 591
 592 static void remap(struct csl *csl, int which, struct file from, struct file to)
 593 {
 594         /* The a,b pointer in csl points to 'from' we need to remap to 'to'.
 595          * 'to' has everything that 'from' has, plus more.
 596          * Each list[].start is unique
 597          */
 598         int ti = 0;
 599         while (csl->len) {
 600                 int fi = which ? csl->b : csl->a;
 601                 while (to.list[ti].start != from.list[fi].start) {
 602                         ti += 1;
 603                         if (ti > to.elcnt)
 604                                 abort();
 605                 }
 606                 if (which)
 607                         csl->b = ti;
 608                 else
 609                         csl->a = ti;
 610                 csl += 1;
 611         }
 612         if (which)
 613                 csl->b = to.elcnt;
 614         else
 615                 csl->a = to.elcnt;
 616 }
 617 /* Main entry point - find the common-sub-list of files 'a' and 'b'.
 618  * The final element in the list will have 'len' == 0 and will point
 619  * beyond the end of the files.
 620  */
 621 struct csl *diff(struct file a, struct file b)
 622 {
 623         struct v *v;
 624         struct csl *csl;
 625         struct file af, bf;
 626
 627         /* Remove runs of 2 or more elements in one file that don't
 628          * exist in the other file.  This often makes the number of
 629          * elements more manageable.
 630          */
 631         af = filter_unique(a, b);
 632         bf = filter_unique(b, a);
 633
 634         v = xmalloc(sizeof(struct v)*(af.elcnt+bf.elcnt+2));
 635         v += bf.elcnt+1;
 636
 637         csl = lcsl(&af, 0, af.elcnt,
 638                    &bf, 0, bf.elcnt,
 639                    NULL, v);
 640         free(v-(bf.elcnt+1));
 641         remap(csl, 0, af, a);
 642         remap(csl, 1, bf, b);
 643         free(af.list);
 644         free(bf.list);
 645         fixup(&a, &b, csl);
 646         if (!csl) {
 647                 csl = xmalloc(sizeof(*csl));
 648                 csl->len = 0;
 649                 csl->a = a.elcnt;
 650                 csl->b = b.elcnt;
 651         }
 652         return csl;
 653 }
 654
 655 /* Alternate entry point - find the common-sub-list in two
 656  * subranges of files.
 657  */
 658 struct csl *diff_partial(struct file a, struct file b,
 659                          int alo, int ahi, int blo, int bhi)
 660 {
 661         struct v *v;
 662         struct csl *csl;
 663         v = xmalloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
 664         v += bhi-alo+1;
 665
 666         csl = lcsl(&a, alo, ahi,
 667                    &b, blo, bhi,
 668                    NULL, v);
 669         free(v-(bhi-alo+1));
 670         fixup(&a, &b, csl);
 671         return csl;
 672 }
 673
 674 struct csl *csl_join(struct csl *c1, struct csl *c2)
 675 {
 676         int cnt1, cnt2;
 677         int offset = 0;
 678         if (c1 == NULL)
 679                 return c2;
 680         if (c2 == NULL)
 681                 return c1;
 682
 683         for (cnt1 = 0; c1[cnt1].len; cnt1++)
 684                 ;
 685         for (cnt2 = 0; c2[cnt2].len; cnt2++)
 686                 ;
 687         if (cnt1 && cnt2 &&
 688             c1[cnt1-1].a + c1[cnt1-1].len == c2[0].a &&
 689             c1[cnt1-1].b + c1[cnt1-1].len == c2[0].b) {
 690                 /* Merge these two */
 691                 c1[cnt1-1].len += c2[0].len;
 692                 offset = 1;
 693                 cnt2--;
 694         }
 695         c1 = realloc(c1, (cnt1+cnt2+1)*sizeof(*c1));
 696         while (cnt2 >= 0) {
 697                 c1[cnt1+cnt2] = c2[cnt2 + offset];
 698                 cnt2--;
 699         }
 700         free(c2);
 701         return c1;
 702 }
 703
 704 /* When rediffing a patch, we *must* make sure the hunk headers
 705  * line up.  So don't do a full diff, but rather find the hunk
 706  * headers and diff the bits between them.
 707  */
 708 struct csl *diff_patch(struct file a, struct file b)
 709 {
 710         int ap, bp;
 711         struct csl *csl = NULL;
 712         if (a.elcnt == 0 || b.elcnt == 0 ||
 713             a.list[0].start[0] != '\0' ||
 714             b.list[0].start[0] != '\0')
 715                 /* this is not a patch */
 716                 return diff(a, b);
 717
 718         ap = 0; bp = 0;
 719         while (ap < a.elcnt && bp < b.elcnt) {
 720                 int alo = ap;
 721                 int blo = bp;
 722                 struct csl *cs;
 723
 724                 do
 725                         ap++;
 726                 while (ap < a.elcnt &&
 727                        a.list[ap].start[0] != '\0');
 728                 do
 729                         bp++;
 730                 while (bp < b.elcnt &&
 731                        b.list[bp].start[0] != '\0');
 732                 cs = diff_partial(a, b, alo, ap, blo, bp);
 733                 csl = csl_join(csl, cs);
 734         }
 735         return csl;
 736 }
 737
 738 #ifdef MAIN
 739
 740 main(int argc, char *argv[])
 741 {
 742         struct file a, b;
 743         struct csl *csl;
 744         struct elmnt *lst = xmalloc(argc*sizeof(*lst));
 745         int arg;
 746         struct v *v;
 747         int ln;
 748
 749         arg = 1;
 750         a.elcnt = 0;
 751         a.list = lst;
 752         while (argv[arg] && strcmp(argv[arg], "--")) {
 753                 lst->hash = 0;
 754                 lst->start = argv[arg];
 755                 lst->len = strlen(argv[arg]);
 756                 a.elcnt++;
 757                 lst++;
 758                 arg++;
 759         }
 760         if (!argv[arg]) {
 761                 printf("AARGH\n");
 762                 exit(1);
 763         }
 764         arg++;
 765         b.elcnt = 0;
 766         b.list = lst;
 767         while (argv[arg] && strcmp(argv[arg], "--")) {
 768                 lst->hash = 0;
 769                 lst->start = argv[arg];
 770                 lst->len = strlen(argv[arg]);
 771                 b.elcnt++;
 772                 lst++;
 773                 arg++;
 774         }
 775
 776         csl = diff(a, b);
 777         fixup(&a, &b, csl);
 778         while (csl && csl->len) {
 779                 int i;
 780                 printf("%d,%d for %d:\n", csl->a, csl->b, csl->len);
 781                 for (i = 0; i < csl->len; i++) {
 782                         printf("  %.*s (%.*s)\n",
 783                                a.list[csl->a+i].len, a.list[csl->a+i].start,
 784                                b.list[csl->b+i].len, b.list[csl->b+i].start);
 785                 }
 786                 csl++;
 787         }
 788
 789         exit(0);
 790 }
 791
 792 #endif
 793