extract: removed unused variable: lineno
[wiggle/upstream.git] / diff.c
blob239d4099db67ac14aa4bc80c6cf70f845d03dc17
1 /*
2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
5 * Copyright (C) 2011 Neil Brown <neilb@suse.de>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 * Author: Neil Brown
23 * Email: <neilb@suse.de>
27 * Calculate longest common subsequence between two sequences
29 * Each sequence contains strings with
30 * hash start length
31 * We produce a list of tripples: a b len
32 * where A and B point to elements in the two sequences, and len is the number
33 * of common elements there. The list is terminated by an entry with len==0.
35 * This is roughly based on
36 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
37 * Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
38 * http://xmailserver.org/diff2.pdf
40 * However we don't run the basic algorithm both forward and backward until
41 * we find an overlap as Myers suggests. Rather we always run forwards, but
42 * we record the location of the (possibly empty) snake that crosses the
43 * midline. When we finish, this recorded location for the best path shows
44 * us where to divide and find further midpoints.
46 * In brief, the algorithm is as follows.
48 * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
49 * the first sequence (A, length a) and y co-ordinates correspond to symbols
50 * in the second sequence (B, length b). At the origin we have the first
51 * sequence.
52 * Movement in the x direction represents deleting the symbol as that point,
53 * so from x=i-1 to x=i deletes symbol i from A.
55 * Movement in the y direction represents adding the corresponding symbol
56 * from B. So to move from the origin 'a' spaces along X and then 'b' spaces
57 * up Y will remove all of the first sequence and then add all of the second
58 * sequence. Similarly moving firstly up the Y axis, then along the X
59 * direction will add the new sequence, then remove the old sequence. Thus
60 * the point a,b represents the second sequence and a part from 0,0 to a,b
61 * represent an sequence of edits to change A into B.
63 * There are clearly many paths from 0,0 to a,b going through different
64 * points in the matrix in different orders. At some points in the matrix
65 * the next symbol to be added from B is the same as the next symbol to be
66 * removed from A. At these points we can take a diagonal step to a new
67 * point in the matrix without actually changing any symbol. A sequence of
68 * these diagonal steps is called a 'snake'. The goal then is to find a path
69 * of x-steps (removals), y-steps (additions) and diagonals (steps and
70 * snakes) where the number of (non-diagonal) steps is minimal.
72 * i.e. we aim for as many long snakes as possible.
73 * If the total number of 'steps' is called the 'cost', we aim to minimise
74 * the cost.
76 * As storing the whole matrix in memory would be prohibitive with large
77 * sequences we limit ourselves to linear storage proportional to a+b and
78 * repeat the search at most log2(a+b) times building up the path as we go.
79 * Specifically we perform a search on the full matrix and record where each
80 * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid). This
81 * tells us the mid point of the best path. We then perform two searches,
82 * one on each of the two halves and find the 1/4 and 3/4 way points. This
83 * continues recursively until we have all points.
85 * The storage is an array v of 'struct v'. This is indexed by the
86 * diagonal-number k = x-y. Thus k can be negative and the array is
87 * allocated to allow for that. During the search there is an implicit value
88 * 'c' which is the cost (length in steps) of all the paths currently under
89 * consideration.
90 * v[k] stores details of the longest reaching path of cost c that finishes
91 * on diagonal k. "longest reaching" means "finishes closest to a,b".
92 * Details are:
93 * The location of the end point. 'x' is stored. y = x - k.
94 * The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
95 * y = (mid - md)/2
96 * = x - md
97 * (note: md is a diagonal so md = x-y. mid is an anti-diagonal: mid = x+y)
98 * The number of 'snakes' in the path (l). This is used to allocate the
99 * array which will record the snakes and to terminate recursion.
101 * A path with an even cost (c is even) must end on an even diagonal (k is
102 * even) and when c is odd, k must be odd. So the v[] array is treated as
103 * two sub arrays, the even part and the odd part. One represents paths of
104 * cost 'c', the other paths of cost c-1.
106 * Initially only v[0] is meaningful and there are no snakes. We firstly
107 * extend all paths under consideration with the longest possible snake on
108 * that diagonal.
110 * Then we increment 'c' and calculate for each suitable 'k' whether the best
111 * path to diagonal k of cost c comes from taking an x-step from the c-1 path
112 * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
113 * k+1. Obviously we need to avoid stepping out of the matrix. Finally we
114 * check if the 'v' array can be extended or reduced at the boundaries. If
115 * we hit a border we must reduce. If the best we could possibly do on that
116 * diagonal is less than the worst result from the current leading path, then
117 * we also reduce. Otherwise we extend the range of 'k's we consider.
119 * We continue until we find a path has reached a,b. This must be a minimal
120 * cost path (cost==c). At this point re-check the end of the snake at the
121 * midpoint and report that.
123 * This all happens recursively for smaller and smaller subranges stopping
124 * when we examine a submatrix and find that it contains no snakes. As we
125 * are usually dealing with sub-matrixes we are not walking from 0,0 to a,b
126 * from alo,blo to ahi,bhi - low point to high point. So the initial k is
127 * alo-blo, not 0.
131 #include "wiggle.h"
132 #include <stdlib.h>
134 struct v {
135 int x; /* x location of furthest reaching path of current cost */
136 int md; /* diagonal location of midline crossing */
137 int l; /* number of continuous common sequences found so far */
140 static int find_common(struct file *a, struct file *b,
141 int *alop, int *ahip,
142 int *blop, int *bhip,
143 struct v *v)
145 /* Examine matrix from alo to ahi and blo to bhi.
146 * i.e. including alo and blo, but less than ahi and bhi.
147 * Finding longest subsequence and
148 * return new {a,b}{lo,hi} either side of midline.
149 * i.e. mid = ( (ahi-alo) + (bhi-blo) ) / 2
150 * alo+blo <= mid <= ahi+bhi
151 * and alo,blo to ahi,bhi is a common (possibly empty)
152 * subseq - a snake.
154 * v is scratch space which is indexable from
155 * alo-bhi to ahi-blo inclusive.
156 * i.e. even though there is no symbol at ahi or bhi, we do
157 * consider paths that reach there as they simply cannot
158 * go further in that direction.
160 * Return the number of snakes found.
163 int klo, khi;
164 int alo = *alop;
165 int ahi = *ahip;
166 int blo = *blop;
167 int bhi = *bhip;
169 int mid = (ahi+bhi+alo+blo)/2;
171 /* 'worst' is the worst-case extra cost that we need
172 * to pay before reaching our destination. It assumes
173 * no more snakes in the furthest-reaching path so far.
174 * We use this to know when we can trim the extreme
175 * diagonals - when their best case does not improve on
176 * the current worst case.
178 int worst = (ahi-alo)+(bhi-blo);
180 klo = khi = alo-blo;
181 v[klo].x = alo;
182 v[klo].l = 0;
184 while (1) {
185 int x, y;
186 int cost;
187 int k;
189 /* Find the longest snake extending on each current
190 * diagonal, and record if it crosses the midline.
191 * If we reach the end, return.
193 for (k = klo ; k <= khi ; k += 2) {
194 int snake = 0;
196 x = v[k].x;
197 y = x-k;
198 if (y > bhi)
199 abort();
201 /* Follow any snake that is here */
202 while (x < ahi && y < bhi &&
203 match(&a->list[x], &b->list[y])
205 x++;
206 y++;
207 snake = 1;
210 /* Refine the worst-case remaining cost */
211 cost = (ahi-x)+(bhi-y);
212 if (cost < worst)
213 worst = cost;
215 /* Check for midline crossing */
216 if (x+y >= mid &&
217 v[k].x + v[k].x-k <= mid)
218 v[k].md = k;
220 v[k].x = x;
221 v[k].l += snake;
223 if (cost == 0) {
224 /* OK! We have arrived.
225 * We crossed the midpoint on diagonal v[k].md
227 if (x != ahi)
228 abort();
230 /* The snake could start earlier than the
231 * midline. We cannot just search backwards
232 * as that might find the wrong path - the
233 * greediness of the diff algorithm is
234 * asymmetric.
235 * We could record the start of the snake in
236 * 'v', but we will find the actual snake when
237 * we recurse so there is no need.
239 x = (v[k].md+mid)/2;
240 y = x-v[k].md;
242 *alop = x;
243 *blop = y;
245 /* Find the end of the snake using the same
246 * greedy approach as when we first found the
247 * snake
249 while (x < ahi && y < bhi &&
250 match(&a->list[x], &b->list[y])
252 x++;
253 y++;
255 *ahip = x;
256 *bhip = y;
258 return v[k].l;
262 /* No success with previous cost, so increment cost (c) by 1
263 * and for each other diagonal, set from the end point of the
264 * diagonal on one side of it or the other.
266 for (k = klo+1; k <= khi-1 ; k += 2) {
267 if (v[k-1].x+1 > ahi) {
268 /* cannot step to the right from previous
269 * diagonal as there is no room.
270 * So step up from next diagonal.
272 v[k] = v[k+1];
273 } else if (v[k+1].x - k > bhi || v[k-1].x+1 >= v[k+1].x) {
274 /* Cannot step up from next diagonal as either
275 * there is no room, or doing so wouldn't get us
276 * as close to the endpoint.
277 * So step to the right.
279 v[k] = v[k-1];
280 v[k].x++;
281 } else {
282 /* There is room in both directions, but
283 * stepping up from the next diagonal gets us
284 * closer
286 v[k] = v[k+1];
290 /* Now we need to either extend or contract klo and khi
291 * so they both change parity (odd vs even).
292 * If we extend we need to step up (for klo) or to the
293 * right (khi) from the adjacent diagonal. This is
294 * not possible if we have hit the edge of the matrix, and
295 * not sensible if the new point has a best case remaining
296 * cost that is worse than our current worst case remaining
297 * cost.
298 * The best-case remaining cost is the absolute difference
299 * between the remaining number of additions and the remaining
300 * number of deletions - and assumes lots of snakes.
302 /* new location if we step up from klo to klo-1*/
303 x = v[klo].x; y = x - (klo-1);
304 cost = abs((ahi-x)-(bhi-y));
305 if (y <= bhi && cost <= worst) {
306 /* Looks acceptable - step up. */
307 v[klo-1] = v[klo];
308 klo--;
309 } else
310 klo++;
312 /* new location if we step to the right from khi to khi+1 */
313 x = v[khi].x+1; y = x - (khi+1);
314 cost = abs((ahi-x)-(bhi-y));
315 if (x <= ahi && cost <= worst) {
316 /* Looks acceptable - step to the right */
317 v[khi+1] = v[khi];
318 v[khi+1].x++;
319 khi++;
320 } else
321 khi--;
325 static struct csl *lcsl(struct file *a, int alo, int ahi,
326 struct file *b, int blo, int bhi,
327 struct csl *csl,
328 struct v *v)
330 /* lcsl == longest common sub-list.
331 * This calls itself recursively as it finds the midpoint
332 * of the best path.
333 * On first call, 'csl' is NULL and will need to be allocated and
334 * is returned.
335 * On subsequence calls when 'csl' is not NULL, we add all the
336 * snakes we find to csl, and return a pointer to the next
337 * location where future snakes can be stored.
339 int len;
340 int alo1 = alo;
341 int ahi1 = ahi;
342 int blo1 = blo;
343 int bhi1 = bhi;
344 struct csl *rv = NULL;
346 if (ahi <= alo || bhi <= blo)
347 return csl;
349 len = find_common(a, b,
350 &alo1, &ahi1,
351 &blo1, &bhi1,
354 if (csl == NULL) {
355 /* 'len+1' to hold a sentinel */
356 rv = csl = xmalloc((len+1)*sizeof(*csl));
357 csl->len = 0;
359 if (len) {
360 /* There are more snakes to find - keep looking. */
362 /* With depth-first recursion, this adds all the snakes
363 * before 'alo1' to 'csl'
365 csl = lcsl(a, alo, alo1,
366 b, blo, blo1,
367 csl, v);
369 if (ahi1 > alo1) {
370 /* need to add this common seq, possibly attach
371 * to last
373 if (csl->len &&
374 csl->a+csl->len == alo1 &&
375 csl->b+csl->len == blo1) {
376 csl->len += ahi1-alo1;
377 } else {
378 if (csl->len)
379 csl++;
380 csl->len = ahi1-alo1;
381 csl->a = alo1;
382 csl->b = blo1;
383 csl[1].len = 0;
386 /* Now recurse to add all the snakes after ahi1 to csl */
387 csl = lcsl(a, ahi1, ahi,
388 b, bhi1, bhi,
389 csl, v);
391 if (rv) {
392 /* This was the first call. Record the endpoint
393 * as a snake of length 0. This might be extended.
394 * by 'fixup()' below.
396 if (csl->len)
397 csl++;
398 csl->a = ahi;
399 csl->b = bhi;
400 #if 1
401 if (rv+len != csl || csl->len != 0)
402 abort(); /* number of runs was wrong */
403 #endif
404 return rv;
405 } else
406 /* intermediate call - return where we are up to */
407 return csl;
410 /* If two common sequences are separated by only an add or remove,
411 * and the first sequence ends the same as the middle text,
412 * extend the second and contract the first in the hope that the
413 * first might become empty. This ameliorates against the greediness
414 * of the 'diff' algorithm.
415 * i.e. if we have:
416 * [ foo X ] X [ bar ]
417 * [ foo X ] [ bar ]
418 * Then change it to:
419 * [ foo ] X [ X bar ]
420 * [ foo ] [ X bar ]
421 * We treat the final zero-length 'csl' as a common sequence which
422 * can be extended so we must make sure to add a new zero-length csl
423 * to the end.
424 * If this doesn't make the first sequence disappear, and (one of the)
425 * X(s) was a newline, then move back so the newline is at the end
426 * of the first sequence. This encourages common sequences
427 * to be whole-line units where possible.
429 static void fixup(struct file *a, struct file *b, struct csl *list)
431 struct csl *list1, *orig;
432 int lasteol = -1;
433 int found_end = 0;
435 if (!list)
436 return;
438 /* 'list' and 'list1' are adjacent pointers into the csl.
439 * If a match gets deleted, they might not be physically
440 * adjacent any more. Once we get to the end of the list
441 * this will cease to matter - the list will be a bit
442 * shorter is all.
444 orig = list;
445 list1 = list+1;
446 while (list->len) {
447 if (list1->len == 0)
448 found_end = 1;
450 /* If a single token is either inserted or deleted
451 * immediately after a matching token...
453 if ((list->a+list->len == list1->a &&
454 list->b+list->len != list1->b &&
455 /* text at b inserted */
456 match(&b->list[list->b+list->len-1],
457 &b->list[list1->b-1])
460 (list->b+list->len == list1->b &&
461 list->a+list->len != list1->a &&
462 /* text at a deleted */
463 match(&a->list[list->a+list->len-1],
464 &a->list[list1->a-1])
467 /* If the last common token is a simple end-of-line
468 * record where it is. For a word-wise diff, this is
469 * any EOL. For a line-wise diff this is a blank line.
470 * If we are looking at a deletion it must be deleting
471 * the eol, so record that deleted eol.
473 if (ends_line(a->list[list->a+list->len-1])
474 && a->list[list->a+list->len-1].len == 1
475 && lasteol == -1
477 lasteol = list1->a-1;
479 /* Expand the second match, shrink the first */
480 list1->a--;
481 list1->b--;
482 list1->len++;
483 list->len--;
485 /* If the first match has become empty, make it
486 * disappear.. (and forget about the eol).
488 if (list->len == 0) {
489 lasteol = -1;
490 if (found_end) {
491 /* Deleting just before the last
492 * entry */
493 *list = *list1;
494 list1->a += list1->len;
495 list1->b += list1->len;
496 list1->len = 0;
497 } else if (list > orig)
498 /* Deleting in the middle */
499 list--;
500 else {
501 /* deleting the first entry */
502 *list = *list1++;
505 } else {
506 /* Nothing interesting here, though if we
507 * shuffled back past an eol, shuffle
508 * forward to line up with that eol.
509 * This causes an eol to bind more strongly
510 * with the preceding line than the following.
512 if (lasteol >= 0) {
513 while (list1->a <= lasteol
514 && (list1->len > 1 ||
515 (found_end && list1->len > 0))) {
516 list1->a++;
517 list1->b++;
518 list1->len--;
519 list->len++;
521 lasteol = -1;
523 *++list = *list1;
524 if (found_end) {
525 list1->a += list1->len;
526 list1->b += list1->len;
527 list1->len = 0;
528 } else
529 list1++;
531 if (list->len && list1 == list)
532 abort();
536 /* Main entry point - find the common-sub-list of files 'a' and 'b'.
537 * The final element in the list will have 'len' == 0 and will point
538 * beyond the end of the files.
540 struct csl *diff(struct file a, struct file b)
542 struct v *v;
543 struct csl *csl;
544 v = xmalloc(sizeof(struct v)*(a.elcnt+b.elcnt+2));
545 v += b.elcnt+1;
547 csl = lcsl(&a, 0, a.elcnt,
548 &b, 0, b.elcnt,
549 NULL, v);
550 free(v-(b.elcnt+1));
551 fixup(&a, &b, csl);
552 if (!csl) {
553 csl = xmalloc(sizeof(*csl));
554 csl->len = 0;
555 csl->a = a.elcnt;
556 csl->b = b.elcnt;
558 return csl;
561 /* Alternate entry point - find the common-sub-list in two
562 * subranges of files.
564 struct csl *diff_partial(struct file a, struct file b,
565 int alo, int ahi, int blo, int bhi)
567 struct v *v;
568 struct csl *csl;
569 v = xmalloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
570 v += bhi-alo+1;
572 csl = lcsl(&a, alo, ahi,
573 &b, blo, bhi,
574 NULL, v);
575 free(v-(bhi-alo+1));
576 fixup(&a, &b, csl);
577 return csl;
580 #ifdef MAIN
582 main(int argc, char *argv[])
584 struct file a, b;
585 struct csl *csl;
586 struct elmnt *lst = xmalloc(argc*sizeof(*lst));
587 int arg;
588 struct v *v;
589 int ln;
591 arg = 1;
592 a.elcnt = 0;
593 a.list = lst;
594 while (argv[arg] && strcmp(argv[arg], "--")) {
595 lst->hash = 0;
596 lst->start = argv[arg];
597 lst->len = strlen(argv[arg]);
598 a.elcnt++;
599 lst++;
600 arg++;
602 if (!argv[arg]) {
603 printf("AARGH\n");
604 exit(1);
606 arg++;
607 b.elcnt = 0;
608 b.list = lst;
609 while (argv[arg] && strcmp(argv[arg], "--")) {
610 lst->hash = 0;
611 lst->start = argv[arg];
612 lst->len = strlen(argv[arg]);
613 b.elcnt++;
614 lst++;
615 arg++;
618 csl = diff(a, b);
619 fixup(&a, &b, csl);
620 while (csl && csl->len) {
621 int i;
622 printf("%d,%d for %d:\n", csl->a, csl->b, csl->len);
623 for (i = 0; i < csl->len; i++) {
624 printf(" %.*s (%.*s)\n",
625 a.list[csl->a+i].len, a.list[csl->a+i].start,
626 b.list[csl->b+i].len, b.list[csl->b+i].start);
628 csl++;
631 exit(0);
634 #endif