2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
5 * Copyright (C) 2011-2013 Neil Brown <neilb@suse.de>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program.
22 * Email: <neilb@suse.de>
26 * Calculate longest common subsequence between two sequences
28 * Each sequence contains strings with
30 * We produce a list of tripples: a b len
31 * where A and B point to elements in the two sequences, and len is the number
32 * of common elements there. The list is terminated by an entry with len==0.
34 * This is roughly based on
35 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
36 * Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
37 * http://xmailserver.org/diff2.pdf
39 * However we don't run the basic algorithm both forward and backward until
40 * we find an overlap as Myers suggests. Rather we always run forwards, but
41 * we record the location of the (possibly empty) snake that crosses the
42 * midline. When we finish, this recorded location for the best path shows
43 * us where to divide and find further midpoints.
45 * In brief, the algorithm is as follows.
47 * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
48 * the first sequence (A, length a) and y co-ordinates correspond to symbols
49 * in the second sequence (B, length b). At the origin we have the first
51 * Movement in the x direction represents deleting the symbol as that point,
52 * so from x=i-1 to x=i deletes symbol i from A.
54 * Movement in the y direction represents adding the corresponding symbol
55 * from B. So to move from the origin 'a' spaces along X and then 'b' spaces
56 * up Y will remove all of the first sequence and then add all of the second
57 * sequence. Similarly moving firstly up the Y axis, then along the X
58 * direction will add the new sequence, then remove the old sequence. Thus
59 * the point a,b represents the second sequence and a part from 0,0 to a,b
60 * represent an sequence of edits to change A into B.
62 * There are clearly many paths from 0,0 to a,b going through different
63 * points in the matrix in different orders. At some points in the matrix
64 * the next symbol to be added from B is the same as the next symbol to be
65 * removed from A. At these points we can take a diagonal step to a new
66 * point in the matrix without actually changing any symbol. A sequence of
67 * these diagonal steps is called a 'snake'. The goal then is to find a path
68 * of x-steps (removals), y-steps (additions) and diagonals (steps and
69 * snakes) where the number of (non-diagonal) steps is minimal.
71 * i.e. we aim for as many long snakes as possible.
72 * If the total number of 'steps' is called the 'cost', we aim to minimise
75 * As storing the whole matrix in memory would be prohibitive with large
76 * sequences we limit ourselves to linear storage proportional to a+b and
77 * repeat the search at most log2(a+b) times building up the path as we go.
78 * Specifically we perform a search on the full matrix and record where each
79 * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid). This
80 * tells us the mid point of the best path. We then perform two searches,
81 * one on each of the two halves and find the 1/4 and 3/4 way points. This
82 * continues recursively until we have all points.
84 * The storage is an array v of 'struct v'. This is indexed by the
85 * diagonal-number k = x-y. Thus k can be negative and the array is
86 * allocated to allow for that. During the search there is an implicit value
87 * 'c' which is the cost (length in steps) of all the paths currently under
89 * v[k] stores details of the longest reaching path of cost c that finishes
90 * on diagonal k. "longest reaching" means "finishes closest to a,b".
92 * The location of the end point. 'x' is stored. y = x - k.
93 * The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
96 * (note: md is a diagonal so md = x-y. mid is an anti-diagonal: mid = x+y)
97 * The number of 'snakes' in the path (l). This is used to allocate the
98 * array which will record the snakes and to terminate recursion.
100 * A path with an even cost (c is even) must end on an even diagonal (k is
101 * even) and when c is odd, k must be odd. So the v[] array is treated as
102 * two sub arrays, the even part and the odd part. One represents paths of
103 * cost 'c', the other paths of cost c-1.
105 * Initially only v[0] is meaningful and there are no snakes. We firstly
106 * extend all paths under consideration with the longest possible snake on
109 * Then we increment 'c' and calculate for each suitable 'k' whether the best
110 * path to diagonal k of cost c comes from taking an x-step from the c-1 path
111 * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
112 * k+1. Obviously we need to avoid stepping out of the matrix. Finally we
113 * check if the 'v' array can be extended or reduced at the boundaries. If
114 * we hit a border we must reduce. If the best we could possibly do on that
115 * diagonal is less than the worst result from the current leading path, then
116 * we also reduce. Otherwise we extend the range of 'k's we consider.
118 * We continue until we find a path has reached a,b. This must be a minimal
119 * cost path (cost==c). At this point re-check the end of the snake at the
120 * midpoint and report that.
122 * This all happens recursively for smaller and smaller subranges stopping
123 * when we examine a submatrix and find that it contains no snakes. As we
124 * are usually dealing with sub-matrixes we are not walking from 0,0 to a,b
125 * from alo,blo to ahi,bhi - low point to high point. So the initial k is
132 #include <sys/time.h>
135 int x
; /* x location of furthest reaching path of current cost */
136 int md
; /* diagonal location of midline crossing */
137 int l
; /* number of continuous common sequences found so far */
140 static int find_common(struct file
*a
, struct file
*b
,
141 int *alop
, int *ahip
,
142 int *blop
, int *bhip
,
143 struct v
*v
, int shortcut
)
145 /* Examine matrix from alo to ahi and blo to bhi.
146 * i.e. including alo and blo, but less than ahi and bhi.
147 * Finding longest subsequence and
148 * return new {a,b}{lo,hi} either side of midline.
149 * i.e. mid = ( (ahi-alo) + (bhi-blo) ) / 2
150 * alo+blo <= mid <= ahi+bhi
151 * and alo,blo to ahi,bhi is a common (possibly empty)
154 * v is scratch space which is indexable from
155 * alo-bhi to ahi-blo inclusive.
156 * i.e. even though there is no symbol at ahi or bhi, we do
157 * consider paths that reach there as they simply cannot
158 * go further in that direction.
160 * Return the number of snakes found.
163 struct timeval start
, stop
;
170 int mid
= (ahi
+bhi
+alo
+blo
)/2;
172 /* 'worst' is the worst-case extra cost that we need
173 * to pay before reaching our destination. It assumes
174 * no more snakes in the furthest-reaching path so far.
175 * We use this to know when we can trim the extreme
176 * diagonals - when their best case does not improve on
177 * the current worst case.
179 int worst
= (ahi
-alo
)+(bhi
-blo
);
182 shortcut
= !!shortcut
;
184 char *lc
= getenv("WIGGLE_LOOPCOUNT");
186 loopcount
= atoi(lc
);
189 gettimeofday(&start
, NULL
);
208 gettimeofday(&stop
, NULL
) == 0 &&
209 (stop
.tv_sec
- start
.tv_sec
) * 1000000 +
210 (stop
.tv_usec
- start
.tv_usec
) > 20000)))
211 /* 20ms is a long time. Time to take a shortcut
215 /* Find the longest snake extending on each current
216 * diagonal, and record if it crosses the midline.
217 * If we reach the end, return.
219 for (k
= klo
; k
<= khi
; k
+= 2) {
227 /* Follow any snake that is here */
228 while (x
< ahi
&& y
< bhi
&&
229 match(&a
->list
[x
], &b
->list
[y
])
236 /* Refine the worst-case remaining cost */
237 cost
= (ahi
-x
)+(bhi
-y
);
240 if (snake
&& shortcut
== 2) {
248 /* Check for midline crossing */
250 v
[k
].x
+ v
[k
].x
-k
<= mid
)
257 /* OK! We have arrived.
258 * We crossed the midpoint on diagonal v[k].md
263 /* The snake could start earlier than the
264 * midline. We cannot just search backwards
265 * as that might find the wrong path - the
266 * greediness of the diff algorithm is
268 * We could record the start of the snake in
269 * 'v', but we will find the actual snake when
270 * we recurse so there is no need.
278 /* Find the end of the snake using the same
279 * greedy approach as when we first found the
282 while (x
< ahi
&& y
< bhi
&&
283 match(&a
->list
[x
], &b
->list
[y
])
295 /* No success with previous cost, so increment cost (c) by 1
296 * and for each other diagonal, set from the end point of the
297 * diagonal on one side of it or the other.
299 for (k
= klo
+1; k
<= khi
-1 ; k
+= 2) {
300 if (v
[k
-1].x
+1 > ahi
) {
301 /* cannot step to the right from previous
302 * diagonal as there is no room.
303 * So step up from next diagonal.
306 } else if (v
[k
+1].x
- k
> bhi
|| v
[k
-1].x
+1 >= v
[k
+1].x
) {
307 /* Cannot step up from next diagonal as either
308 * there is no room, or doing so wouldn't get us
309 * as close to the endpoint.
310 * So step to the right.
315 /* There is room in both directions, but
316 * stepping up from the next diagonal gets us
323 /* Now we need to either extend or contract klo and khi
324 * so they both change parity (odd vs even).
325 * If we extend we need to step up (for klo) or to the
326 * right (khi) from the adjacent diagonal. This is
327 * not possible if we have hit the edge of the matrix, and
328 * not sensible if the new point has a best case remaining
329 * cost that is worse than our current worst case remaining
331 * The best-case remaining cost is the absolute difference
332 * between the remaining number of additions and the remaining
333 * number of deletions - and assumes lots of snakes.
335 /* new location if we step up from klo to klo-1*/
336 x
= v
[klo
].x
; y
= x
- (klo
-1);
337 cost
= abs((ahi
-x
)-(bhi
-y
));
339 if (y
<= bhi
&& cost
<= worst
) {
340 /* Looks acceptable - step up. */
344 x
= v
[klo
].x
; y
= x
- (klo
-1);
345 cost
= abs((ahi
-x
)-(bhi
-y
));
346 } while (cost
> worst
);
348 /* new location if we step to the right from khi to khi+1 */
349 x
= v
[khi
].x
+1; y
= x
- (khi
+1);
350 cost
= abs((ahi
-x
)-(bhi
-y
));
352 if (x
<= ahi
&& cost
<= worst
) {
353 /* Looks acceptable - step to the right */
358 x
= v
[khi
].x
+1; y
= x
- (khi
+1);
359 cost
= abs((ahi
-x
)-(bhi
-y
));
360 } while (cost
> worst
);
365 int size
; /* How much is alloced */
366 int len
; /* How much is used */
370 static void csl_add(struct cslb
*buf
, int a
, int b
, int len
)
373 if (len
&& buf
->len
) {
374 csl
= buf
->csl
+ buf
->len
- 1;
375 if (csl
->a
+ csl
->len
== a
&&
376 csl
->b
+ csl
->len
== b
) {
381 if (buf
->size
<= buf
->len
) {
385 buf
->size
+= buf
->size
;
386 buf
->csl
= realloc(buf
->csl
, sizeof(buf
->csl
[0]) * buf
->size
);
388 csl
= buf
->csl
+ buf
->len
;
395 static void lcsl(struct file
*a
, int alo
, int ahi
,
396 struct file
*b
, int blo
, int bhi
,
398 struct v
*v
, int shortcut
)
400 /* lcsl == longest common sub-list.
401 * This calls itself recursively as it finds the midpoint
403 * On first call, 'csl' is NULL and will need to be allocated and
405 * On subsequence calls when 'csl' is not NULL, we add all the
406 * snakes we find to csl, and return a pointer to the next
407 * location where future snakes can be stored.
414 if (ahi
<= alo
|| bhi
<= blo
)
417 if (!find_common(a
, b
,
423 /* There are more snakes to find - keep looking. */
425 /* With depth-first recursion, this adds all the snakes
426 * before 'alo1' to 'csl'
433 /* need to add this common seq, possibly attach
436 csl_add(cslb
, alo1
, blo1
, ahi1
- alo1
);
438 /* Now recurse to add all the snakes after ahi1 to csl */
444 /* If two common sequences are separated by only an add or remove,
445 * and the first sequence ends the same as the middle text,
446 * extend the second and contract the first in the hope that the
447 * first might become empty. This ameliorates against the greediness
448 * of the 'diff' algorithm.
450 * [ foo X ] X [ bar ]
453 * [ foo ] X [ X bar ]
455 * We treat the final zero-length 'csl' as a common sequence which
456 * can be extended so we must make sure to add a new zero-length csl
458 * If this doesn't make the first sequence disappear, and (one of the)
459 * X(s) was a newline, then move back so the newline is at the end
460 * of the first sequence. This encourages common sequences
461 * to be whole-line units where possible.
463 static void fixup(struct file
*a
, struct file
*b
, struct csl
*list
)
465 struct csl
*list1
, *orig
;
472 /* 'list' and 'list1' are adjacent pointers into the csl.
473 * If a match gets deleted, they might not be physically
474 * adjacent any more. Once we get to the end of the list
475 * this will cease to matter - the list will be a bit
484 /* If a single token is either inserted or deleted
485 * immediately after a matching token...
487 if ((list
->a
+list
->len
== list1
->a
&&
488 list
->b
+list
->len
!= list1
->b
&&
489 /* text at b inserted */
490 match(&b
->list
[list
->b
+list
->len
-1],
491 &b
->list
[list1
->b
-1])
494 (list
->b
+list
->len
== list1
->b
&&
495 list
->a
+list
->len
!= list1
->a
&&
496 /* text at a deleted */
497 match(&a
->list
[list
->a
+list
->len
-1],
498 &a
->list
[list1
->a
-1])
501 /* If the last common token is a simple end-of-line
502 * record where it is. For a word-wise diff, this is
503 * any EOL. For a line-wise diff this is a blank line.
504 * If we are looking at a deletion it must be deleting
505 * the eol, so record that deleted eol.
507 if (ends_line(a
->list
[list
->a
+list
->len
-1])
508 && a
->list
[list
->a
+list
->len
-1].len
== 1
511 lasteol
= list1
->a
-1;
513 /* Expand the second match, shrink the first */
519 /* If the first match has become empty, make it
520 * disappear.. (and forget about the eol).
522 if (list
->len
== 0) {
525 /* Deleting just before the last
528 list1
->a
+= list1
->len
;
529 list1
->b
+= list1
->len
;
531 } else if (list
> orig
)
532 /* Deleting in the middle */
535 /* deleting the first entry */
540 /* Nothing interesting here, though if we
541 * shuffled back past an eol, shuffle
542 * forward to line up with that eol.
543 * This causes an eol to bind more strongly
544 * with the preceding line than the following.
547 while (list1
->a
<= lasteol
548 && (list1
->len
> 1 ||
549 (found_end
&& list1
->len
> 0))) {
559 list1
->a
+= list1
->len
;
560 list1
->b
+= list1
->len
;
565 if (list
->len
&& list1
== list
)
570 static int elcmp(const void *v1
, const void *v2
)
572 const struct elmnt
*e1
= v1
;
573 const struct elmnt
*e2
= v2
;
575 if (e1
->hash
!= e2
->hash
) {
576 if (e1
->hash
< e2
->hash
)
580 if (e1
->start
[0] == 0 && e2
->start
[0] == 0)
582 if (e1
->len
!= e2
->len
)
583 return e1
->len
- e2
->len
;
584 return strncmp(e1
->start
, e2
->start
, e1
->len
);
587 #define BPL (sizeof(unsigned long) * 8)
588 static struct file
filter_unique(struct file f
, struct file ref
)
590 /* Use a bloom-filter to record all hashes in 'ref' and
591 * then if there are consequtive entries in 'f' that are
592 * not in 'ref', reduce each such run to 1 entry
598 sorted
.list
= xmalloc(sizeof(sorted
.list
[0]) * ref
.elcnt
);
599 sorted
.elcnt
= ref
.elcnt
;
600 memcpy(sorted
.list
, ref
.list
, sizeof(sorted
.list
[0]) * sorted
.elcnt
);
601 qsort(sorted
.list
, sorted
.elcnt
, sizeof(sorted
.list
[0]),
604 n
.list
= xmalloc(sizeof(n
.list
[0]) * f
.elcnt
);
607 for (fi
= 0; fi
< f
.elcnt
; fi
++) {
608 int lo
= 0, hi
= sorted
.elcnt
;
609 while (lo
+ 1 < hi
) {
610 int mid
= (lo
+ hi
) / 2;
611 if (elcmp(&f
.list
[fi
], &sorted
.list
[mid
]) < 0)
616 if (match(&f
.list
[fi
], &sorted
.list
[lo
]))
621 n
.list
[n
.elcnt
++] = f
.list
[fi
];
627 static void remap(struct csl
*csl
, int which
, struct file from
, struct file to
)
629 /* The a,b pointer in csl points to 'from' we need to remap to 'to'.
630 * 'to' has everything that 'from' has, plus more.
631 * Each list[].start is unique
635 int fi
= which
? csl
->b
: csl
->a
;
636 while (to
.list
[ti
].start
!= from
.list
[fi
].start
) {
652 /* Main entry point - find the common-sub-list of files 'a' and 'b'.
653 * The final element in the list will have 'len' == 0 and will point
654 * beyond the end of the files.
656 struct csl
*diff(struct file a
, struct file b
, int shortest
)
659 struct cslb cslb
= {};
662 /* Remove runs of 2 or more elements in one file that don't
663 * exist in the other file. This often makes the number of
664 * elements more manageable.
666 af
= filter_unique(a
, b
);
667 bf
= filter_unique(b
, a
);
669 v
= xmalloc(sizeof(struct v
)*(af
.elcnt
+bf
.elcnt
+2));
672 lcsl(&af
, 0, af
.elcnt
,
674 &cslb
, v
, !shortest
);
675 csl_add(&cslb
, af
.elcnt
, bf
.elcnt
, 0);
676 free(v
-(bf
.elcnt
+1));
677 remap(cslb
.csl
, 0, af
, a
);
678 remap(cslb
.csl
, 1, bf
, b
);
681 fixup(&a
, &b
, cslb
.csl
);
685 /* Alternate entry point - find the common-sub-list in two
686 * subranges of files.
688 struct csl
*diff_partial(struct file a
, struct file b
,
689 int alo
, int ahi
, int blo
, int bhi
)
692 struct cslb cslb
= {};
693 v
= xmalloc(sizeof(struct v
)*(ahi
-alo
+bhi
-blo
+2));
699 csl_add(&cslb
, ahi
, bhi
, 0);
701 fixup(&a
, &b
, cslb
.csl
);
705 struct csl
*csl_join(struct csl
*c1
, struct csl
*c2
)
714 for (cnt1
= 0; c1
[cnt1
].len
; cnt1
++)
716 for (cnt2
= 0; c2
[cnt2
].len
; cnt2
++)
719 c1
[cnt1
-1].a
+ c1
[cnt1
-1].len
== c2
[0].a
&&
720 c1
[cnt1
-1].b
+ c1
[cnt1
-1].len
== c2
[0].b
) {
721 /* Merge these two */
722 c1
[cnt1
-1].len
+= c2
[0].len
;
726 c1
= realloc(c1
, (cnt1
+cnt2
+1)*sizeof(*c1
));
728 c1
[cnt1
+cnt2
] = c2
[cnt2
+ offset
];
735 /* When rediffing a patch, we *must* make sure the hunk headers
736 * line up. So don't do a full diff, but rather find the hunk
737 * headers and diff the bits between them.
739 struct csl
*diff_patch(struct file a
, struct file b
, int shortest
)
742 struct csl
*csl
= NULL
;
743 if (a
.elcnt
== 0 || b
.elcnt
== 0 ||
744 a
.list
[0].start
[0] != '\0' ||
745 b
.list
[0].start
[0] != '\0')
746 /* this is not a patch */
747 return diff(a
, b
, shortest
);
750 while (ap
< a
.elcnt
&& bp
< b
.elcnt
) {
757 while (ap
< a
.elcnt
&&
758 a
.list
[ap
].start
[0] != '\0');
761 while (bp
< b
.elcnt
&&
762 b
.list
[bp
].start
[0] != '\0');
763 cs
= diff_partial(a
, b
, alo
, ap
, blo
, bp
);
764 csl
= csl_join(csl
, cs
);
771 main(int argc
, char *argv
[])
775 struct elmnt
*lst
= xmalloc(argc
*sizeof(*lst
));
783 while (argv
[arg
] && strcmp(argv
[arg
], "--")) {
785 lst
->start
= argv
[arg
];
786 lst
->len
= strlen(argv
[arg
]);
798 while (argv
[arg
] && strcmp(argv
[arg
], "--")) {
800 lst
->start
= argv
[arg
];
801 lst
->len
= strlen(argv
[arg
]);
809 while (csl
&& csl
->len
) {
811 printf("%d,%d for %d:\n", csl
->a
, csl
->b
, csl
->len
);
812 for (i
= 0; i
< csl
->len
; i
++) {
813 printf(" %.*s (%.*s)\n",
814 a
.list
[csl
->a
+i
].len
, a
.list
[csl
->a
+i
].start
,
815 b
.list
[csl
->b
+i
].len
, b
.list
[csl
->b
+i
].start
);