2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
5 * Copyright (C) 2011 Neil Brown <neilb@suse.de>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 * Email: <neilb@suse.de>
27 * Find the best match for a patch against a file. A patch is a
28 * sequence of chunks each of which is expected to match a particular
29 * locality of the file. So we expect big gaps between where chunks
30 * match, but only small gaps within chunks.
32 * The matching algorithm is similar to that in diff.c, so you should
33 * understand that first. However it takes fewer shortcuts and
34 * analyses cost in a more detailed way.
36 * We walk the whole matrix in a breadth first fashion following a
37 * 'front' on which x+y is constant. Along this front we examine each
38 * diagonal. For each point we calculate a 'value' for the match so
39 * far. This will be in some particular chunk. For each chunk we
40 * separately record the best value found so far, and where it was.
41 * To choose a new value for each point we calculate based on the
42 * previous value on each neighbouring diagonal and on this diagonal.
44 * This can result is a set of 'best' matches for each chunk which are
45 * not in the same order that the chunks initially were. This
46 * probably isn't desired, so we choose a 'best' best match and
47 * recurse on each side of it.
49 * The quality of a match is a somewhat complex function that is
50 * roughly 3 times the number of matching symbols minus the number
51 * of replaced, added, or deleted. This seems to work.
53 * For any point, the best possible score using that point
54 * is a complete diagonal to the nearest edge. We ignore points
55 * which cannot contibute to a better overall score.
57 * As this is a fairly expensive search we remove uninteresting
58 * symbols before searching. Specifically we only keep alphanumeric
59 * (plus '_') strings. Spaces and punctuation is ignored. This should
60 * contain enough information to achieve a reliable match while scanning
68 /* This structure keeps track of the current match at each point.
69 * It holds the start of the match as x,k where k is the
70 * diagonal, so y = x-k.
71 * Also the length of the match so far.
72 * If l == 0, there is no match.
75 int x
, y
; /* location of start of match */
76 int val
; /* value of match from x,y to here */
77 int k
; /* diagonal of last match - if val > 0 */
78 int inmatch
; /* 1 if last point was a match */
79 int c
; /* chunk number */
83 * Here we must determine the 'value' of a partial match.
84 * The input parameters are:
85 * length - the total number of symbols matched
86 * errs - the total number of insertions or deletions
87 * dif - the absolute difference between number of insertions and deletions.
89 * In general we want length to be high, errs to be low, and dif to be low.
90 * Particular questions that must be answered include:
91 * - When does adding an extra symbol after a small gap improve the match
92 * - When does a match become so bad that we would rather start again.
94 * We would like symmetry in our answers so that a good sequence with
95 * an out-rider on one end is evaluated the same as a good sequence
96 * with an out-rider on the other end.
98 * However to do this we cannot really use the value of the good
99 * sequence to weigh in the out-riders favour as in the case of a
100 * leading outrider, we do not yet know the value of the good
103 * First, we need an arbitrary number, X, to say "Given a single
104 * symbol, after X errors, we forget that symbol". 5 seems a good
107 * Next we need to understand how replacements compare to insertions
108 * or deletions. Probably a replacement is the same cost as an
109 * insertion or deletion. Finally, a few large stretches are better
110 * then lots of little ones, so the number of disjoint stretches
111 * should be kept low.
114 * The first match sets the value to 6.
115 * Each consecutive match adds 3
116 * A non-consecutive match which value is still +ve adds 2
117 * Each non-match subtracts one unless it is the other half of a replacement.
118 * A value of 0 causes us to forget where we are and start again.
120 * We need to not only assess the value at a particular location, but
121 * also assess the maximum value we could get if all remaining symbols
122 * matched, to help exclude parts of the matrix. The value of that
123 * possibility is 6 times the number of remaining symbols, -1 if we
126 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
127 static inline void update_value(struct v
*v
, int dir
, int k
, int x
)
136 v
->val
+= 2+v
->inmatch
;
139 } else if (v
->val
> 0) {
141 if (dir
* (v
->k
- k
) > 0) {
142 /* other half of replacement */
149 /* Calculate the best possible value that this 'struct v'
150 * could reach if there are 'max' symbols remaining
151 * that could possibly be matches.
153 static inline int best_val(struct v
*v
, int max
)
158 return max
*3-1+v
->inmatch
+v
->val
;
167 static inline int min(int a
, int b
)
169 return a
< b
? a
: b
;
172 static void find_best(struct file
*a
, struct file
*b
,
174 int blo
, int bhi
, struct best
*best
)
179 struct v
*valloc
= xmalloc(sizeof(struct v
)*((ahi
-alo
)+(bhi
-blo
)+5));
180 struct v
*v
= valloc
+ (bhi
-alo
+2);
182 k
= klo
= khi
= alo
-blo
;
183 f
= alo
+blo
; /* front that moves forward */
187 while (f
< ahi
+bhi
) {
191 for (k
= klo
+1; k
<= khi
-1 ; k
+= 2) {
192 struct v vnew
, vnew2
;
195 /* first consider the diagonal - if possible
196 * it is always preferred
198 if (match(&a
->list
[x
-1], &b
->list
[y
-1])) {
200 update_value(&v
[k
], 0, k
, x
);
203 if (v
[k
].val
> best
[v
[k
].c
].val
) {
205 best
[chunk
].xlo
= v
[k
].x
;
206 best
[chunk
].ylo
= v
[k
].y
;
209 best
[chunk
].val
= v
[k
].val
;
212 /* First consider a y-step: adding a
215 update_value(&vnew
, -1, k
, x
);
216 /* might cross a chunk boundary */
217 if (b
->list
[y
-1].len
&& b
->list
[y
-1].start
[0] == 0) {
218 vnew
.c
= atoi(b
->list
[y
-1].start
+1);
222 /* Not consider an x-step: deleting
223 * a symbol. This cannot be a chunk
224 * boundary as there aren't any in 'A'
227 update_value(&vnew2
, 1, k
, x
);
229 /* Now choose the best. */
230 if (vnew2
.val
> vnew
.val
)
236 /* extend or contract range */
239 x
= (klo
+f
)/2; y
= x
-klo
;
240 update_value(&v
[klo
], -1, klo
, x
);
241 if (y
<= bhi
&& b
->list
[y
-1].len
&& b
->list
[y
-1].start
[0] == 0) {
242 v
[klo
].c
= atoi(b
->list
[y
-1].start
+1);
245 while (klo
+2 < (ahi
-bhi
) &&
247 (best_val(&v
[klo
], min(ahi
-x
, bhi
-y
)) < best
[v
[klo
].c
].val
&&
248 best_val(&v
[klo
+1], min(ahi
-x
, bhi
-y
+1)) < best
[v
[klo
+1].c
].val
252 x
= (klo
+f
)/2; y
= x
-klo
;
257 x
= (khi
+f
)/2; y
= x
- khi
;
258 update_value(&v
[khi
], -1, khi
, x
);
259 while (khi
-2 > (ahi
-bhi
) &&
262 best_val(&v
[khi
], min(ahi
-x
, bhi
-y
)) < best
[v
[khi
].c
].val
&&
263 best_val(&v
[khi
-1], min(ahi
-x
+1, bhi
-y
)) < best
[v
[khi
].c
].val
267 x
= (khi
+f
)/2; y
= x
- khi
;
275 * Reduce a file by discarding less interesting words
276 * Words that end with a newline are interesting (so all words
277 * in line-mode are interesting) and words that start with
278 * and alphanumeric are interesting. This excludes spaces and
279 * special characters in word mode
280 * Doing a best-fit comparision on only interesting words is
281 * much faster than on all words, and is nearly as good
284 static inline int is_skipped(struct elmnt e
)
286 return !(ends_line(e
) ||
287 isalnum(e
.start
[0]) ||
291 static struct file
reduce(struct file orig
)
297 for (i
= 0; i
< orig
.elcnt
; i
++)
298 if (!is_skipped(orig
.list
[i
]))
301 if (cnt
== orig
.elcnt
)
305 rv
.list
= xmalloc(cnt
*sizeof(struct elmnt
));
307 for (i
= 0; i
< orig
.elcnt
; i
++)
308 if (!is_skipped(orig
.list
[i
]))
309 rv
.list
[cnt
++] = orig
.list
[i
];
313 /* Given a list of best matches between a1 and b1 which are
314 * subsets of a2 and b2, convert that list to indexes into a2/b2
316 * When we find the location in a2/b2, we expand to include all
317 * immediately surrounding words which were skipped
319 static void remap(struct best
*best
, int cnt
,
320 struct file a1
, struct file b1
,
321 struct file a2
, struct file b2
)
324 int pa
, pb
; /* pointers into the a2 and b2 arrays */
328 if (a1
.elcnt
== 0 && a2
.elcnt
== 0)
331 for (b
= 1; b
< cnt
; b
++)
332 if (best
[b
].val
> 0) {
333 while (pa
< a2
.elcnt
&&
334 a2
.list
[pa
].start
!= a1
.list
[best
[b
].xlo
].start
)
338 while (pb
< b2
.elcnt
&&
339 b2
.list
[pb
].start
!= b1
.list
[best
[b
].ylo
].start
)
344 /* pa,pb is the start of this best bit. Step
345 * backward over ignored words
347 while (pa
> 0 && is_skipped(a2
.list
[pa
-1]))
349 while (pb
> 0 && is_skipped(b2
.list
[pb
-1]))
360 while (pa
< a2
.elcnt
&&
361 (pa
== 0 || (a2
.list
[pa
-1].start
362 != a1
.list
[best
[b
].xhi
-1].start
)))
364 if (pa
== a2
.elcnt
&& best
[b
].xhi
!= a1
.elcnt
)
366 while (pb
< b2
.elcnt
&&
367 (pb
== 0 || (b2
.list
[pb
-1].start
368 != b1
.list
[best
[b
].yhi
-1].start
)))
370 if (pb
== b2
.elcnt
&& best
[b
].yhi
!= b1
.elcnt
)
373 /* pa,pb is now the end of the best bit.
374 * Step pa,pb forward over ignored words.
376 while (pa
< a2
.elcnt
&& is_skipped(a2
.list
[pa
]))
378 while (pb
< b2
.elcnt
&& is_skipped(b2
.list
[pb
]))
385 static void find_best_inorder(struct file
*a
, struct file
*b
,
386 int alo
, int ahi
, int blo
, int bhi
,
387 struct best
*best
, int bestlo
, int besthi
)
389 /* make sure the best matches we find are inorder.
390 * If they aren't we find a overall best, and
391 * recurse either side of that
395 int bestval
, bestpos
= 0;
397 for (i
= bestlo
; i
< besthi
; i
++)
399 find_best(a
, b
, alo
, ahi
, blo
, bhi
, best
);
400 for (i
= bestlo
+ 1; i
< besthi
; i
++)
401 if (best
[i
-1].val
> 0 &&
403 best
[i
-1].xhi
>= best
[i
].xlo
)
409 for (i
= bestlo
; i
< besthi
; i
++)
410 if (best
[i
].val
> bestval
) {
411 bestval
= best
[i
].val
;
414 if (bestpos
> bestlo
) {
415 /* move top down below chunk marker */
416 int y
= best
[bestpos
].ylo
;
417 while (b
->list
[y
].start
[0])
419 find_best_inorder(a
, b
,
420 alo
, best
[bestpos
].xlo
,
422 best
, bestlo
, bestpos
);
424 if (bestpos
< besthi
-1) {
425 /* move bottom up to chunk marker */
426 int y
= best
[bestpos
].yhi
;
427 while (b
->list
[y
].start
[0])
429 find_best_inorder(a
, b
,
430 best
[bestpos
].xhi
, ahi
,
432 best
, bestpos
+1, besthi
);
436 struct csl
*pdiff(struct file a
, struct file b
, int chunks
)
438 struct csl
*csl1
, *csl2
;
439 struct best
*best
= xmalloc(sizeof(struct best
)*(chunks
+1));
441 struct file asmall
, bsmall
;
447 for (i
= 0; i
< chunks
+1; i
++)
449 find_best_inorder(&asmall
, &bsmall
,
450 0, asmall
.elcnt
, 0, bsmall
.elcnt
,
452 remap(best
, chunks
+1, asmall
, bsmall
, a
, b
);
453 if (asmall
.list
!= a
.list
)
455 if (bsmall
.list
!= b
.list
)
460 for (i
= 1; i
<= chunks
; i
++)
461 if (best
[i
].val
> 0) {
462 /* If we there are any newlines in the hunk before
463 * ylo, then extend xlo back that many newlines if
464 * possible and diff_partial the extended regions.
467 int ylo
= best
[i
].ylo
;
469 while (ylo
> 0 && b
.list
[ylo
-1].start
[0]) {
471 lines
+= !!ends_line(b
.list
[ylo
]);
474 int xlo
= best
[i
].xlo
;
475 while (lines
&& xlo
> xmin
) {
477 lines
-= !!ends_line(a
.list
[xlo
]);
479 while (xlo
> xmin
&& !ends_line(a
.list
[xlo
-1]))
481 csl2
= diff_partial(a
, b
,
484 csl1
= csl_join(csl1
, csl2
);
487 /* Now diff_partial the good bit of the hunk with the
490 csl2
= diff_partial(a
, b
,
491 best
[i
].xlo
, best
[i
].xhi
,
492 best
[i
].ylo
, best
[i
].yhi
);
493 csl1
= csl_join(csl1
, csl2
);
495 /* Now if there are newlines at the end of the
496 * hunk that weren't matched, find as many in
497 * original and diff_partial those bits
501 while (yhi
< b
.elcnt
&& b
.list
[yhi
].start
[0]) {
502 lines
+= !!ends_line(b
.list
[yhi
]);
507 int xhi
= best
[i
].xhi
;
510 xmax
= best
[i
+1].xlo
;
511 while (lines
&& xhi
< xmax
) {
512 lines
-= !!ends_line(a
.list
[xhi
]);
515 csl2
= diff_partial(a
, b
,
518 csl1
= csl_join(csl1
, csl2
);
522 /* FIXME we just lost a hunk!! */;
525 for (csl2
= csl1
; csl2
->len
; csl2
++)
530 csl1
= xmalloc(sizeof(*csl1
));