Add heuristic to take shortcut when too slow.
[wiggle/upstream.git] / bestmatch.c
blobeb3559284f9609be6057248f2e31911a33c30a3d
1 /*
2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
5 * Copyright (C) 2011-2013 Neil Brown <neilb@suse.de>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program.
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
26 * Find the best match for a patch against a file. A patch is a
27 * sequence of chunks each of which is expected to match a particular
28 * locality of the file. So we expect big gaps between where chunks
29 * match, but only small gaps within chunks.
31 * The matching algorithm is similar to that in diff.c, so you should
32 * understand that first. However it takes fewer shortcuts and
33 * analyses cost in a more detailed way.
35 * We walk the whole matrix in a breadth first fashion following a
36 * 'front' on which x+y is constant. Along this front we examine each
37 * diagonal. For each point we calculate a 'value' for the match so
38 * far. This will be in some particular chunk. For each chunk we
39 * separately record the best value found so far, and where it was.
40 * To choose a new value for each point we calculate based on the
41 * previous value on each neighbouring diagonal and on this diagonal.
43 * This can result is a set of 'best' matches for each chunk which are
44 * not in the same order that the chunks initially were. This
45 * probably isn't desired, so we choose a 'best' best match and
46 * recurse on each side of it.
48 * The quality of a match is a somewhat complex function that is
49 * roughly 3 times the number of matching symbols minus the number
50 * of replaced, added, or deleted. This seems to work.
52 * For any point, the best possible score using that point
53 * is a complete diagonal to the nearest edge. We ignore points
54 * which cannot contibute to a better overall score.
56 * As this is a fairly expensive search we remove uninteresting
57 * symbols before searching. Specifically we only keep alphanumeric
58 * (plus '_') strings. Spaces and punctuation is ignored. This should
59 * contain enough information to achieve a reliable match while scanning
60 * many fewer symbols.
63 #include <ctype.h>
64 #include <stdlib.h>
65 #include "wiggle.h"
67 /* This structure keeps track of the current match at each point.
68 * It holds the start of the match as x,k where k is the
69 * diagonal, so y = x-k.
70 * Also the length of the match so far.
71 * If l == 0, there is no match.
73 struct v {
74 int x, y; /* location of start of match */
75 int val; /* value of match from x,y to here */
76 int k; /* diagonal of last match - if val > 0 */
77 int inmatch; /* 1 if last point was a match */
78 int c; /* chunk number */
82 * Here we must determine the 'value' of a partial match.
83 * The input parameters are:
84 * length - the total number of symbols matched
85 * errs - the total number of insertions or deletions
86 * dif - the absolute difference between number of insertions and deletions.
88 * In general we want length to be high, errs to be low, and dif to be low.
89 * Particular questions that must be answered include:
90 * - When does adding an extra symbol after a small gap improve the match
91 * - When does a match become so bad that we would rather start again.
93 * We would like symmetry in our answers so that a good sequence with
94 * an out-rider on one end is evaluated the same as a good sequence
95 * with an out-rider on the other end.
97 * However to do this we cannot really use the value of the good
98 * sequence to weigh in the out-riders favour as in the case of a
99 * leading outrider, we do not yet know the value of the good
100 * sequence.
102 * First, we need an arbitrary number, X, to say "Given a single
103 * symbol, after X errors, we forget that symbol". 5 seems a good
104 * number.
106 * Next we need to understand how replacements compare to insertions
107 * or deletions. Probably a replacement is the same cost as an
108 * insertion or deletion. Finally, a few large stretches are better
109 * then lots of little ones, so the number of disjoint stretches
110 * should be kept low.
112 * So:
113 * The first match sets the value to 6.
114 * Each consecutive match adds 3
115 * A non-consecutive match which value is still +ve adds 2
116 * Each non-match subtracts one unless it is the other half of a replacement.
117 * A value of 0 causes us to forget where we are and start again.
119 * We need to not only assess the value at a particular location, but
120 * also assess the maximum value we could get if all remaining symbols
121 * matched, to help exclude parts of the matrix. The value of that
122 * possibility is 6 times the number of remaining symbols, -1 if we
123 * just had a match.
125 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
126 static inline void update_value(struct v *v, int dir, int k, int x)
128 if (dir == 0) {
129 if (v->val <= 0) {
130 v->x = x-1;
131 v->y = x-k-1;
132 v->inmatch = 0;
133 v->val = 4;
135 v->val += 2+v->inmatch;
136 v->inmatch = 1;
137 v->k = k;
138 } else if (v->val > 0) {
139 v->inmatch = 0;
140 if (dir * (v->k - k) > 0) {
141 /* other half of replacement */
142 } else {
143 v->val -= 1;
148 /* Calculate the best possible value that this 'struct v'
149 * could reach if there are 'max' symbols remaining
150 * that could possibly be matches.
152 static inline int best_val(struct v *v, int max)
154 if (v->val <= 0)
155 return 4+max*3-1;
156 else
157 return max*3-1+v->inmatch+v->val;
160 struct best {
161 int xlo, ylo;
162 int xhi, yhi;
163 int val;
166 static inline int min(int a, int b)
168 return a < b ? a : b;
171 static void find_best(struct file *a, struct file *b,
172 int alo, int ahi,
173 int blo, int bhi, struct best *best)
175 int klo, khi, k;
176 int f;
178 struct v *valloc = xmalloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
179 struct v *v = valloc + (bhi-alo+2);
181 k = klo = khi = alo-blo;
182 f = alo+blo; /* front that moves forward */
183 v[k].val = 0;
184 v[k].c = -1;
186 while (f < ahi+bhi) {
187 int x, y;
189 f++;
190 for (k = klo+1; k <= khi-1 ; k += 2) {
191 struct v vnew, vnew2;
192 x = (k+f)/2;
193 y = x-k;
194 /* first consider the diagonal - if possible
195 * it is always preferred
197 if (match(&a->list[x-1], &b->list[y-1])) {
198 vnew = v[k];
199 update_value(&v[k], 0, k, x);
200 if (v[k].c < 0)
201 abort();
202 if (v[k].val > best[v[k].c].val) {
203 int chunk = v[k].c;
204 best[chunk].xlo = v[k].x;
205 best[chunk].ylo = v[k].y;
206 best[chunk].xhi = x;
207 best[chunk].yhi = y;
208 best[chunk].val = v[k].val;
210 } else {
211 /* First consider a y-step: adding a
212 * symbol from B */
213 vnew = v[k+1];
214 update_value(&vnew, -1, k, x);
215 /* might cross a chunk boundary */
216 if (b->list[y-1].len && b->list[y-1].start[0] == 0) {
217 vnew.c = atoi(b->list[y-1].start+1);
218 vnew.val = 0;
221 /* Not consider an x-step: deleting
222 * a symbol. This cannot be a chunk
223 * boundary as there aren't any in 'A'
225 vnew2 = v[k-1];
226 update_value(&vnew2, 1, k, x);
228 /* Now choose the best. */
229 if (vnew2.val > vnew.val)
230 v[k] = vnew2;
231 else
232 v[k] = vnew;
235 /* extend or contract range */
236 klo--;
237 v[klo] = v[klo+1];
238 x = (klo+f)/2; y = x-klo;
239 update_value(&v[klo], -1, klo, x);
240 if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
241 v[klo].c = atoi(b->list[y-1].start+1);
242 v[klo].val = 0;
244 while (klo+2 < (ahi-bhi) &&
245 (y > bhi ||
246 (best_val(&v[klo], min(ahi-x, bhi-y)) < best[v[klo].c].val &&
247 best_val(&v[klo+1], min(ahi-x, bhi-y+1)) < best[v[klo+1].c].val
249 )) {
250 klo += 2;
251 x = (klo+f)/2; y = x-klo;
254 khi++;
255 v[khi] = v[khi-1];
256 x = (khi+f)/2; y = x - khi;
257 update_value(&v[khi], -1, khi, x);
258 while (khi-2 > (ahi-bhi) &&
259 (x > ahi ||
260 (v[khi].c >= 0 &&
261 best_val(&v[khi], min(ahi-x, bhi-y)) < best[v[khi].c].val &&
262 best_val(&v[khi-1], min(ahi-x+1, bhi-y)) < best[v[khi].c].val
264 )) {
265 khi -= 2;
266 x = (khi+f)/2; y = x - khi;
270 free(valloc);
274 * Reduce a file by discarding less interesting words
275 * Words that end with a newline are interesting (so all words
276 * in line-mode are interesting) and words that start with
277 * and alphanumeric are interesting. This excludes spaces and
278 * special characters in word mode
279 * Doing a best-fit comparison on only interesting words is
280 * much faster than on all words, and is nearly as good
283 static inline int is_skipped(struct elmnt e)
285 return !(ends_line(e) ||
286 isalnum(e.start[0]) ||
287 e.start[0] == '_');
290 static struct file reduce(struct file orig)
292 int cnt = 0;
293 int i;
294 struct file rv;
296 for (i = 0; i < orig.elcnt; i++)
297 if (!is_skipped(orig.list[i]))
298 cnt++;
300 if (cnt == orig.elcnt)
301 return orig;
303 rv.elcnt = cnt;
304 rv.list = xmalloc(cnt*sizeof(struct elmnt));
305 cnt = 0;
306 for (i = 0; i < orig.elcnt; i++)
307 if (!is_skipped(orig.list[i]))
308 rv.list[cnt++] = orig.list[i];
309 return rv;
312 /* Given a list of best matches between a1 and b1 which are
313 * subsets of a2 and b2, convert that list to indexes into a2/b2
315 * When we find the location in a2/b2, we expand to include all
316 * immediately surrounding words which were skipped
318 static void remap(struct best *best, int cnt,
319 struct file a1, struct file b1,
320 struct file a2, struct file b2)
322 int b;
323 int pa, pb; /* pointers into the a2 and b2 arrays */
325 pa = pb = 0;
327 if (a1.elcnt == 0 && a2.elcnt == 0)
328 return;
330 for (b = 1; b < cnt; b++)
331 if (best[b].val > 0) {
332 while (pa < a2.elcnt &&
333 a2.list[pa].start != a1.list[best[b].xlo].start)
334 pa++;
335 if (pa == a2.elcnt)
336 abort();
337 while (pb < b2.elcnt &&
338 b2.list[pb].start != b1.list[best[b].ylo].start)
339 pb++;
340 if (pb == b2.elcnt)
341 abort();
343 /* pa,pb is the start of this best bit. Step
344 * backward over ignored words
346 while (pa > 0 && is_skipped(a2.list[pa-1]))
347 pa--;
348 while (pb > 0 && is_skipped(b2.list[pb-1]))
349 pb--;
351 if (pa <= 0)
352 pa = 1;
353 if (pb <= 0)
354 pb = 1;
356 best[b].xlo = pa;
357 best[b].ylo = pb;
359 while (pa < a2.elcnt &&
360 (pa == 0 || (a2.list[pa-1].start
361 != a1.list[best[b].xhi-1].start)))
362 pa++;
363 if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
364 abort();
365 while (pb < b2.elcnt &&
366 (pb == 0 || (b2.list[pb-1].start
367 != b1.list[best[b].yhi-1].start)))
368 pb++;
369 if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
370 abort();
372 /* pa,pb is now the end of the best bit.
373 * Step pa,pb forward over ignored words.
375 while (pa < a2.elcnt && is_skipped(a2.list[pa]))
376 pa++;
377 while (pb < b2.elcnt && is_skipped(b2.list[pb]))
378 pb++;
379 best[b].xhi = pa;
380 best[b].yhi = pb;
384 static void find_best_inorder(struct file *a, struct file *b,
385 int alo, int ahi, int blo, int bhi,
386 struct best *best, int bestlo, int besthi)
388 /* make sure the best matches we find are inorder.
389 * If they aren't we find a overall best, and
390 * recurse either side of that
392 int i;
393 int bad = 0;
394 int bestval, bestpos = 0;
396 for (i = bestlo; i < besthi; i++)
397 best[i].val = 0;
398 find_best(a, b, alo, ahi, blo, bhi, best);
399 for (i = bestlo + 1; i < besthi; i++)
400 if (best[i-1].val > 0 &&
401 best[i].val > 0 &&
402 best[i-1].xhi >= best[i].xlo)
403 bad = 1;
405 if (!bad)
406 return;
407 bestval = 0;
408 for (i = bestlo; i < besthi; i++)
409 if (best[i].val > bestval) {
410 bestval = best[i].val;
411 bestpos = i;
413 if (bestpos > bestlo) {
414 /* move top down below chunk marker */
415 int y = best[bestpos].ylo;
416 while (b->list[y].start[0])
417 y--;
418 find_best_inorder(a, b,
419 alo, best[bestpos].xlo,
420 blo, y,
421 best, bestlo, bestpos);
423 if (bestpos < besthi-1) {
424 /* move bottom up to chunk marker */
425 int y = best[bestpos].yhi;
426 while (b->list[y].start[0])
427 y++;
428 find_best_inorder(a, b,
429 best[bestpos].xhi, ahi,
430 y, bhi,
431 best, bestpos+1, besthi);
435 struct csl *pdiff(struct file a, struct file b, int chunks)
437 struct csl *csl1, *csl2;
438 struct best *best = xmalloc(sizeof(struct best)*(chunks+1));
439 int i;
440 struct file asmall, bsmall;
441 int xmin;
443 asmall = reduce(a);
444 bsmall = reduce(b);
446 for (i = 0; i < chunks+1; i++)
447 best[i].val = 0;
448 find_best_inorder(&asmall, &bsmall,
449 0, asmall.elcnt, 0, bsmall.elcnt,
450 best, 1, chunks+1);
451 remap(best, chunks+1, asmall, bsmall, a, b);
452 if (asmall.list != a.list)
453 free(asmall.list);
454 if (bsmall.list != b.list)
455 free(bsmall.list);
457 csl1 = NULL;
458 xmin = 0;
459 for (i = 1; i <= chunks; i++)
460 if (best[i].val > 0) {
461 /* If we there are any newlines in the hunk before
462 * ylo, then extend xlo back that many newlines if
463 * possible and diff_partial the extended regions.
465 int lines = 0;
466 int ylo = best[i].ylo;
467 int yhi;
468 while (ylo > 0 && b.list[ylo-1].start[0]) {
469 ylo--;
470 lines += !!ends_line(b.list[ylo]);
472 if (lines) {
473 int xlo = best[i].xlo;
474 while (lines && xlo > xmin) {
475 xlo--;
476 lines -= !!ends_line(a.list[xlo]);
478 while (xlo > xmin && !ends_line(a.list[xlo-1]))
479 xlo--;
480 csl2 = diff_partial(a, b,
481 xlo, best[i].xlo,
482 ylo, best[i].ylo);
483 csl1 = csl_join(csl1, csl2);
486 /* Now diff_partial the good bit of the hunk with the
487 * good match
489 csl2 = diff_partial(a, b,
490 best[i].xlo, best[i].xhi,
491 best[i].ylo, best[i].yhi);
492 csl1 = csl_join(csl1, csl2);
494 /* Now if there are newlines at the end of the
495 * hunk that weren't matched, find as many in
496 * original and diff_partial those bits
498 lines = 0;
499 yhi = best[i].yhi;
500 while (yhi < b.elcnt && b.list[yhi].start[0]) {
501 lines += !!ends_line(b.list[yhi]);
502 yhi++;
504 xmin = best[i].xhi;
505 if (lines) {
506 int xhi = best[i].xhi;
507 int xmax = a.elcnt;
508 if (i < chunks)
509 xmax = best[i+1].xlo;
510 while (lines && xhi < xmax) {
511 lines -= !!ends_line(a.list[xhi]);
512 xhi++;
514 csl2 = diff_partial(a, b,
515 best[i].xhi, xhi,
516 best[i].yhi, yhi);
517 csl1 = csl_join(csl1, csl2);
518 xmin = xhi;
520 } else {
521 /* FIXME we just lost a hunk!! */;
523 if (csl1) {
524 for (csl2 = csl1; csl2->len; csl2++)
526 csl2->a = a.elcnt;
527 csl2->b = b.elcnt;
528 } else {
529 csl1 = xmalloc(sizeof(*csl1));
530 csl1->len = 0;
531 csl1->a = a.elcnt;
532 csl1->b = b.elcnt;
534 free(best);
535 return csl1;