Browser: add 'v' to open file in an editor.
[wiggle/upstream.git] / bestmatch.c
blob988a211978c4d684ecf666f07986ee9da2aba96f
1 /*
2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
5 * Copyright (C) 2011 Neil Brown <neilb@suse.de>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 * Author: Neil Brown
23 * Email: <neilb@suse.de>
27 * Find the best match for a patch against a file. A patch is a
28 * sequence of chunks each of which is expected to match a particular
29 * locality of the file. So we expect big gaps between where chunks
30 * match, but only small gaps within chunks.
32 * The matching algorithm is similar to that in diff.c, so you should
33 * understand that first. However it takes fewer shortcuts and
34 * analyses cost in a more detailed way.
36 * We walk the whole matrix in a breadth first fashion following a
37 * 'front' on which x+y is constant. Along this front we examine each
38 * diagonal. For each point we calculate a 'value' for the match so
39 * far. This will be in some particular chunk. For each chunk we
40 * separately record the best value found so far, and where it was.
41 * To choose a new value for each point we calculate based on the
42 * previous value on each neighbouring diagonal and on this diagonal.
44 * This can result is a set of 'best' matches for each chunk which are
45 * not in the same order that the chunks initially were. This
46 * probably isn't desired, so we choose a 'best' best match and
47 * recurse on each side of it.
49 * The quality of a match is a somewhat complex function that is
50 * roughly 3 times the number of matching symbols minus the number
51 * of replaced, added, or deleted. This seems to work.
53 * For any point, the best possible score using that point
54 * is a complete diagonal to the nearest edge. We ignore points
55 * which cannot contibute to a better overall score.
57 * As this is a fairly expensive search we remove uninteresting
58 * symbols before searching. Specifically we only keep alphanumeric
59 * (plus '_') strings. Spaces and punctuation is ignored. This should
60 * contain enough information to achieve a reliable match while scanning
61 * many fewer symbols.
64 #include <ctype.h>
65 #include <stdlib.h>
66 #include "wiggle.h"
68 /* This structure keeps track of the current match at each point.
69 * It holds the start of the match as x,k where k is the
70 * diagonal, so y = x-k.
71 * Also the length of the match so far.
72 * If l == 0, there is no match.
74 struct v {
75 int x, y; /* location of start of match */
76 int val; /* value of match from x,y to here */
77 int k; /* diagonal of last match - if val > 0 */
78 int inmatch; /* 1 if last point was a match */
79 int c; /* chunk number */
83 * Here we must determine the 'value' of a partial match.
84 * The input parameters are:
85 * length - the total number of symbols matched
86 * errs - the total number of insertions or deletions
87 * dif - the absolute difference between number of insertions and deletions.
89 * In general we want length to be high, errs to be low, and dif to be low.
90 * Particular questions that must be answered include:
91 * - When does adding an extra symbol after a small gap improve the match
92 * - When does a match become so bad that we would rather start again.
94 * We would like symmetry in our answers so that a good sequence with
95 * an out-rider on one end is evaluated the same as a good sequence
96 * with an out-rider on the other end.
98 * However to do this we cannot really use the value of the good
99 * sequence to weigh in the out-riders favour as in the case of a
100 * leading outrider, we do not yet know the value of the good
101 * sequence.
103 * First, we need an arbitrary number, X, to say "Given a single
104 * symbol, after X errors, we forget that symbol". 5 seems a good
105 * number.
107 * Next we need to understand how replacements compare to insertions
108 * or deletions. Probably a replacement is the same cost as an
109 * insertion or deletion. Finally, a few large stretches are better
110 * then lots of little ones, so the number of disjoint stretches
111 * should be kept low.
113 * So:
114 * The first match sets the value to 6.
115 * Each consecutive match adds 3
116 * A non-consecutive match which value is still +ve adds 2
117 * Each non-match subtracts one unless it is the other half of a replacement.
118 * A value of 0 causes us to forget where we are and start again.
120 * We need to not only assess the value at a particular location, but
121 * also assess the maximum value we could get if all remaining symbols
122 * matched, to help exclude parts of the matrix. The value of that
123 * possibility is 6 times the number of remaining symbols, -1 if we
124 * just had a match.
126 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
127 static inline void update_value(struct v *v, int dir, int k, int x)
129 if (dir == 0) {
130 if (v->val <= 0) {
131 v->x = x-1;
132 v->y = x-k-1;
133 v->inmatch = 0;
134 v->val = 4;
136 v->val += 2+v->inmatch;
137 v->inmatch = 1;
138 v->k = k;
139 } else if (v->val > 0) {
140 v->inmatch = 0;
141 if (dir * (v->k - k) > 0) {
142 /* other half of replacement */
143 } else {
144 v->val -= 1;
149 /* Calculate the best possible value that this 'struct v'
150 * could reach if there are 'max' symbols remaining
151 * that could possibly be matches.
153 static inline int best_val(struct v *v, int max)
155 if (v->val <= 0)
156 return 4+max*3-1;
157 else
158 return max*3-1+v->inmatch+v->val;
161 struct best {
162 int xlo, ylo;
163 int xhi, yhi;
164 int val;
167 static inline int min(int a, int b)
169 return a < b ? a : b;
172 static void find_best(struct file *a, struct file *b,
173 int alo, int ahi,
174 int blo, int bhi, struct best *best)
176 int klo, khi, k;
177 int f;
179 struct v *valloc = xmalloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
180 struct v *v = valloc + (bhi-alo+2);
182 k = klo = khi = alo-blo;
183 f = alo+blo; /* front that moves forward */
184 v[k].val = 0;
185 v[k].c = -1;
187 while (f < ahi+bhi) {
188 int x, y;
190 f++;
191 for (k = klo+1; k <= khi-1 ; k += 2) {
192 struct v vnew, vnew2;
193 x = (k+f)/2;
194 y = x-k;
195 /* first consider the diagonal - if possible
196 * it is always preferred
198 if (match(&a->list[x-1], &b->list[y-1])) {
199 vnew = v[k];
200 update_value(&v[k], 0, k, x);
201 if (v[k].c < 0)
202 abort();
203 if (v[k].val > best[v[k].c].val) {
204 int chunk = v[k].c;
205 best[chunk].xlo = v[k].x;
206 best[chunk].ylo = v[k].y;
207 best[chunk].xhi = x;
208 best[chunk].yhi = y;
209 best[chunk].val = v[k].val;
211 } else {
212 /* First consider a y-step: adding a
213 * symbol from B */
214 vnew = v[k+1];
215 update_value(&vnew, -1, k, x);
216 /* might cross a chunk boundary */
217 if (b->list[y-1].len && b->list[y-1].start[0] == 0) {
218 vnew.c = atoi(b->list[y-1].start+1);
219 vnew.val = 0;
222 /* Not consider an x-step: deleting
223 * a symbol. This cannot be a chunk
224 * boundary as there aren't any in 'A'
226 vnew2 = v[k-1];
227 update_value(&vnew2, 1, k, x);
229 /* Now choose the best. */
230 if (vnew2.val > vnew.val)
231 v[k] = vnew2;
232 else
233 v[k] = vnew;
236 /* extend or contract range */
237 klo--;
238 v[klo] = v[klo+1];
239 x = (klo+f)/2; y = x-klo;
240 update_value(&v[klo], -1, klo, x);
241 if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
242 v[klo].c = atoi(b->list[y-1].start+1);
243 v[klo].val = 0;
245 while (klo+2 < (ahi-bhi) &&
246 (y > bhi ||
247 (best_val(&v[klo], min(ahi-x, bhi-y)) < best[v[klo].c].val &&
248 best_val(&v[klo+1], min(ahi-x, bhi-y+1)) < best[v[klo+1].c].val
250 )) {
251 klo += 2;
252 x = (klo+f)/2; y = x-klo;
255 khi++;
256 v[khi] = v[khi-1];
257 x = (khi+f)/2; y = x - khi;
258 update_value(&v[khi], -1, khi, x);
259 while (khi-2 > (ahi-bhi) &&
260 (x > ahi ||
261 (v[khi].c >= 0 &&
262 best_val(&v[khi], min(ahi-x, bhi-y)) < best[v[khi].c].val &&
263 best_val(&v[khi-1], min(ahi-x+1, bhi-y)) < best[v[khi].c].val
265 )) {
266 khi -= 2;
267 x = (khi+f)/2; y = x - khi;
271 free(valloc);
275 * Reduce a file by discarding less interesting words
276 * Words that end with a newline are interesting (so all words
277 * in line-mode are interesting) and words that start with
278 * and alphanumeric are interesting. This excludes spaces and
279 * special characters in word mode
280 * Doing a best-fit comparision on only interesting words is
281 * much faster than on all words, and is nearly as good
284 static inline int is_skipped(struct elmnt e)
286 return !(ends_line(e) ||
287 isalnum(e.start[0]) ||
288 e.start[0] == '_');
291 static struct file reduce(struct file orig)
293 int cnt = 0;
294 int i;
295 struct file rv;
297 for (i = 0; i < orig.elcnt; i++)
298 if (!is_skipped(orig.list[i]))
299 cnt++;
301 if (cnt == orig.elcnt)
302 return orig;
304 rv.elcnt = cnt;
305 rv.list = xmalloc(cnt*sizeof(struct elmnt));
306 cnt = 0;
307 for (i = 0; i < orig.elcnt; i++)
308 if (!is_skipped(orig.list[i]))
309 rv.list[cnt++] = orig.list[i];
310 return rv;
313 /* Given a list of best matches between a1 and b1 which are
314 * subsets of a2 and b2, convert that list to indexes into a2/b2
316 * When we find the location in a2/b2, we expand to include all
317 * immediately surrounding words which were skipped
319 static void remap(struct best *best, int cnt,
320 struct file a1, struct file b1,
321 struct file a2, struct file b2)
323 int b;
324 int pa, pb; /* pointers into the a2 and b2 arrays */
326 pa = pb = 0;
328 if (a1.elcnt == 0 && a2.elcnt == 0)
329 return;
331 for (b = 1; b < cnt; b++)
332 if (best[b].val > 0) {
333 while (pa < a2.elcnt &&
334 a2.list[pa].start != a1.list[best[b].xlo].start)
335 pa++;
336 if (pa == a2.elcnt)
337 abort();
338 while (pb < b2.elcnt &&
339 b2.list[pb].start != b1.list[best[b].ylo].start)
340 pb++;
341 if (pb == b2.elcnt)
342 abort();
344 /* pa,pb is the start of this best bit. Step
345 * backward over ignored words
347 while (pa > 0 && is_skipped(a2.list[pa-1]))
348 pa--;
349 while (pb > 0 && is_skipped(b2.list[pb-1]))
350 pb--;
352 if (pa <= 0)
353 pa = 1;
354 if (pb <= 0)
355 pb = 1;
357 best[b].xlo = pa;
358 best[b].ylo = pb;
360 while (pa < a2.elcnt &&
361 (pa == 0 || (a2.list[pa-1].start
362 != a1.list[best[b].xhi-1].start)))
363 pa++;
364 if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
365 abort();
366 while (pb < b2.elcnt &&
367 (pb == 0 || (b2.list[pb-1].start
368 != b1.list[best[b].yhi-1].start)))
369 pb++;
370 if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
371 abort();
373 /* pa,pb is now the end of the best bit.
374 * Step pa,pb forward over ignored words.
376 while (pa < a2.elcnt && is_skipped(a2.list[pa]))
377 pa++;
378 while (pb < b2.elcnt && is_skipped(b2.list[pb]))
379 pb++;
380 best[b].xhi = pa;
381 best[b].yhi = pb;
385 static void find_best_inorder(struct file *a, struct file *b,
386 int alo, int ahi, int blo, int bhi,
387 struct best *best, int bestlo, int besthi)
389 /* make sure the best matches we find are inorder.
390 * If they aren't we find a overall best, and
391 * recurse either side of that
393 int i;
394 int bad = 0;
395 int bestval, bestpos = 0;
397 for (i = bestlo; i < besthi; i++)
398 best[i].val = 0;
399 find_best(a, b, alo, ahi, blo, bhi, best);
400 for (i = bestlo + 1; i < besthi; i++)
401 if (best[i-1].val > 0 &&
402 best[i].val > 0 &&
403 best[i-1].xhi >= best[i].xlo)
404 bad = 1;
406 if (!bad)
407 return;
408 bestval = 0;
409 for (i = bestlo; i < besthi; i++)
410 if (best[i].val > bestval) {
411 bestval = best[i].val;
412 bestpos = i;
414 if (bestpos > bestlo) {
415 /* move top down below chunk marker */
416 int y = best[bestpos].ylo;
417 while (b->list[y].start[0])
418 y--;
419 find_best_inorder(a, b,
420 alo, best[bestpos].xlo,
421 blo, y,
422 best, bestlo, bestpos);
424 if (bestpos < besthi-1) {
425 /* move bottom up to chunk marker */
426 int y = best[bestpos].yhi;
427 while (b->list[y].start[0])
428 y++;
429 find_best_inorder(a, b,
430 best[bestpos].xhi, ahi,
431 y, bhi,
432 best, bestpos+1, besthi);
436 struct csl *pdiff(struct file a, struct file b, int chunks)
438 struct csl *csl1, *csl2;
439 struct best *best = xmalloc(sizeof(struct best)*(chunks+1));
440 int i;
441 struct file asmall, bsmall;
442 int xmin;
444 asmall = reduce(a);
445 bsmall = reduce(b);
447 for (i = 0; i < chunks+1; i++)
448 best[i].val = 0;
449 find_best_inorder(&asmall, &bsmall,
450 0, asmall.elcnt, 0, bsmall.elcnt,
451 best, 1, chunks+1);
452 remap(best, chunks+1, asmall, bsmall, a, b);
453 if (asmall.list != a.list)
454 free(asmall.list);
455 if (bsmall.list != b.list)
456 free(bsmall.list);
458 csl1 = NULL;
459 xmin = 0;
460 for (i = 1; i <= chunks; i++)
461 if (best[i].val > 0) {
462 /* If we there are any newlines in the hunk before
463 * ylo, then extend xlo back that many newlines if
464 * possible and diff_partial the extended regions.
466 int lines = 0;
467 int ylo = best[i].ylo;
468 int yhi;
469 while (ylo > 0 && b.list[ylo-1].start[0]) {
470 ylo--;
471 lines += !!ends_line(b.list[ylo]);
473 if (lines) {
474 int xlo = best[i].xlo;
475 while (lines && xlo > xmin) {
476 xlo--;
477 lines -= !!ends_line(a.list[xlo]);
479 while (xlo > xmin && !ends_line(a.list[xlo-1]))
480 xlo--;
481 csl2 = diff_partial(a, b,
482 xlo, best[i].xlo,
483 ylo, best[i].ylo);
484 csl1 = csl_join(csl1, csl2);
487 /* Now diff_partial the good bit of the hunk with the
488 * good match
490 csl2 = diff_partial(a, b,
491 best[i].xlo, best[i].xhi,
492 best[i].ylo, best[i].yhi);
493 csl1 = csl_join(csl1, csl2);
495 /* Now if there are newlines at the end of the
496 * hunk that weren't matched, find as many in
497 * original and diff_partial those bits
499 lines = 0;
500 yhi = best[i].yhi;
501 while (yhi < b.elcnt && b.list[yhi].start[0]) {
502 lines += !!ends_line(b.list[yhi]);
503 yhi++;
505 xmin = best[i].xhi;
506 if (lines) {
507 int xhi = best[i].xhi;
508 int xmax = a.elcnt;
509 if (i < chunks)
510 xmax = best[i+1].xlo;
511 while (lines && xhi < xmax) {
512 lines -= !!ends_line(a.list[xhi]);
513 xhi++;
515 csl2 = diff_partial(a, b,
516 best[i].xhi, xhi,
517 best[i].yhi, yhi);
518 csl1 = csl_join(csl1, csl2);
519 xmin = xhi;
521 } else {
522 /* FIXME we just lost a hunk!! */;
524 if (csl1) {
525 for (csl2 = csl1; csl2->len; csl2++)
527 csl2->a = a.elcnt;
528 csl2->b = b.elcnt;
529 } else {
530 csl1 = xmalloc(sizeof(*csl1));
531 csl1->len = 0;
532 csl1->a = a.elcnt;
533 csl1->b = b.elcnt;
535 free(best);
536 return csl1;