2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@cse.unsw.edu.au>
24 * School of Computer Science and Engineering
25 * The University of New South Wales
31 * Find the best match for a patch against
33 * The quality of a match is the length of the match minus the
34 * differential between the endpoints.
35 * We progress through the matrix recording the best
36 * match as we find it.
38 * We perform a full diagonal bredth first traversal assessing
39 * the quality of matches at each point.
40 * At each point there are two or three previous points,
41 * up, back or diagonal if there is a match.
42 * We assess the value of the match at each point and choose the
43 * best. No match at all is given a score of -3.
45 * For any point, the best possible score using that point
46 * is a complete diagonal to the nearest edge. We ignore points
47 * which cannot contibute to a better overall score.
51 /* This structure keeps track of the current match at each point.
52 * It holds the start of the match as x,k where k is the
53 * diagonal, so y = x-k.
54 * Also the length of the match so far.
55 * If l == 0, there is no match.
65 int x
,y
; /* location of start of match */
66 int val
; /* value of match from x,y to here */
67 int k
; /* diagonal of last match */
68 int inmatch
; /* 1 if last point was a match */
69 int c
; /* chunk number */
73 * Here must must determine the 'value' of a partial match.
74 * The input parameters are:
75 * length - the total number of symbols matches
76 * errs - the total number of insertions or deletions
77 * dif - the absolute difference between number of insertions and deletions.
79 * In general we want length to be high, errs to be low, and dif to be low.
80 * Particular questions that must be answered include:
81 * - When does adding an extra symbol after a small gap improve the match
82 * - When does a match become so bad that we would rather start again.
84 * We would like symetry in our answers so that a good sequence with an out-rider on
85 * one end is evaluated the same as a good sequence with an out-rider on the other end.
86 * However to do this we cannot really use value of the good sequence to weigh in the
87 * outriders favour as in the case of a leading outrider, we do not yet know the value of
88 * of the good sequence.
89 * First, we need an arbitrary number, X, to say "Given a single symbol, after X errors, we
90 * forget that symbol". 5 seems a good number.
91 * Next we need to understand how replacements compare to insertions or deletions.
92 * Probably a replacement is the same cost as an insertion or deletion.
93 * Finally, a few large stretches are better then lots of little ones, so the number
94 * of disjoint stretches should be kept low.
96 * Each match after the first adds 5 to value.
97 * The first match in a string adds 6.
98 * Each non-match subtracts one unless it is the other half of a replacement.
99 * A value of 0 causes us to forget where we are and start again.
101 * We need to not only assess the value at a particular location, but also
102 * assess the maximum value we could get if all remaining symbols matched, to
103 * help exclude parts of the matrix.
104 * The value of that possibility is 6 times the number of remaining symbols, -1 if we
107 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
108 static inline void update_value(struct v
*v
, int dir
, int k
, int x
)
117 v
->val
+= 2+v
->inmatch
;
122 if (dir
* (v
->k
- k
) > 0) {
123 /* other half of replacement */
129 static inline int best_val(struct v
*v
, int max
)
134 return max
*3-1+v
->inmatch
+v
->val
;
139 #define value(v,kk,xx) (v.l ? (v.l - abs(kk-v.k)): -3)
142 # define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)/2): -3)
144 # define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)*2/v.l): -3)
149 int xlo
,ylo
,xhi
,yhi
,val
;
152 static inline int min(int a
, int b
) {
153 return a
< b
? a
: b
;
156 void find_best(struct file
*a
, struct file
*b
,
158 int blo
, int bhi
, struct best
*best
)
163 struct v
*valloc
= malloc(sizeof(struct v
)*((ahi
-alo
)+(bhi
-blo
)+5));
164 struct v
*v
= valloc
+ (bhi
-alo
+2);
166 k
= klo
= khi
= alo
-blo
;
167 f
= alo
+blo
; /* front that moves forward */
171 while (f
< ahi
+bhi
) {
178 printf("f %d klo %d khi %d\n", f
,klo
,khi
);
180 for (k
=klo
+1; k
<= khi
-1 ; k
+=2) {
181 struct v vnew
, vnew2
;
184 /* first consider the diagonal */
185 if (match(&a
->list
[x
-1], &b
->list
[y
-1])) {
187 update_value(&vnew
, 0, k
, x
);
189 printf("new %d,%d %d,%d (%d) ...",
190 vnew
.x
, vy(vnew
), x
, y
, value(vnew
,k
,x
));
192 if (vnew
.c
< 0) abort();
193 if (vnew
.val
> best
[vnew
.c
].val
) {
195 printf("New best for %d at %d,%d %d,%d, val %d\n",
196 vnew
.c
, vnew
.x
, vnew
.y
,x
,y
,vnew
.val
);
198 best
[vnew
.c
].xlo
= vnew
.x
;
199 best
[vnew
.c
].ylo
= vnew
.y
;
200 best
[vnew
.c
].xhi
= x
;
201 best
[vnew
.c
].yhi
= y
;
202 best
[vnew
.c
].val
= vnew
.val
;
207 update_value(&vnew
, -1, k
,x
);
208 /* might cross a chunk boundary */
209 if (b
->list
[y
-1].len
&& b
->list
[y
-1].start
[0]==0) {
210 vnew
.c
= atoi(b
->list
[y
-1].start
+1);
214 update_value(&vnew2
, 1, k
, x
);
216 if (vnew2
.val
> vnew
.val
)
222 /* extend or contract range */
225 x
= (klo
+f
)/2; y
= x
-klo
;
226 update_value(&v
[klo
],-1,klo
,x
);
227 if (y
<=bhi
&& b
->list
[y
-1].len
&& b
->list
[y
-1].start
[0]==0) {
228 v
[klo
].c
= atoi(b
->list
[y
-1].start
+1);
230 printf("entered %d at %d,%d\n", v
[klo
].c
, x
, y
);
234 while (klo
+2 < (ahi
-bhi
) &&
236 (best_val(&v
[klo
], min(ahi
-x
,bhi
-y
)) < best
[v
[klo
].c
].val
&&
237 best_val(&v
[klo
+1], min(ahi
-x
,bhi
-y
+1)) < best
[v
[klo
+1].c
].val
241 x
= (klo
+f
)/2; y
= x
-klo
;
246 x
= (khi
+f
)/2; y
= x
- khi
;
247 update_value(&v
[khi
],-1,khi
,x
);
248 while(khi
-2 > (ahi
-bhi
) &&
250 (best_val(&v
[khi
], min(ahi
-x
,bhi
-y
)) < best
[v
[khi
].c
].val
&&
251 best_val(&v
[khi
-1], min(ahi
-x
+1,bhi
-y
)) < best
[v
[khi
].c
].val
255 x
= (khi
+f
)/2; y
= x
- khi
;
262 struct csl
*csl_join(struct csl
*c1
, struct csl
*c2
)
264 struct csl
*c
,*cd
, *rv
;
271 cnt
= 1; /* the sentinal */
272 for (c
=c1
; c
->len
; c
++) cnt
++;
273 for (c
=c2
; c
->len
; c
++) cnt
++;
274 cd
= rv
= malloc(sizeof(*rv
)*cnt
);
275 for (c
=c1
; c
->len
; c
++)
277 for (c
=c2
; c
->len
; c
++)
286 static void printword(struct elmnt e
)
289 printf("%.*s", e
.len
, e
.start
);
292 sscanf(e
.start
+1, "%d %d %d", &a
, &b
, &c
);
293 printf("*** %d,%d **** %d\n", b
,c
,a
);
299 * reduce a file by discarding less interesting words
300 * Words that end with a newline are interesting (so all words
301 * in line-mode are interesting) and words that start with
302 * and alphanumeric are interesting. This excludes spaces and
303 * special characters in word mode
304 * Doing a best-fit comparision on only interesting words is
305 * much fast than on all words, and it nearly as good
308 static inline int is_skipped(struct elmnt e
)
310 return !( ends_line(e
) ||
311 isalnum(e
.start
[0]) ||
314 struct file
reduce(struct file orig
)
320 for (i
=0; i
<orig
.elcnt
; i
++)
321 if (!is_skipped(orig
.list
[i
]))
324 if (cnt
== orig
.elcnt
) return orig
;
327 rv
.list
= malloc(cnt
*sizeof(struct elmnt
));
329 for (i
=0; i
<orig
.elcnt
; i
++)
330 if (!is_skipped(orig
.list
[i
]))
331 rv
.list
[cnt
++] = orig
.list
[i
];
335 /* Given a list of best matches between a1 and b1 which are
336 * subsets of a2 and b2, convert that list to indexes into a2/b2
338 * When we find the location in a2/b2, we expand to include all
339 * immediately surrounding words which were skipped
341 void remap(struct best
*best
, int cnt
,
342 struct file a1
, struct file b1
,
343 struct file a2
, struct file b2
)
349 if (a1
.elcnt
== 0 && a2
.elcnt
== 0) return;
351 for (b
=1; b
<cnt
; b
++)
354 printf("best %d,%d %d,%d\n",
355 best
[b
].xlo
,best
[b
].ylo
,
356 best
[b
].xhi
,best
[b
].yhi
);
358 while (pa
<a2
.elcnt
&&
359 a2
.list
[pa
].start
!= a1
.list
[best
[b
].xlo
].start
)
361 if (pa
== a2
.elcnt
) abort();
362 while (pb
<b2
.elcnt
&&
363 b2
.list
[pb
].start
!= b1
.list
[best
[b
].ylo
].start
)
365 if (pb
== b2
.elcnt
) abort();
367 /* pa,pb is the start of this best bit. Step
368 * backward over ignored words
370 while (pa
>0 && is_skipped(a2
.list
[pa
-1]))
372 while (pb
>0 && is_skipped(b2
.list
[pb
-1]))
376 printf("-> %d,%d\n", pa
,pb
);
381 while (pa
<a2
.elcnt
&&
382 a2
.list
[pa
-1].start
!= a1
.list
[best
[b
].xhi
-1].start
)
384 if (pa
== a2
.elcnt
&& best
[b
].xhi
!= a1
.elcnt
) abort();
385 while (pb
<b2
.elcnt
&&
386 b2
.list
[pb
-1].start
!= b1
.list
[best
[b
].yhi
-1].start
)
388 if (pb
== b2
.elcnt
&& best
[b
].yhi
!= b1
.elcnt
) abort();
390 /* now step pa,pb forward over ignored words */
391 while (pa
<a2
.elcnt
&& is_skipped(a2
.list
[pa
]))
393 while (pb
<b2
.elcnt
&& is_skipped(b2
.list
[pb
]))
396 printf("-> %d,%d\n", pa
,pb
);
403 static void find_best_inorder(struct file
*a
, struct file
*b
,
404 int alo
, int ahi
, int blo
, int bhi
,
405 struct best
*best
, int bestlo
, int besthi
)
407 /* make sure the best matches we find are inorder.
408 * If they aren't we find a overall best, and
409 * recurse either side of that
413 int bestval
, bestpos
=0;
414 for (i
=bestlo
; i
<besthi
; i
++) best
[i
].val
= 0;
415 find_best(a
,b
,alo
,ahi
,blo
,bhi
,best
);
416 for (i
=bestlo
+1; i
<besthi
; i
++)
417 if (best
[i
-1].val
> 0 &&
419 best
[i
-1].xhi
>= best
[i
].xlo
)
425 for (i
=bestlo
; i
<besthi
; i
++)
426 if (best
[i
].val
> bestval
) {
427 bestval
= best
[i
].val
;
430 if (bestpos
> bestlo
) {
431 /* move top down below chunk marker */
432 int y
= best
[bestpos
].ylo
;
433 while (b
->list
[y
].start
[0]) y
--;
434 find_best_inorder(a
,b
,
435 alo
, best
[bestpos
].xlo
,
437 best
, bestlo
, bestpos
);
439 if (bestpos
< besthi
-1) {
440 /* move bottom up to chunk marker */
441 int y
= best
[bestpos
].yhi
;
442 while (b
->list
[y
].start
[0]) y
++;
443 find_best_inorder(a
,b
,
444 best
[bestpos
].xhi
, ahi
,
446 best
, bestpos
+1, besthi
);
450 struct csl
*pdiff(struct file a
, struct file b
, int chunks
)
453 struct csl
*csl1
, *csl2
;
454 struct best
*best
= malloc(sizeof(struct best
)*(chunks
+1));
456 struct file asmall
, bsmall
;
464 /* printf("start: %d,%d %d,%d\n", alo,blo,ahi,bhi); */
466 for (i
=0; i
<chunks
+1; i
++)
468 find_best_inorder(&asmall
,&bsmall
,
469 0, asmall
.elcnt
, 0, bsmall
.elcnt
,
472 /* for(i=0; i<b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
473 for (i
=1; i
<=chunks
; i
++) {
474 printf("end: %d,%d %d,%d\n", best
[i
].xlo
,best
[i
].ylo
,best
[i
].xhi
,best
[i
].yhi
);
475 printf("<"); printword(bsmall
.list
[best
[i
].ylo
]); printf("><");
476 printword(bsmall
.list
[best
[i
].yhi
-1]);printf(">\n");
479 remap(best
,chunks
+1,asmall
,bsmall
,a
,b
);
481 /* for(i=0; i<b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
482 for (i
=1; i
<=chunks
; i
++)
483 printf("end: %d,%d %d,%d\n", best
[i
].xlo
,best
[i
].ylo
,best
[i
].xhi
,best
[i
].yhi
);
484 printf("small: a %d b %d --- normal: a %d b %d\n", asmall
.elcnt
, bsmall
.elcnt
, a
.elcnt
, b
.elcnt
);
488 for (i
=1; i
<=chunks
; i
++)
493 for (j
=best
[i
].xlo
; j
<best
[i
].xhi
; j
++)
494 printword(a
.list
[j
]);
496 for (j
=best
[i
].ylo
; j
<best
[i
].yhi
; j
++)
497 printword(b
.list
[j
]);
499 csl2
= diff_partial(a
,b
,
500 best
[i
].xlo
,best
[i
].xhi
,
501 best
[i
].ylo
,best
[i
].yhi
);
502 csl1
= csl_join(csl1
,csl2
);
505 for (csl2
=csl1
; csl2
->len
; csl2
++);
509 csl1
= malloc(sizeof(*csl1
));