bestmatch.c

   1 /*
   2  * wiggle - apply rejected patches
   3  *
   4  * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
   5  * Copyright (C) 2011 Neil Brown <neilb@suse.de>
   6  *
   7  *
   8  *    This program is free software; you can redistribute it and/or modify
   9  *    it under the terms of the GNU General Public License as published by
  10  *    the Free Software Foundation; either version 2 of the License, or
  11  *    (at your option) any later version.
  12  *
  13  *    This program is distributed in the hope that it will be useful,
  14  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  *    GNU General Public License for more details.
  17  *
  18  *    You should have received a copy of the GNU General Public License
  19  *    along with this program; if not, write to the Free Software Foundation, Inc.,
  20  *    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  *    Author: Neil Brown
  23  *    Email: <neilb@suse.de>
  24  */
  25
  26 /*
  27  * Find the best match for a patch against a file.  A patch is a
  28  * sequence of chunks each of which is expected to match a particular
  29  * locality of the file.  So we expect big gaps between where chunks
  30  * match, but only small gaps within chunks.
  31  *
  32  * The matching algorithm is similar to that in diff.c, so you should
  33  * understand that first.  However it takes fewer shortcuts and
  34  * analyses cost in a more detailed way.
  35  *
  36  * We walk the whole matrix in a breadth first fashion following a
  37  * 'front' on which x+y is constant.  Along this front we examine each
  38  * diagonal.  For each point we calculate a 'value' for the match so
  39  * far.  This will be in some particular chunk.  For each chunk we
  40  * separately record the best value found so far, and where it was.
  41  * To choose a new value for each point we calculate based on the
  42  * previous value on each neighbouring diagonal and on this diagonal.
  43  *
  44  * This can result is a set of 'best' matches for each chunk which are
  45  * not in the same order that the chunks initially were.  This
  46  * probably isn't desired, so we choose a 'best' best match and
  47  * recurse on each side of it.
  48  *
  49  * The quality of a match is a somewhat complex function that is
  50  * roughly 3 times the number of matching symbols minus the number
  51  * of replaced, added, or deleted.  This seems to work.
  52  *
  53  * For any point, the best possible score using that point
  54  * is a complete diagonal to the nearest edge.  We ignore points
  55  * which cannot contibute to a better overall score.
  56  *
  57  * As this is a fairly expensive search we remove uninteresting
  58  * symbols before searching.  Specifically we only keep alphanumeric
  59  * (plus '_') strings.  Spaces and punctuation is ignored.  This should
  60  * contain enough information to achieve a reliable match while scanning
  61  * many fewer symbols.
  62  */
  63
  64 #include        <ctype.h>
  65 #include        <stdlib.h>
  66 #include        "wiggle.h"
  67
  68 /* This structure keeps track of the current match at each point.
  69  * It holds the start of the match as x,k where k is the
  70  * diagonal, so y = x-k.
  71  * Also the length of the match so far.
  72  * If l == 0, there is no match.
  73  */
  74 struct v {
  75         int x, y;  /* location of start of match */
  76         int val;  /* value of match from x,y to here */
  77         int k;    /* diagonal of last match - if val > 0 */
  78         int inmatch; /* 1 if last point was a match */
  79         int c; /* chunk number */
  80 };
  81
  82 /*
  83  * Here we must determine the 'value' of a partial match.
  84  * The input parameters are:
  85  *   length - the total number of symbols matched
  86  *   errs  - the total number of insertions or deletions
  87  *   dif   - the absolute difference between number of insertions and deletions.
  88  *
  89  * In general we want length to be high, errs to be low, and dif to be low.
  90  * Particular questions that must be answered include:
  91  *  - When does adding an extra symbol after a small gap improve the match
  92  *  - When does a match become so bad that we would rather start again.
  93  *
  94  * We would like symmetry in our answers so that a good sequence with
  95  * an out-rider on one end is evaluated the same as a good sequence
  96  * with an out-rider on the other end.
  97  *
  98  * However to do this we cannot really use the value of the good
  99  * sequence to weigh in the out-riders favour as in the case of a
 100  * leading outrider, we do not yet know the value of the good
 101  * sequence.
 102  *
 103  * First, we need an arbitrary number, X, to say "Given a single
 104  * symbol, after X errors, we forget that symbol".  5 seems a good
 105  * number.
 106  *
 107  * Next we need to understand how replacements compare to insertions
 108  * or deletions.  Probably a replacement is the same cost as an
 109  * insertion or deletion.  Finally, a few large stretches are better
 110  * then lots of little ones, so the number of disjoint stretches
 111  * should be kept low.
 112  *
 113  * So:
 114  *   The first match sets the value to 6.
 115  *   Each consecutive match adds 3
 116  *   A non-consecutive match which value is still +ve adds 2
 117  *   Each non-match subtracts one unless it is the other half of a replacement.
 118  *   A value of 0 causes us to forget where we are and start again.
 119  *
 120  * We need to not only assess the value at a particular location, but
 121  * also assess the maximum value we could get if all remaining symbols
 122  * matched, to help exclude parts of the matrix.  The value of that
 123  * possibility is 6 times the number of remaining symbols, -1 if we
 124  * just had a match.
 125  */
 126 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
 127 static inline void update_value(struct v *v, int dir, int k, int x)
 128 {
 129         if (dir == 0) {
 130                 if (v->val <= 0) {
 131                         v->x = x-1;
 132                         v->y = x-k-1;
 133                         v->inmatch = 0;
 134                         v->val = 4;
 135                 }
 136                 v->val += 2+v->inmatch;
 137                 v->inmatch = 1;
 138                 v->k = k;
 139         } else if (v->val > 0) {
 140                 v->inmatch = 0;
 141                 if (dir * (v->k - k) > 0) {
 142                         /* other half of replacement */
 143                 } else {
 144                         v->val -= 1;
 145                 }
 146         }
 147 }
 148
 149 /* Calculate the best possible value that this 'struct v'
 150  * could reach if there are 'max' symbols remaining
 151  * that could possibly be matches.
 152  */
 153 static inline int best_val(struct v *v, int max)
 154 {
 155         if (v->val <= 0)
 156                 return 4+max*3-1;
 157         else
 158                 return max*3-1+v->inmatch+v->val;
 159 }
 160
 161 struct best {
 162         int xlo, ylo;
 163         int xhi, yhi;
 164         int val;
 165 };
 166
 167 static inline int min(int a, int b)
 168 {
 169         return a < b ? a : b;
 170 }
 171
 172 static void find_best(struct file *a, struct file *b,
 173                       int alo, int ahi,
 174                       int blo, int bhi, struct best *best)
 175 {
 176         int klo, khi, k;
 177         int f;
 178
 179         struct v *valloc = xmalloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
 180         struct v *v = valloc + (bhi-alo+2);
 181
 182         k = klo = khi = alo-blo;
 183         f = alo+blo; /* front that moves forward */
 184         v[k].val = 0;
 185         v[k].c = -1;
 186
 187         while (f < ahi+bhi) {
 188                 int x, y;
 189
 190                 f++;
 191                 for (k = klo+1; k <= khi-1 ; k += 2) {
 192                         struct v vnew, vnew2;
 193                         x = (k+f)/2;
 194                         y = x-k;
 195                         /* first consider the diagonal - if possible
 196                          * it is always preferred
 197                          */
 198                         if (match(&a->list[x-1], &b->list[y-1])) {
 199                                 vnew = v[k];
 200                                 update_value(&v[k], 0, k, x);
 201                                 if (v[k].c < 0)
 202                                         abort();
 203                                 if (v[k].val > best[v[k].c].val) {
 204                                         int chunk = v[k].c;
 205                                         best[chunk].xlo = v[k].x;
 206                                         best[chunk].ylo = v[k].y;
 207                                         best[chunk].xhi = x;
 208                                         best[chunk].yhi = y;
 209                                         best[chunk].val = v[k].val;
 210                                 }
 211                         } else {
 212                                 /* First consider a y-step: adding a
 213                                  * symbol from B */
 214                                 vnew = v[k+1];
 215                                 update_value(&vnew, -1, k, x);
 216                                 /* might cross a chunk boundary */
 217                                 if (b->list[y-1].len && b->list[y-1].start[0] == 0) {
 218                                         vnew.c = atoi(b->list[y-1].start+1);
 219                                         vnew.val = 0;
 220                                 }
 221
 222                                 /* Not consider an x-step: deleting
 223                                  * a symbol.  This cannot be a chunk
 224                                  * boundary as there aren't any in 'A'
 225                                  */
 226                                 vnew2 = v[k-1];
 227                                 update_value(&vnew2, 1, k, x);
 228
 229                                 /* Now choose the best. */
 230                                 if (vnew2.val > vnew.val)
 231                                         v[k] = vnew2;
 232                                 else
 233                                         v[k] = vnew;
 234                         }
 235                 }
 236                 /* extend or contract range */
 237                 klo--;
 238                 v[klo] = v[klo+1];
 239                 x = (klo+f)/2; y = x-klo;
 240                 update_value(&v[klo], -1, klo, x);
 241                 if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
 242                         v[klo].c = atoi(b->list[y-1].start+1);
 243                         v[klo].val = 0;
 244                 }
 245                 while (klo+2 < (ahi-bhi) &&
 246                        (y > bhi ||
 247                         (best_val(&v[klo], min(ahi-x, bhi-y)) < best[v[klo].c].val &&
 248                          best_val(&v[klo+1], min(ahi-x, bhi-y+1)) < best[v[klo+1].c].val
 249                                 )
 250                                )) {
 251                         klo += 2;
 252                         x = (klo+f)/2; y = x-klo;
 253                 }
 254
 255                 khi++;
 256                 v[khi] = v[khi-1];
 257                 x = (khi+f)/2; y = x - khi;
 258                 update_value(&v[khi], -1, khi, x);
 259                 while (khi-2 > (ahi-bhi) &&
 260                        (x > ahi ||
 261                         (v[khi].c >= 0 &&
 262                          best_val(&v[khi], min(ahi-x, bhi-y)) < best[v[khi].c].val &&
 263                          best_val(&v[khi-1], min(ahi-x+1, bhi-y)) < best[v[khi].c].val
 264                                 )
 265                                )) {
 266                         khi -= 2;
 267                         x = (khi+f)/2; y = x - khi;
 268                 }
 269
 270         }
 271         free(valloc);
 272 }
 273
 274 /* Join two csl lists together.
 275  * Simply allocate new space and copy everything in.
 276  */
 277 static struct csl *csl_join(struct csl *c1, struct csl *c2)
 278 {
 279         struct csl *c, *cd,  *rv;
 280         int cnt;
 281
 282         if (c1 == NULL)
 283                 return c2;
 284         if (c2 == NULL)
 285                 return c1;
 286
 287         cnt = 1; /* the sentinal */
 288         for (c = c1; c->len; c++)
 289                 cnt++;
 290         for (c = c2; c->len; c++)
 291                 cnt++;
 292         cd = rv = xmalloc(sizeof(*rv)*cnt);
 293         for (c = c1; c->len; c++)
 294                 *cd++ = *c;
 295         for (c = c2; c->len; c++)
 296                 *cd++ = *c;
 297         cd->len = 0;
 298         free(c1);
 299         free(c2);
 300         return rv;
 301 }
 302
 303 /*
 304  * Reduce a file by discarding less interesting words
 305  * Words that end with a newline are interesting (so all words
 306  * in line-mode are interesting) and words that start with
 307  * and alphanumeric are interesting.  This excludes spaces and
 308  * special characters in word mode
 309  * Doing a best-fit comparision on only interesting words is
 310  * much faster than on all words, and is nearly as good
 311  */
 312
 313 static inline int is_skipped(struct elmnt e)
 314 {
 315         return !(ends_line(e) ||
 316                  isalnum(e.start[0]) ||
 317                  e.start[0] == '_');
 318 }
 319
 320 static struct file reduce(struct file orig)
 321 {
 322         int cnt = 0;
 323         int i;
 324         struct file rv;
 325
 326         for (i = 0; i < orig.elcnt; i++)
 327                 if (!is_skipped(orig.list[i]))
 328                         cnt++;
 329
 330         if (cnt == orig.elcnt)
 331                 return orig;
 332
 333         rv.elcnt = cnt;
 334         rv.list = xmalloc(cnt*sizeof(struct elmnt));
 335         cnt = 0;
 336         for (i = 0; i < orig.elcnt; i++)
 337                 if (!is_skipped(orig.list[i]))
 338                         rv.list[cnt++] = orig.list[i];
 339         return rv;
 340 }
 341
 342 /* Given a list of best matches between a1 and b1 which are
 343  * subsets of a2 and b2, convert that list to indexes into a2/b2
 344  *
 345  * When we find the location in a2/b2, we expand to include all
 346  * immediately surrounding words which were skipped
 347  */
 348 static void remap(struct best *best, int cnt,
 349                   struct file a1, struct file b1,
 350                   struct file a2, struct file b2)
 351 {
 352         int b;
 353         int pa, pb; /* pointers into the a2 and b2 arrays */
 354
 355         pa = pb = 0;
 356
 357         if (a1.elcnt == 0 && a2.elcnt == 0)
 358                 return;
 359
 360         for (b = 1; b < cnt; b++)
 361             if (best[b].val > 0) {
 362                 while (pa < a2.elcnt &&
 363                        a2.list[pa].start != a1.list[best[b].xlo].start)
 364                         pa++;
 365                 if (pa == a2.elcnt)
 366                         abort();
 367                 while (pb < b2.elcnt &&
 368                        b2.list[pb].start != b1.list[best[b].ylo].start)
 369                         pb++;
 370                 if (pb == b2.elcnt)
 371                         abort();
 372
 373                 /* pa,pb is the start of this best bit.  Step
 374                  * backward over ignored words
 375                  */
 376                 while (pa > 0 && is_skipped(a2.list[pa-1]))
 377                         pa--;
 378                 while (pb > 0 && is_skipped(b2.list[pb-1]))
 379                         pb--;
 380
 381                 if (pa <= 0)
 382                         pa = 1;
 383                 if (pb <= 0)
 384                         pb = 1;
 385
 386                 best[b].xlo = pa;
 387                 best[b].ylo = pb;
 388
 389                 while (pa < a2.elcnt &&
 390                        (pa == 0 || (a2.list[pa-1].start
 391                                     != a1.list[best[b].xhi-1].start)))
 392                         pa++;
 393                 if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
 394                         abort();
 395                 while (pb < b2.elcnt &&
 396                        (pb == 0 || (b2.list[pb-1].start
 397                                     != b1.list[best[b].yhi-1].start)))
 398                         pb++;
 399                 if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
 400                         abort();
 401
 402                 /* pa,pb is now the end of the best bit.
 403                  * Step pa,pb forward over ignored words.
 404                  */
 405                 while (pa < a2.elcnt && is_skipped(a2.list[pa]))
 406                         pa++;
 407                 while (pb < b2.elcnt && is_skipped(b2.list[pb]))
 408                         pb++;
 409                 best[b].xhi = pa;
 410                 best[b].yhi = pb;
 411         }
 412 }
 413
 414 static void find_best_inorder(struct file *a, struct file *b,
 415                               int alo, int ahi, int blo, int bhi,
 416                               struct best *best, int bestlo, int besthi)
 417 {
 418         /* make sure the best matches we find are inorder.
 419          * If they aren't we find a overall best, and
 420          * recurse either side of that
 421          */
 422         int i;
 423         int bad = 0;
 424         int bestval, bestpos = 0;
 425
 426         for (i = bestlo; i < besthi; i++)
 427                 best[i].val = 0;
 428         find_best(a, b, alo, ahi, blo, bhi, best);
 429         for (i = bestlo + 1; i < besthi; i++)
 430                 if (best[i-1].val > 0 &&
 431                     best[i].val > 0 &&
 432                     best[i-1].xhi >= best[i].xlo)
 433                         bad = 1;
 434
 435         if (!bad)
 436                 return;
 437         bestval = 0;
 438         for (i = bestlo; i < besthi; i++)
 439                 if (best[i].val > bestval) {
 440                         bestval = best[i].val;
 441                         bestpos = i;
 442                 }
 443         if (bestpos > bestlo) {
 444                 /* move top down below chunk marker */
 445                 int y = best[bestpos].ylo;
 446                 while (b->list[y].start[0])
 447                         y--;
 448                 find_best_inorder(a, b,
 449                                   alo, best[bestpos].xlo,
 450                                   blo, y,
 451                                   best, bestlo, bestpos);
 452         }
 453         if (bestpos < besthi-1) {
 454                 /* move bottom up to chunk marker */
 455                 int y = best[bestpos].yhi;
 456                 while (b->list[y].start[0])
 457                         y++;
 458                 find_best_inorder(a, b,
 459                                   best[bestpos].xhi, ahi,
 460                                   y, bhi,
 461                                   best, bestpos+1, besthi);
 462         }
 463 }
 464
 465 struct csl *pdiff(struct file a, struct file b, int chunks)
 466 {
 467         struct csl *csl1, *csl2;
 468         struct best *best = xmalloc(sizeof(struct best)*(chunks+1));
 469         int i;
 470         struct file asmall, bsmall;
 471
 472         asmall = reduce(a);
 473         bsmall = reduce(b);
 474
 475         for (i = 0; i < chunks+1; i++)
 476                 best[i].val = 0;
 477         find_best_inorder(&asmall, &bsmall,
 478                           0, asmall.elcnt, 0, bsmall.elcnt,
 479                           best, 1, chunks+1);
 480         remap(best, chunks+1, asmall, bsmall, a, b);
 481         if (asmall.list != a.list)
 482                 free(asmall.list);
 483         if (bsmall.list != b.list)
 484                 free(bsmall.list);
 485
 486         csl1 = NULL;
 487         for (i = 1; i <= chunks; i++)
 488                 if (best[i].val > 0) {
 489                         csl2 = diff_partial(a, b,
 490                                             best[i].xlo, best[i].xhi,
 491                                             best[i].ylo, best[i].yhi);
 492                         csl1 = csl_join(csl1, csl2);
 493                 }
 494         if (csl1) {
 495                 for (csl2 = csl1; csl2->len; csl2++)
 496                         ;
 497                 csl2->a = a.elcnt;
 498                 csl2->b = b.elcnt;
 499         } else {
 500                 csl1 = xmalloc(sizeof(*csl1));
 501                 csl1->len = 0;
 502                 csl1->a = a.elcnt;
 503                 csl1->b = b.elcnt;
 504         }
 505         free(best);
 506         return csl1;
 507 }