gnu/dist/gettext/gettext-tools/lib/fstrcmp.c

   1 /* Functions to make fuzzy comparisons between strings
   2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2 of the License, or (at
   7    your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful, but
  10    WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18
  19    Derived from GNU diff 2.7, analyze.c et al.
  20
  21    The basic algorithm is described in:
  22    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
  23    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
  24    see especially section 4.2, which describes the variation used below.
  25
  26    The basic algorithm was independently discovered as described in:
  27    "Algorithms for Approximate String Matching", E. Ukkonen,
  28    Information and Control Vol. 64, 1985, pp. 100-118.
  29
  30    Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
  31    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
  32    at the price of producing suboptimal output for large inputs with
  33    many differences.
  34
  35    Modified to work on strings rather than files
  36    by Peter Miller <pmiller@agso.gov.au>, October 1995 */
  37
  38 #ifdef HAVE_CONFIG_H
  39 # include "config.h"
  40 #endif
  41
  42 /* Specification.  */
  43 #include "fstrcmp.h"
  44
  45 #include <string.h>
  46 #include <stdio.h>
  47 #include <limits.h>
  48
  49 #include "xalloc.h"
  50
  51
  52 /*
  53  * Data on one input string being compared.
  54  */
  55 struct string_data
  56 {
  57   /* The string to be compared. */
  58   const char *data;
  59
  60   /* The length of the string to be compared. */
  61   int data_length;
  62
  63   /* The number of characters inserted or deleted. */
  64   int edit_count;
  65 };
  66
  67 static struct string_data string[2];
  68
  69
  70 #ifdef MINUS_H_FLAG
  71
  72 /* This corresponds to the diff -H flag.  With this heuristic, for
  73    strings with a constant small density of changes, the algorithm is
  74    linear in the strings size.  This is unlikely in typical uses of
  75    fstrcmp, and so is usually compiled out.  Besides, there is no
  76    interface to set it true.  */
  77 static int heuristic;
  78
  79 #endif
  80
  81
  82 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
  83    point furthest along the given diagonal in the forward search of the
  84    edit matrix.  */
  85 static int *fdiag;
  86
  87 /* Vector, indexed by diagonal, containing the X coordinate of the point
  88    furthest along the given diagonal in the backward search of the edit
  89    matrix.  */
  90 static int *bdiag;
  91
  92 /* Edit scripts longer than this are too expensive to compute.  */
  93 static int too_expensive;
  94
  95 /* Snakes bigger than this are considered `big'.  */
  96 #define SNAKE_LIMIT     20
  97
  98 struct partition
  99 {
 100   /* Midpoints of this partition.  */
 101   int xmid, ymid;
 102
 103   /* Nonzero if low half will be analyzed minimally.  */
 104   int lo_minimal;
 105
 106   /* Likewise for high half.  */
 107   int hi_minimal;
 108 };
 109
 110
 111 /* NAME
 112         diag - find diagonal path
 113
 114    SYNOPSIS
 115         int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
 116                 struct partition *part);
 117
 118    DESCRIPTION
 119         Find the midpoint of the shortest edit script for a specified
 120         portion of the two strings.
 121
 122         Scan from the beginnings of the strings, and simultaneously from
 123         the ends, doing a breadth-first search through the space of
 124         edit-sequence.  When the two searches meet, we have found the
 125         midpoint of the shortest edit sequence.
 126
 127         If MINIMAL is nonzero, find the minimal edit script regardless
 128         of expense.  Otherwise, if the search is too expensive, use
 129         heuristics to stop the search and report a suboptimal answer.
 130
 131    RETURNS
 132         Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
 133         number XMID - YMID equals the number of inserted characters
 134         minus the number of deleted characters (counting only characters
 135         before the midpoint).  Return the approximate edit cost; this is
 136         the total number of characters inserted or deleted (counting
 137         only characters before the midpoint), unless a heuristic is used
 138         to terminate the search prematurely.
 139
 140         Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
 141         for the left half of the partition is known; similarly for
 142         PART->RIGHT_MINIMAL.
 143
 144    CAVEAT
 145         This function assumes that the first characters of the specified
 146         portions of the two strings do not match, and likewise that the
 147         last characters do not match.  The caller must trim matching
 148         characters from the beginning and end of the portions it is
 149         going to specify.
 150
 151         If we return the "wrong" partitions, the worst this can do is
 152         cause suboptimal diff output.  It cannot cause incorrect diff
 153         output.  */
 154
 155 static int
 156 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
 157       struct partition *part)
 158 {
 159   int *const fd = fdiag;        /* Give the compiler a chance. */
 160   int *const bd = bdiag;        /* Additional help for the compiler. */
 161   const char *const xv = string[0].data;        /* Still more help for the compiler. */
 162   const char *const yv = string[1].data;        /* And more and more . . . */
 163   const int dmin = xoff - ylim; /* Minimum valid diagonal. */
 164   const int dmax = xlim - yoff; /* Maximum valid diagonal. */
 165   const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
 166   const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
 167   int fmin = fmid;
 168   int fmax = fmid;              /* Limits of top-down search. */
 169   int bmin = bmid;
 170   int bmax = bmid;              /* Limits of bottom-up search. */
 171   int c;                        /* Cost. */
 172   int odd = (fmid - bmid) & 1;
 173
 174   /*
 175          * True if southeast corner is on an odd diagonal with respect
 176          * to the northwest.
 177          */
 178   fd[fmid] = xoff;
 179   bd[bmid] = xlim;
 180   for (c = 1;; ++c)
 181     {
 182       int d;                    /* Active diagonal. */
 183       int big_snake;
 184
 185       big_snake = 0;
 186       /* Extend the top-down search by an edit step in each diagonal. */
 187       if (fmin > dmin)
 188         fd[--fmin - 1] = -1;
 189       else
 190         ++fmin;
 191       if (fmax < dmax)
 192         fd[++fmax + 1] = -1;
 193       else
 194         --fmax;
 195       for (d = fmax; d >= fmin; d -= 2)
 196         {
 197           int x;
 198           int y;
 199           int oldx;
 200           int tlo;
 201           int thi;
 202
 203           tlo = fd[d - 1],
 204             thi = fd[d + 1];
 205
 206           if (tlo >= thi)
 207             x = tlo + 1;
 208           else
 209             x = thi;
 210           oldx = x;
 211           y = x - d;
 212           while (x < xlim && y < ylim && xv[x] == yv[y])
 213             {
 214               ++x;
 215               ++y;
 216             }
 217           if (x - oldx > SNAKE_LIMIT)
 218             big_snake = 1;
 219           fd[d] = x;
 220           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
 221             {
 222               part->xmid = x;
 223               part->ymid = y;
 224               part->lo_minimal = part->hi_minimal = 1;
 225               return 2 * c - 1;
 226             }
 227         }
 228       /* Similarly extend the bottom-up search.  */
 229       if (bmin > dmin)
 230         bd[--bmin - 1] = INT_MAX;
 231       else
 232         ++bmin;
 233       if (bmax < dmax)
 234         bd[++bmax + 1] = INT_MAX;
 235       else
 236         --bmax;
 237       for (d = bmax; d >= bmin; d -= 2)
 238         {
 239           int x;
 240           int y;
 241           int oldx;
 242           int tlo;
 243           int thi;
 244
 245           tlo = bd[d - 1],
 246             thi = bd[d + 1];
 247           if (tlo < thi)
 248             x = tlo;
 249           else
 250             x = thi - 1;
 251           oldx = x;
 252           y = x - d;
 253           while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
 254             {
 255               --x;
 256               --y;
 257             }
 258           if (oldx - x > SNAKE_LIMIT)
 259             big_snake = 1;
 260           bd[d] = x;
 261           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
 262             {
 263               part->xmid = x;
 264               part->ymid = y;
 265               part->lo_minimal = part->hi_minimal = 1;
 266               return 2 * c;
 267             }
 268         }
 269
 270       if (minimal)
 271         continue;
 272
 273 #ifdef MINUS_H_FLAG
 274       /* Heuristic: check occasionally for a diagonal that has made lots
 275          of progress compared with the edit distance.  If we have any
 276          such, find the one that has made the most progress and return
 277          it as if it had succeeded.
 278
 279          With this heuristic, for strings with a constant small density
 280          of changes, the algorithm is linear in the strings size.  */
 281       if (c > 200 && big_snake && heuristic)
 282         {
 283           int best;
 284
 285           best = 0;
 286           for (d = fmax; d >= fmin; d -= 2)
 287             {
 288               int dd;
 289               int x;
 290               int y;
 291               int v;
 292
 293               dd = d - fmid;
 294               x = fd[d];
 295               y = x - d;
 296               v = (x - xoff) * 2 - dd;
 297
 298               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 299                 {
 300                   if
 301                     (
 302                       v > best
 303                       &&
 304                       xoff + SNAKE_LIMIT <= x
 305                       &&
 306                       x < xlim
 307                       &&
 308                       yoff + SNAKE_LIMIT <= y
 309                       &&
 310                       y < ylim
 311                     )
 312                     {
 313                       /* We have a good enough best diagonal; now insist
 314                          that it end with a significant snake.  */
 315                       int k;
 316
 317                       for (k = 1; xv[x - k] == yv[y - k]; k++)
 318                         {
 319                           if (k == SNAKE_LIMIT)
 320                             {
 321                               best = v;
 322                               part->xmid = x;
 323                               part->ymid = y;
 324                               break;
 325                             }
 326                         }
 327                     }
 328                 }
 329             }
 330           if (best > 0)
 331             {
 332               part->lo_minimal = 1;
 333               part->hi_minimal = 0;
 334               return 2 * c - 1;
 335             }
 336           best = 0;
 337           for (d = bmax; d >= bmin; d -= 2)
 338             {
 339               int dd;
 340               int x;
 341               int y;
 342               int v;
 343
 344               dd = d - bmid;
 345               x = bd[d];
 346               y = x - d;
 347               v = (xlim - x) * 2 + dd;
 348
 349               if (v > 12 * (c + (dd < 0 ? -dd : dd)))
 350                 {
 351                   if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
 352                       yoff < y && y <= ylim - SNAKE_LIMIT)
 353                     {
 354                       /* We have a good enough best diagonal; now insist
 355                          that it end with a significant snake.  */
 356                       int k;
 357
 358                       for (k = 0; xv[x + k] == yv[y + k]; k++)
 359                         {
 360                           if (k == SNAKE_LIMIT - 1)
 361                             {
 362                               best = v;
 363                               part->xmid = x;
 364                               part->ymid = y;
 365                               break;
 366                             }
 367                         }
 368                     }
 369                 }
 370             }
 371           if (best > 0)
 372             {
 373               part->lo_minimal = 0;
 374               part->hi_minimal = 1;
 375               return 2 * c - 1;
 376             }
 377         }
 378 #endif /* MINUS_H_FLAG */
 379
 380       /* Heuristic: if we've gone well beyond the call of duty, give up
 381          and report halfway between our best results so far.  */
 382       if (c >= too_expensive)
 383         {
 384           int fxybest;
 385           int fxbest;
 386           int bxybest;
 387           int bxbest;
 388
 389           /* Pacify `gcc -Wall'. */
 390           fxbest = 0;
 391           bxbest = 0;
 392
 393           /* Find forward diagonal that maximizes X + Y.  */
 394           fxybest = -1;
 395           for (d = fmax; d >= fmin; d -= 2)
 396             {
 397               int x;
 398               int y;
 399
 400               x = fd[d] < xlim ? fd[d] : xlim;
 401               y = x - d;
 402
 403               if (ylim < y)
 404                 {
 405                   x = ylim + d;
 406                   y = ylim;
 407                 }
 408               if (fxybest < x + y)
 409                 {
 410                   fxybest = x + y;
 411                   fxbest = x;
 412                 }
 413             }
 414           /* Find backward diagonal that minimizes X + Y.  */
 415           bxybest = INT_MAX;
 416           for (d = bmax; d >= bmin; d -= 2)
 417             {
 418               int x;
 419               int y;
 420
 421               x = xoff > bd[d] ? xoff : bd[d];
 422               y = x - d;
 423
 424               if (y < yoff)
 425                 {
 426                   x = yoff + d;
 427                   y = yoff;
 428                 }
 429               if (x + y < bxybest)
 430                 {
 431                   bxybest = x + y;
 432                   bxbest = x;
 433                 }
 434             }
 435           /* Use the better of the two diagonals.  */
 436           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
 437             {
 438               part->xmid = fxbest;
 439               part->ymid = fxybest - fxbest;
 440               part->lo_minimal = 1;
 441               part->hi_minimal = 0;
 442             }
 443           else
 444             {
 445               part->xmid = bxbest;
 446               part->ymid = bxybest - bxbest;
 447               part->lo_minimal = 0;
 448               part->hi_minimal = 1;
 449             }
 450           return 2 * c - 1;
 451         }
 452     }
 453 }
 454
 455
 456 /* NAME
 457         compareseq - find edit sequence
 458
 459    SYNOPSIS
 460         void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
 461
 462    DESCRIPTION
 463         Compare in detail contiguous subsequences of the two strings
 464         which are known, as a whole, to match each other.
 465
 466         The subsequence of string 0 is [XOFF, XLIM) and likewise for
 467         string 1.
 468
 469         Note that XLIM, YLIM are exclusive bounds.  All character
 470         numbers are origin-0.
 471
 472         If MINIMAL is nonzero, find a minimal difference no matter how
 473         expensive it is.  */
 474
 475 static void
 476 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal)
 477 {
 478   const char *const xv = string[0].data;        /* Help the compiler.  */
 479   const char *const yv = string[1].data;
 480
 481   /* Slide down the bottom initial diagonal. */
 482   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
 483     {
 484       ++xoff;
 485       ++yoff;
 486     }
 487
 488   /* Slide up the top initial diagonal. */
 489   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
 490     {
 491       --xlim;
 492       --ylim;
 493     }
 494
 495   /* Handle simple cases. */
 496   if (xoff == xlim)
 497     {
 498       while (yoff < ylim)
 499         {
 500           ++string[1].edit_count;
 501           ++yoff;
 502         }
 503     }
 504   else if (yoff == ylim)
 505     {
 506       while (xoff < xlim)
 507         {
 508           ++string[0].edit_count;
 509           ++xoff;
 510         }
 511     }
 512   else
 513     {
 514       int c;
 515       struct partition part;
 516
 517       /* Find a point of correspondence in the middle of the strings.  */
 518       c = diag (xoff, xlim, yoff, ylim, minimal, &part);
 519       if (c == 1)
 520         {
 521 #if 0
 522           /* This should be impossible, because it implies that one of
 523              the two subsequences is empty, and that case was handled
 524              above without calling `diag'.  Let's verify that this is
 525              true.  */
 526           abort ();
 527 #else
 528           /* The two subsequences differ by a single insert or delete;
 529              record it and we are done.  */
 530           if (part.xmid - part.ymid < xoff - yoff)
 531             ++string[1].edit_count;
 532           else
 533             ++string[0].edit_count;
 534 #endif
 535         }
 536       else
 537         {
 538           /* Use the partitions to split this problem into subproblems.  */
 539           compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
 540           compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
 541         }
 542     }
 543 }
 544
 545
 546 /* NAME
 547         fstrcmp - fuzzy string compare
 548
 549    SYNOPSIS
 550         double fstrcmp(const char *, const char *);
 551
 552    DESCRIPTION
 553         The fstrcmp function may be used to compare two string for
 554         similarity.  It is very useful in reducing "cascade" or
 555         "secondary" errors in compilers or other situations where
 556         symbol tables occur.
 557
 558    RETURNS
 559         double; 0 if the strings are entirly dissimilar, 1 if the
 560         strings are identical, and a number in between if they are
 561         similar.  */
 562
 563 double
 564 fstrcmp (const char *string1, const char *string2)
 565 {
 566   int i;
 567
 568   size_t fdiag_len;
 569   static int *fdiag_buf;
 570   static size_t fdiag_max;
 571
 572   /* set the info for each string.  */
 573   string[0].data = string1;
 574   string[0].data_length = strlen (string1);
 575   string[1].data = string2;
 576   string[1].data_length = strlen (string2);
 577
 578   /* short-circuit obvious comparisons */
 579   if (string[0].data_length == 0 && string[1].data_length == 0)
 580     return 1.0;
 581   if (string[0].data_length == 0 || string[1].data_length == 0)
 582     return 0.0;
 583
 584   /* Set TOO_EXPENSIVE to be approximate square root of input size,
 585      bounded below by 256.  */
 586   too_expensive = 1;
 587   for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
 588     too_expensive <<= 1;
 589   if (too_expensive < 256)
 590     too_expensive = 256;
 591
 592   /* Because fstrcmp is typically called multiple times, while scanning
 593      symbol tables, etc, attempt to minimize the number of memory
 594      allocations performed.  Thus, we use a static buffer for the
 595      diagonal vectors, and never free them.  */
 596   fdiag_len = string[0].data_length + string[1].data_length + 3;
 597   if (fdiag_len > fdiag_max)
 598     {
 599       fdiag_max = fdiag_len;
 600       fdiag_buf = xrealloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
 601     }
 602   fdiag = fdiag_buf + string[1].data_length + 1;
 603   bdiag = fdiag + fdiag_len;
 604
 605   /* Now do the main comparison algorithm */
 606   string[0].edit_count = 0;
 607   string[1].edit_count = 0;
 608   compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
 609
 610   /* The result is
 611         ((number of chars in common) / (average length of the strings)).
 612      This is admittedly biased towards finding that the strings are
 613      similar, however it does produce meaningful results.  */
 614   return ((double) (string[0].data_length + string[1].data_length
 615                     - string[1].edit_count - string[0].edit_count)
 616           / (string[0].data_length + string[1].data_length));
 617 }