changed: update version strings for beta4
[xbmc.git] / xbmc / utils / fstrcmp.cpp
blob16912e50c49ef0696afd1e834caa201390387397
1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988, 1989, 1992, 1993, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 Derived from GNU diff 2.7, analyze.c et al.
21 The basic algorithm is described in:
22 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24 see especially section 4.2, which describes the variation used below.
26 The basic algorithm was independently discovered as described in:
27 "Algorithms for Approximate String Matching", E. Ukkonen,
28 Information and Control Vol. 64, 1985, pp. 100-118.
30 Modified to work on strings rather than files
31 by Peter Miller <pmiller@agso.gov.au>, October 1995
33 Modified to accept a "minimum similarity limit" to stop analyzing the
34 string when the similarity drops below the given limit by Marc Lehmann
35 <pcg@goof.com>.
39 #include <string.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <limits.h>
44 #include "fstrcmp.h"
46 #ifndef __GNUC__
47 #pragma warning( disable : 4244 )
48 #endif
51 * Data on one input string being compared.
53 struct string_data
55 /* The string to be compared. */
56 const char *data;
58 /* The length of the string to be compared. */
59 int data_length;
61 /* The number of characters inserted or deleted. */
62 int edit_count;
65 static struct string_data string[2];
67 static int max_edits; /* compareseq stops when edits > max_edits */
69 #ifdef MINUS_H_FLAG
71 /* This corresponds to the diff -H flag. With this heuristic, for
72 strings with a constant small density of changes, the algorithm is
73 linear in the strings size. This is unlikely in typical uses of
74 fstrcmp, and so is usually compiled out. Besides, there is no
75 interface to set it true. */
76 static int heuristic;
78 #endif
81 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
82 point furthest along the given diagonal in the forward search of the
83 edit matrix. */
84 static int *fdiag;
86 /* Vector, indexed by diagonal, containing the X coordinate of the point
87 furthest along the given diagonal in the backward search of the edit
88 matrix. */
89 static int *bdiag;
91 /* Edit scripts longer than this are too expensive to compute. */
92 static int too_expensive;
94 /* Snakes bigger than this are considered `big'. */
95 #define SNAKE_LIMIT 20
97 struct partition
99 /* Midpoints of this partition. */
100 int xmid, ymid;
102 /* Nonzero if low half will be analyzed minimally. */
103 int lo_minimal;
105 /* Likewise for high half. */
106 int hi_minimal;
110 /* NAME
111 diag - find diagonal path
113 SYNOPSIS
114 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
115 struct partition *part);
117 DESCRIPTION
118 Find the midpoint of the shortest edit script for a specified
119 portion of the two strings.
121 Scan from the beginnings of the strings, and simultaneously from
122 the ends, doing a breadth-first search through the space of
123 edit-sequence. When the two searches meet, we have found the
124 midpoint of the shortest edit sequence.
126 If MINIMAL is nonzero, find the minimal edit script regardless
127 of expense. Otherwise, if the search is too expensive, use
128 heuristics to stop the search and report a suboptimal answer.
130 RETURNS
131 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
132 number XMID - YMID equals the number of inserted characters
133 minus the number of deleted characters (counting only characters
134 before the midpoint). Return the approximate edit cost; this is
135 the total number of characters inserted or deleted (counting
136 only characters before the midpoint), unless a heuristic is used
137 to terminate the search prematurely.
139 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
140 for the left half of the partition is known; similarly for
141 PART->RIGHT_MINIMAL.
143 CAVEAT
144 This function assumes that the first characters of the specified
145 portions of the two strings do not match, and likewise that the
146 last characters do not match. The caller must trim matching
147 characters from the beginning and end of the portions it is
148 going to specify.
150 If we return the "wrong" partitions, the worst this can do is
151 cause suboptimal diff output. It cannot cause incorrect diff
152 output. */
154 static int diag PARAMS ((int, int, int, int, int, struct partition *));
156 static int
157 diag (int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part)
159 int *const fd = fdiag; /* Give the compiler a chance. */
160 int *const bd = bdiag; /* Additional help for the compiler. */
161 const char *const xv = string[0].data; /* Still more help for the compiler. */
162 const char *const yv = string[1].data; /* And more and more . . . */
163 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
164 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
165 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
166 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
167 int fmin = fmid;
168 int fmax = fmid; /* Limits of top-down search. */
169 int bmin = bmid;
170 int bmax = bmid; /* Limits of bottom-up search. */
171 int c; /* Cost. */
172 int odd = (fmid - bmid) & 1;
175 * True if southeast corner is on an odd diagonal with respect
176 * to the northwest.
178 fd[fmid] = xoff;
179 bd[bmid] = xlim;
180 for (c = 1;; ++c)
182 int d; /* Active diagonal. */
183 int big_snake;
185 big_snake = 0;
186 /* Extend the top-down search by an edit step in each diagonal. */
187 if (fmin > dmin)
188 fd[--fmin - 1] = -1;
189 else
190 ++fmin;
191 if (fmax < dmax)
192 fd[++fmax + 1] = -1;
193 else
194 --fmax;
195 for (d = fmax; d >= fmin; d -= 2)
197 int x;
198 int y;
199 int oldx;
200 int tlo;
201 int thi;
203 tlo = fd[d - 1],
204 thi = fd[d + 1];
206 if (tlo >= thi)
207 x = tlo + 1;
208 else
209 x = thi;
210 oldx = x;
211 y = x - d;
212 while (x < xlim && y < ylim && xv[x] == yv[y])
214 ++x;
215 ++y;
217 if (x - oldx > SNAKE_LIMIT)
218 big_snake = 1;
219 fd[d] = x;
220 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
222 part->xmid = x;
223 part->ymid = y;
224 part->lo_minimal = part->hi_minimal = 1;
225 return 2 * c - 1;
228 /* Similarly extend the bottom-up search. */
229 if (bmin > dmin)
230 bd[--bmin - 1] = INT_MAX;
231 else
232 ++bmin;
233 if (bmax < dmax)
234 bd[++bmax + 1] = INT_MAX;
235 else
236 --bmax;
237 for (d = bmax; d >= bmin; d -= 2)
239 int x;
240 int y;
241 int oldx;
242 int tlo;
243 int thi;
245 tlo = bd[d - 1],
246 thi = bd[d + 1];
247 if (tlo < thi)
248 x = tlo;
249 else
250 x = thi - 1;
251 oldx = x;
252 y = x - d;
253 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
255 --x;
256 --y;
258 if (oldx - x > SNAKE_LIMIT)
259 big_snake = 1;
260 bd[d] = x;
261 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
263 part->xmid = x;
264 part->ymid = y;
265 part->lo_minimal = part->hi_minimal = 1;
266 return 2 * c;
270 if (minimal)
271 continue;
273 #ifdef MINUS_H_FLAG
274 /* Heuristic: check occasionally for a diagonal that has made lots
275 of progress compared with the edit distance. If we have any
276 such, find the one that has made the most progress and return
277 it as if it had succeeded.
279 With this heuristic, for strings with a constant small density
280 of changes, the algorithm is linear in the strings size. */
281 if (c > 200 && big_snake && heuristic)
283 int best;
285 best = 0;
286 for (d = fmax; d >= fmin; d -= 2)
288 int dd;
289 int x;
290 int y;
291 int v;
293 dd = d - fmid;
294 x = fd[d];
295 y = x - d;
296 v = (x - xoff) * 2 - dd;
298 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
302 v > best
304 xoff + SNAKE_LIMIT <= x
306 x < xlim
308 yoff + SNAKE_LIMIT <= y
310 y < ylim
313 /* We have a good enough best diagonal; now insist
314 that it end with a significant snake. */
315 int k;
317 for (k = 1; xv[x - k] == yv[y - k]; k++)
319 if (k == SNAKE_LIMIT)
321 best = v;
322 part->xmid = x;
323 part->ymid = y;
324 break;
330 if (best > 0)
332 part->lo_minimal = 1;
333 part->hi_minimal = 0;
334 return 2 * c - 1;
336 best = 0;
337 for (d = bmax; d >= bmin; d -= 2)
339 int dd;
340 int x;
341 int y;
342 int v;
344 dd = d - bmid;
345 x = bd[d];
346 y = x - d;
347 v = (xlim - x) * 2 + dd;
349 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
351 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
352 yoff < y && y <= ylim - SNAKE_LIMIT)
354 /* We have a good enough best diagonal; now insist
355 that it end with a significant snake. */
356 int k;
358 for (k = 0; xv[x + k] == yv[y + k]; k++)
360 if (k == SNAKE_LIMIT - 1)
362 best = v;
363 part->xmid = x;
364 part->ymid = y;
365 break;
371 if (best > 0)
373 part->lo_minimal = 0;
374 part->hi_minimal = 1;
375 return 2 * c - 1;
378 #endif /* MINUS_H_FLAG */
380 /* Heuristic: if we've gone well beyond the call of duty, give up
381 and report halfway between our best results so far. */
382 if (c >= too_expensive)
384 int fxybest;
385 int fxbest;
386 int bxybest;
387 int bxbest;
389 /* Pacify `gcc -Wall'. */
390 fxbest = 0;
391 bxbest = 0;
393 /* Find forward diagonal that maximizes X + Y. */
394 fxybest = -1;
395 for (d = fmax; d >= fmin; d -= 2)
397 int x;
398 int y;
400 x = fd[d] < xlim ? fd[d] : xlim;
401 y = x - d;
403 if (ylim < y)
405 x = ylim + d;
406 y = ylim;
408 if (fxybest < x + y)
410 fxybest = x + y;
411 fxbest = x;
414 /* Find backward diagonal that minimizes X + Y. */
415 bxybest = INT_MAX;
416 for (d = bmax; d >= bmin; d -= 2)
418 int x;
419 int y;
421 x = xoff > bd[d] ? xoff : bd[d];
422 y = x - d;
424 if (y < yoff)
426 x = yoff + d;
427 y = yoff;
429 if (x + y < bxybest)
431 bxybest = x + y;
432 bxbest = x;
435 /* Use the better of the two diagonals. */
436 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
438 part->xmid = fxbest;
439 part->ymid = fxybest - fxbest;
440 part->lo_minimal = 1;
441 part->hi_minimal = 0;
443 else
445 part->xmid = bxbest;
446 part->ymid = bxybest - bxbest;
447 part->lo_minimal = 0;
448 part->hi_minimal = 1;
450 return 2 * c - 1;
456 /* NAME
457 compareseq - find edit sequence
459 SYNOPSIS
460 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
462 DESCRIPTION
463 Compare in detail contiguous subsequences of the two strings
464 which are known, as a whole, to match each other.
466 The subsequence of string 0 is [XOFF, XLIM) and likewise for
467 string 1.
469 Note that XLIM, YLIM are exclusive bounds. All character
470 numbers are origin-0.
472 If MINIMAL is nonzero, find a minimal difference no matter how
473 expensive it is. */
475 static void compareseq PARAMS ((int, int, int, int, int));
477 static void
478 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal)
480 const char *const xv = string[0].data; /* Help the compiler. */
481 const char *const yv = string[1].data;
483 if (string[1].edit_count + string[0].edit_count > max_edits)
484 return ;
486 /* Slide down the bottom initial diagonal. */
487 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
489 ++xoff;
490 ++yoff;
493 /* Slide up the top initial diagonal. */
494 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
496 --xlim;
497 --ylim;
500 /* Handle simple cases. */
501 if (xoff == xlim)
503 while (yoff < ylim)
505 ++string[1].edit_count;
506 ++yoff;
509 else if (yoff == ylim)
511 while (xoff < xlim)
513 ++string[0].edit_count;
514 ++xoff;
517 else
519 int c;
520 struct partition part;
522 /* Find a point of correspondence in the middle of the strings. */
523 c = diag (xoff, xlim, yoff, ylim, minimal, &part);
524 if (c == 1)
527 #if 0
528 / * This should be impossible, because it implies that one of
529 the two subsequences is empty, and that case was handled
530 above without calling `diag'. Let's verify that this is
531 true. * /
532 abort ();
533 #else
535 /* The two subsequences differ by a single insert or delete;
536 record it and we are done. */
537 if (part.xmid - part.ymid < xoff - yoff)
538 ++string[1].edit_count;
539 else
540 ++string[0].edit_count;
541 //#endif
543 else
545 /* Use the partitions to split this problem into subproblems. */
546 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
547 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
553 /* NAME
554 fstrcmp - fuzzy string compare
556 SYNOPSIS
557 double fstrcmp(const char *, const char *, double);
559 DESCRIPTION
560 The fstrcmp function may be used to compare two string for
561 similarity. It is very useful in reducing "cascade" or
562 "secondary" errors in compilers or other situations where
563 symbol tables occur.
565 RETURNS
566 double; 0 if the strings are entirly dissimilar, 1 if the
567 strings are identical, and a number in between if they are
568 similar. */
570 double
571 fstrcmp (const char *string1, const char *string2, double minimum)
573 int i;
575 size_t fdiag_len;
576 static int *fdiag_buf;
577 static size_t fdiag_max;
579 /* set the info for each string. */
580 string[0].data = string1;
581 string[0].data_length = (int)strlen (string1);
582 string[1].data = string2;
583 string[1].data_length = (int)strlen (string2);
585 /* short-circuit obvious comparisons */
586 if (string[0].data_length == 0 && string[1].data_length == 0)
587 return 1.0;
588 if (string[0].data_length == 0 || string[1].data_length == 0)
589 return 0.0;
591 /* Set TOO_EXPENSIVE to be approximate square root of input size,
592 bounded below by 256. */
593 too_expensive = 1;
594 for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
595 too_expensive <<= 1;
596 if (too_expensive < 256)
597 too_expensive = 256;
599 /* Because fstrcmp is typically called multiple times, while scanning
600 symbol tables, etc, attempt to minimize the number of memory
601 allocations performed. Thus, we use a static buffer for the
602 diagonal vectors, and never free them. */
603 fdiag_len = string[0].data_length + string[1].data_length + 3;
604 if (fdiag_len > fdiag_max)
606 fdiag_max = fdiag_len;
607 fdiag_buf = (int*)realloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
609 fdiag = fdiag_buf + string[1].data_length + 1;
610 bdiag = fdiag + fdiag_len;
612 max_edits = (int) (1 + (string[0].data_length + string[1].data_length) * (1. - minimum));
614 /* Now do the main comparison algorithm */
615 string[0].edit_count = 0;
616 string[1].edit_count = 0;
617 compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
619 /* The result is
620 ((number of chars in common) / (average length of the strings)).
621 This is admittedly biased towards finding that the strings are
622 similar, however it does produce meaningful results. */
623 return ((double)
624 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
625 / (string[0].data_length + string[1].data_length));