1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 Derived from GNU diff 2.7, analyze.c et al.
21 The basic algorithm is described in:
22 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24 see especially section 4.2, which describes the variation used below.
26 The basic algorithm was independently discovered as described in:
27 "Algorithms for Approximate String Matching", E. Ukkonen,
28 Information and Control Vol. 64, 1985, pp. 100-118.
30 Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
31 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
32 at the price of producing suboptimal output for large inputs with
35 Modified to work on strings rather than files
36 by Peter Miller <pmiller@agso.gov.au>, October 1995 */
53 * Data on one input string being compared.
57 /* The string to be compared. */
60 /* The length of the string to be compared. */
63 /* The number of characters inserted or deleted. */
67 static struct string_data string
[2];
72 /* This corresponds to the diff -H flag. With this heuristic, for
73 strings with a constant small density of changes, the algorithm is
74 linear in the strings size. This is unlikely in typical uses of
75 fstrcmp, and so is usually compiled out. Besides, there is no
76 interface to set it true. */
82 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
83 point furthest along the given diagonal in the forward search of the
87 /* Vector, indexed by diagonal, containing the X coordinate of the point
88 furthest along the given diagonal in the backward search of the edit
92 /* Edit scripts longer than this are too expensive to compute. */
93 static int too_expensive
;
95 /* Snakes bigger than this are considered `big'. */
96 #define SNAKE_LIMIT 20
100 /* Midpoints of this partition. */
103 /* Nonzero if low half will be analyzed minimally. */
106 /* Likewise for high half. */
112 diag - find diagonal path
115 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
116 struct partition *part);
119 Find the midpoint of the shortest edit script for a specified
120 portion of the two strings.
122 Scan from the beginnings of the strings, and simultaneously from
123 the ends, doing a breadth-first search through the space of
124 edit-sequence. When the two searches meet, we have found the
125 midpoint of the shortest edit sequence.
127 If MINIMAL is nonzero, find the minimal edit script regardless
128 of expense. Otherwise, if the search is too expensive, use
129 heuristics to stop the search and report a suboptimal answer.
132 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
133 number XMID - YMID equals the number of inserted characters
134 minus the number of deleted characters (counting only characters
135 before the midpoint). Return the approximate edit cost; this is
136 the total number of characters inserted or deleted (counting
137 only characters before the midpoint), unless a heuristic is used
138 to terminate the search prematurely.
140 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
141 for the left half of the partition is known; similarly for
145 This function assumes that the first characters of the specified
146 portions of the two strings do not match, and likewise that the
147 last characters do not match. The caller must trim matching
148 characters from the beginning and end of the portions it is
151 If we return the "wrong" partitions, the worst this can do is
152 cause suboptimal diff output. It cannot cause incorrect diff
156 diag (int xoff
, int xlim
, int yoff
, int ylim
, int minimal
,
157 struct partition
*part
)
159 int *const fd
= fdiag
; /* Give the compiler a chance. */
160 int *const bd
= bdiag
; /* Additional help for the compiler. */
161 const char *const xv
= string
[0].data
; /* Still more help for the compiler. */
162 const char *const yv
= string
[1].data
; /* And more and more . . . */
163 const int dmin
= xoff
- ylim
; /* Minimum valid diagonal. */
164 const int dmax
= xlim
- yoff
; /* Maximum valid diagonal. */
165 const int fmid
= xoff
- yoff
; /* Center diagonal of top-down search. */
166 const int bmid
= xlim
- ylim
; /* Center diagonal of bottom-up search. */
168 int fmax
= fmid
; /* Limits of top-down search. */
170 int bmax
= bmid
; /* Limits of bottom-up search. */
172 int odd
= (fmid
- bmid
) & 1;
175 * True if southeast corner is on an odd diagonal with respect
182 int d
; /* Active diagonal. */
186 /* Extend the top-down search by an edit step in each diagonal. */
195 for (d
= fmax
; d
>= fmin
; d
-= 2)
212 while (x
< xlim
&& y
< ylim
&& xv
[x
] == yv
[y
])
217 if (x
- oldx
> SNAKE_LIMIT
)
220 if (odd
&& bmin
<= d
&& d
<= bmax
&& bd
[d
] <= x
)
224 part
->lo_minimal
= part
->hi_minimal
= 1;
228 /* Similarly extend the bottom-up search. */
230 bd
[--bmin
- 1] = INT_MAX
;
234 bd
[++bmax
+ 1] = INT_MAX
;
237 for (d
= bmax
; d
>= bmin
; d
-= 2)
253 while (x
> xoff
&& y
> yoff
&& xv
[x
- 1] == yv
[y
- 1])
258 if (oldx
- x
> SNAKE_LIMIT
)
261 if (!odd
&& fmin
<= d
&& d
<= fmax
&& x
<= fd
[d
])
265 part
->lo_minimal
= part
->hi_minimal
= 1;
274 /* Heuristic: check occasionally for a diagonal that has made lots
275 of progress compared with the edit distance. If we have any
276 such, find the one that has made the most progress and return
277 it as if it had succeeded.
279 With this heuristic, for strings with a constant small density
280 of changes, the algorithm is linear in the strings size. */
281 if (c
> 200 && big_snake
&& heuristic
)
286 for (d
= fmax
; d
>= fmin
; d
-= 2)
296 v
= (x
- xoff
) * 2 - dd
;
298 if (v
> 12 * (c
+ (dd
< 0 ? -dd
: dd
)))
304 xoff
+ SNAKE_LIMIT
<= x
308 yoff
+ SNAKE_LIMIT
<= y
313 /* We have a good enough best diagonal; now insist
314 that it end with a significant snake. */
317 for (k
= 1; xv
[x
- k
] == yv
[y
- k
]; k
++)
319 if (k
== SNAKE_LIMIT
)
332 part
->lo_minimal
= 1;
333 part
->hi_minimal
= 0;
337 for (d
= bmax
; d
>= bmin
; d
-= 2)
347 v
= (xlim
- x
) * 2 + dd
;
349 if (v
> 12 * (c
+ (dd
< 0 ? -dd
: dd
)))
351 if (v
> best
&& xoff
< x
&& x
<= xlim
- SNAKE_LIMIT
&&
352 yoff
< y
&& y
<= ylim
- SNAKE_LIMIT
)
354 /* We have a good enough best diagonal; now insist
355 that it end with a significant snake. */
358 for (k
= 0; xv
[x
+ k
] == yv
[y
+ k
]; k
++)
360 if (k
== SNAKE_LIMIT
- 1)
373 part
->lo_minimal
= 0;
374 part
->hi_minimal
= 1;
378 #endif /* MINUS_H_FLAG */
380 /* Heuristic: if we've gone well beyond the call of duty, give up
381 and report halfway between our best results so far. */
382 if (c
>= too_expensive
)
389 /* Pacify `gcc -Wall'. */
393 /* Find forward diagonal that maximizes X + Y. */
395 for (d
= fmax
; d
>= fmin
; d
-= 2)
400 x
= fd
[d
] < xlim
? fd
[d
] : xlim
;
414 /* Find backward diagonal that minimizes X + Y. */
416 for (d
= bmax
; d
>= bmin
; d
-= 2)
421 x
= xoff
> bd
[d
] ? xoff
: bd
[d
];
435 /* Use the better of the two diagonals. */
436 if ((xlim
+ ylim
) - bxybest
< fxybest
- (xoff
+ yoff
))
439 part
->ymid
= fxybest
- fxbest
;
440 part
->lo_minimal
= 1;
441 part
->hi_minimal
= 0;
446 part
->ymid
= bxybest
- bxbest
;
447 part
->lo_minimal
= 0;
448 part
->hi_minimal
= 1;
457 compareseq - find edit sequence
460 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
463 Compare in detail contiguous subsequences of the two strings
464 which are known, as a whole, to match each other.
466 The subsequence of string 0 is [XOFF, XLIM) and likewise for
469 Note that XLIM, YLIM are exclusive bounds. All character
470 numbers are origin-0.
472 If MINIMAL is nonzero, find a minimal difference no matter how
476 compareseq (int xoff
, int xlim
, int yoff
, int ylim
, int minimal
)
478 const char *const xv
= string
[0].data
; /* Help the compiler. */
479 const char *const yv
= string
[1].data
;
481 /* Slide down the bottom initial diagonal. */
482 while (xoff
< xlim
&& yoff
< ylim
&& xv
[xoff
] == yv
[yoff
])
488 /* Slide up the top initial diagonal. */
489 while (xlim
> xoff
&& ylim
> yoff
&& xv
[xlim
- 1] == yv
[ylim
- 1])
495 /* Handle simple cases. */
500 ++string
[1].edit_count
;
504 else if (yoff
== ylim
)
508 ++string
[0].edit_count
;
515 struct partition part
;
517 /* Find a point of correspondence in the middle of the strings. */
518 c
= diag (xoff
, xlim
, yoff
, ylim
, minimal
, &part
);
522 /* This should be impossible, because it implies that one of
523 the two subsequences is empty, and that case was handled
524 above without calling `diag'. Let's verify that this is
528 /* The two subsequences differ by a single insert or delete;
529 record it and we are done. */
530 if (part
.xmid
- part
.ymid
< xoff
- yoff
)
531 ++string
[1].edit_count
;
533 ++string
[0].edit_count
;
538 /* Use the partitions to split this problem into subproblems. */
539 compareseq (xoff
, part
.xmid
, yoff
, part
.ymid
, part
.lo_minimal
);
540 compareseq (part
.xmid
, xlim
, part
.ymid
, ylim
, part
.hi_minimal
);
547 fstrcmp - fuzzy string compare
550 double fstrcmp(const char *, const char *);
553 The fstrcmp function may be used to compare two string for
554 similarity. It is very useful in reducing "cascade" or
555 "secondary" errors in compilers or other situations where
559 double; 0 if the strings are entirly dissimilar, 1 if the
560 strings are identical, and a number in between if they are
564 fstrcmp (const char *string1
, const char *string2
)
569 static int *fdiag_buf
;
570 static size_t fdiag_max
;
572 /* set the info for each string. */
573 string
[0].data
= string1
;
574 string
[0].data_length
= strlen (string1
);
575 string
[1].data
= string2
;
576 string
[1].data_length
= strlen (string2
);
578 /* short-circuit obvious comparisons */
579 if (string
[0].data_length
== 0 && string
[1].data_length
== 0)
581 if (string
[0].data_length
== 0 || string
[1].data_length
== 0)
584 /* Set TOO_EXPENSIVE to be approximate square root of input size,
585 bounded below by 256. */
587 for (i
= string
[0].data_length
+ string
[1].data_length
; i
!= 0; i
>>= 2)
589 if (too_expensive
< 256)
592 /* Because fstrcmp is typically called multiple times, while scanning
593 symbol tables, etc, attempt to minimize the number of memory
594 allocations performed. Thus, we use a static buffer for the
595 diagonal vectors, and never free them. */
596 fdiag_len
= string
[0].data_length
+ string
[1].data_length
+ 3;
597 if (fdiag_len
> fdiag_max
)
599 fdiag_max
= fdiag_len
;
600 fdiag_buf
= xrealloc (fdiag_buf
, fdiag_max
* (2 * sizeof (int)));
602 fdiag
= fdiag_buf
+ string
[1].data_length
+ 1;
603 bdiag
= fdiag
+ fdiag_len
;
605 /* Now do the main comparison algorithm */
606 string
[0].edit_count
= 0;
607 string
[1].edit_count
= 0;
608 compareseq (0, string
[0].data_length
, 0, string
[1].data_length
, 0);
611 ((number of chars in common) / (average length of the strings)).
612 This is admittedly biased towards finding that the strings are
613 similar, however it does produce meaningful results. */
614 return ((double) (string
[0].data_length
+ string
[1].data_length
615 - string
[1].edit_count
- string
[0].edit_count
)
616 / (string
[0].data_length
+ string
[1].data_length
));