1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapeable with backslash, "\*" or "\?"
29 WLD if WLD <= nLimit, else nLimit+1
33 -WLD if Replace and Insert and Delete <= nLimit
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
60 #if defined( _MSC_VER )
61 #pragma warning(once: 4068)
70 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
71 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
73 static sal_Int32
Impl_WLD_StringLen( const sal_Unicode
* pStr
)
75 const sal_Unicode
* pTempStr
= pStr
;
78 return (sal_Int32
)(pTempStr
-pStr
);
81 // Distance from string to pattern
82 int WLevDistance::WLD( const sal_Unicode
* cString
, sal_Int32 nStringLen
)
84 int nSPMin
= 0; // penalty point Minimum
85 int nRepS
= 0; // for SplitCount
87 // length difference between pattern and string
88 int nLenDiff
= nPatternLen
- nStars
- nStringLen
;
89 // more insertions or deletions necessary as the limit? Then leave
90 if ( (nLenDiff
* nInsQ0
> nLimit
)
91 || ((nStars
== 0) && (nLenDiff
* nDelR0
< -nLimit
)) )
94 // comparative String greater than instantaneous array
95 // -> adapt array size
96 if ( nStringLen
>= nArrayLen
)
98 // increase size much more to avoid reallocation
99 if ( nStringLen
< LEVDISDOUBLEBUF
)
100 nArrayLen
= 2 * nStringLen
;
102 nArrayLen
= nStringLen
+ 1;
103 npDistance
= aDisMem
.NewMem( nArrayLen
);
106 // Calculate start values of the second column (first pattern value).
107 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
108 // therefore the minimum is 0
109 if ( nPatternLen
== 0 )
111 // Count of deletions to reach pattern
112 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
113 npDistance
[i
] = i
* nDelR0
;
115 else if ( cpPattern
[0] == '*' && bpPatIsWild
[0] )
117 // instead of a '*' you can fit in anything
118 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
126 if ( c
== '?' && bpPatIsWild
[0] )
127 nP
= 0; // a '?' could be any character.
129 // Minimum of replacement and deletion+insertion weighting
130 nP
= Min3( nRepP0
, nRepP0
, nDelR0
+ nInsQ0
);
131 npDistance
[0] = nInsQ0
; // start with simple insert
132 npDistance
[1] = nInsQ0
;
133 npDistance
[2] = nInsQ0
;
134 int nReplacePos
= -1; // tristate flag
136 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++, nDelCnt
+= nDelR0
)
138 if ( cString
[i
-1] == c
)
139 nP
= 0; // Replace from this position is 0
140 // Deletions to match pattern + Replace
141 npDistance
[i
] = nDelCnt
+ nP
;
144 if ( nReplacePos
< 0 && nP
)
145 { // this position will be replaced
149 else if ( nReplacePos
> 0 && !nP
)
152 int nBalance
= levdisbalance( 0, i
-1, c
, cString
, nStringLen
);
154 { // one was replaced that was an insertion instead
161 nSPMin
= Min3( npDistance
[0], npDistance
[1], npDistance
[2] );
164 // calculate distance matrix
165 sal_Int32 j
= 0; // for all columns of the pattern, till limit is not reached
166 while ( (j
< nPatternLen
-1)
167 && nSPMin
<= (bSplitCount
? 2 * nLimit
: nLimit
) )
170 int nP
, nQ
, nR
, nPij
, d2
;
174 if ( bpPatIsWild
[j
] ) // '*' or '?' not escaped
175 nP
= 0; // could be replaced without penalty
178 if ( c
== '*' && bpPatIsWild
[j
] )
180 nQ
= 0; // instertion and deletion without penalty
185 nQ
= nInsQ0
; // usual weighting
189 // increase insert count to get from null string to pattern
190 npDistance
[0] = npDistance
[0] + nQ
;
191 nSPMin
= npDistance
[0];
192 int nReplacePos
= -1; // tristate flag
193 // for each pattern column run through the string
194 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++ )
196 int d1
= d2
; // WLD( X(i-1), Y(j-1) )
197 d2
= npDistance
[i
]; // WLD( X(i) , Y(j-1) )
198 if ( cString
[i
-1] == c
)
201 if ( nReplacePos
< 0 )
204 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
206 nReplacePos
= 0; // no replacement
211 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
212 // WLD( X(i) , Y(j-1) ) + q ,
213 // WLD( X(i-1), Y(j) ) + r )
214 npDistance
[i
] = Min3( d1
+ nPij
, d2
+ nQ
, npDistance
[i
-1] + nR
);
215 if ( npDistance
[i
] < nSPMin
)
216 nSPMin
= npDistance
[i
];
219 if ( nReplacePos
< 0 && nPij
&& npDistance
[i
] == d1
+ nPij
)
220 { // this position will be replaced
224 else if ( nReplacePos
> 0 && !nPij
)
226 // character is equal in string and pattern
228 // If from this point:
229 // * pattern and string have the same count of this
231 // * and character count is the same before this position
232 // then the replace was none.
234 // Scrambled letters are recognized here and the nRepS
235 // replacement is withdrawn, whereby the double limit kicks
239 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
241 { // one was replaced that was an insertion instead
249 if ( (nSPMin
<= nLimit
) && (npDistance
[nStringLen
] <= nLimit
) )
250 return npDistance
[nStringLen
];
255 if ( nRepS
&& nLenDiff
> 0 )
256 nRepS
-= nLenDiff
; // Inserts were counted
257 if ( (nSPMin
<= 2 * nLimit
)
258 && (npDistance
[nStringLen
] <= 2 * nLimit
)
259 && (nRepS
* nRepP0
<= nLimit
) )
260 return -npDistance
[nStringLen
];
267 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
268 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
269 int WLevDistance::CalcLPQR( int nX
, int nY
, int nZ
, bool bRelaxed
)
271 if ( nX
< 0 ) nX
= 0; // only positive values
272 if ( nY
< 0 ) nY
= 0;
273 if ( nZ
< 0 ) nZ
= 0;
274 if (0 == Min3( nX
, nY
, nZ
)) // at least one 0
277 nMax
= Max3( nX
, nY
, nZ
); // either 0 for three 0s or Max
278 if ( 0 == (nMid
= Mid3( nX
, nY
, nZ
)) ) // even two 0
279 nLimit
= nMax
; // either 0 or the only one >0
281 nLimit
= LCM( nMid
, nMax
);
283 else // all three of them are not 0
284 nLimit
= LCM( LCM( nX
, nY
), nZ
);
285 nRepP0
= ( nX
? nLimit
/ nX
: nLimit
+ 1 );
286 nInsQ0
= ( nY
? nLimit
/ nY
: nLimit
+ 1 );
287 nDelR0
= ( nZ
? nLimit
/ nZ
: nLimit
+ 1 );
288 bSplitCount
= bRelaxed
;
292 // greatest common divisior according to Euklid (chaindivision)
293 // special case: 0 plus anything produces 1
294 int WLevDistance::GCD( int a
, int b
)
310 // least common multiple : a * b / GCD(a,b)
311 int WLevDistance::LCM( int a
, int b
)
313 if ( a
> b
) // decrease owerflow chance
314 return( (a
/ GCD(a
,b
)) * b
);
316 return( (b
/ GCD(a
,b
)) * a
);
319 // Minimum of three values
320 inline int WLevDistance::Min3( int x
, int y
, int z
)
323 return( x
< z
? x
: z
);
325 return( y
< z
? y
: z
);
328 // The value in the middle
329 int WLevDistance::Mid3( int x
, int y
, int z
)
331 int min
= Min3(x
,y
,z
);
333 return( y
< z
? y
: z
);
335 return( x
< z
? x
: z
);
337 return( x
< y
? x
: y
);
340 // Maximum of three values
341 int WLevDistance::Max3( int x
, int y
, int z
)
344 return( x
> z
? x
: z
);
346 return( y
> z
? y
: z
);
349 // initialize data from CTOR
350 void WLevDistance::InitData( const sal_Unicode
* cPattern
)
352 cpPattern
= aPatMem
.GetcPtr();
353 bpPatIsWild
= aPatMem
.GetbPtr();
354 npDistance
= aDisMem
.GetPtr();
356 const sal_Unicode
* cp1
= cPattern
;
357 sal_Unicode
* cp2
= cpPattern
;
358 bool* bp
= bpPatIsWild
;
359 // copy pattern, count asterisks, escaped Jokers
362 if ( *cp1
== '\\' ) // maybe escaped
364 if ( *(cp1
+1) == '*' || *(cp1
+1) == '?' ) // next Joker?
371 else if ( *cp1
== '*' || *cp1
== '?' ) // Joker
384 WLevDistance::WLevDistance( const sal_Unicode
* cPattern
,
385 int nOtherX
, int nShorterY
, int nLongerZ
,
387 nPatternLen( Impl_WLD_StringLen(cPattern
) ),
388 aPatMem( nPatternLen
+ 1 ),
389 nArrayLen( nPatternLen
+ 1 ),
392 InitData( cPattern
);
393 CalcLPQR( nOtherX
, nShorterY
, nLongerZ
, bRelaxed
);
397 WLevDistance::WLevDistance( const WLevDistance
& rWLD
) :
398 nPatternLen( rWLD
.nPatternLen
),
399 aPatMem( nPatternLen
+ 1 ),
400 nArrayLen( nPatternLen
+ 1 ),
401 aDisMem( nArrayLen
),
402 nLimit( rWLD
.nLimit
),
403 nRepP0( rWLD
.nRepP0
),
404 nInsQ0( rWLD
.nInsQ0
),
405 nDelR0( rWLD
.nDelR0
),
406 nStars( rWLD
.nStars
),
407 bSplitCount( rWLD
.bSplitCount
)
409 cpPattern
= aPatMem
.GetcPtr();
410 bpPatIsWild
= aPatMem
.GetbPtr();
411 npDistance
= aDisMem
.GetPtr();
413 for ( i
=0; i
<nPatternLen
; i
++ )
415 cpPattern
[i
] = rWLD
.cpPattern
[i
];
416 bpPatIsWild
[i
] = rWLD
.bpPatIsWild
[i
];
422 WLevDistance::~WLevDistance()
426 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */