1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapable with backslash, "\*" or "\?"
29 WLD if WLD <= nLimit, else nLimit+1
33 -WLD if Replace and Insert and Delete <= nLimit
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
62 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
63 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
65 static sal_Int32
Impl_WLD_StringLen( const sal_Unicode
* pStr
)
67 const sal_Unicode
* pTempStr
= pStr
;
70 return static_cast<sal_Int32
>(pTempStr
-pStr
);
73 // Distance from string to pattern
74 int WLevDistance::WLD( const sal_Unicode
* cString
, sal_Int32 nStringLen
)
76 int nSPMin
= 0; // penalty point Minimum
77 int nRepS
= 0; // for SplitCount
79 // length difference between pattern and string
80 int nLenDiff
= nPatternLen
- nStars
- nStringLen
;
81 // more insertions or deletions necessary as the limit? Then leave
82 if ( (nLenDiff
* nInsQ0
> nLimit
)
83 || ((nStars
== 0) && (nLenDiff
* nDelR0
< -nLimit
)) )
86 // comparative String greater than instantaneous array
87 // -> adapt array size
88 if ( nStringLen
>= nArrayLen
)
90 // increase size much more to avoid reallocation
91 if ( nStringLen
< LEVDISDOUBLEBUF
)
92 nArrayLen
= 2 * nStringLen
;
94 nArrayLen
= nStringLen
+ 1;
95 npDistance
= aDisMem
.NewMem( nArrayLen
);
98 // Calculate start values of the second column (first pattern value).
99 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
100 // therefore the minimum is 0
101 if ( nPatternLen
== 0 )
103 // Count of deletions to reach pattern
104 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
105 npDistance
[i
] = i
* nDelR0
;
107 else if ( cpPattern
[0] == '*' && bpPatIsWild
[0] )
109 // instead of a '*' you can fit in anything
110 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
118 if ( c
== '?' && bpPatIsWild
[0] )
119 nP
= 0; // a '?' could be any character.
121 // Minimum of replacement and deletion+insertion weighting
122 nP
= Min3( nRepP0
, nRepP0
, nDelR0
+ nInsQ0
);
123 npDistance
[0] = nInsQ0
; // start with simple insert
124 npDistance
[1] = nInsQ0
;
125 npDistance
[2] = nInsQ0
;
126 int nReplacePos
= -1; // tristate flag
128 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++, nDelCnt
+= nDelR0
)
130 if ( cString
[i
-1] == c
)
131 nP
= 0; // Replace from this position is 0
132 // Deletions to match pattern + Replace
133 npDistance
[i
] = nDelCnt
+ nP
;
136 if ( nReplacePos
< 0 && nP
)
137 { // this position will be replaced
141 else if ( nReplacePos
> 0 && !nP
)
144 int nBalance
= levdisbalance( 0, i
-1, c
, cString
, nStringLen
);
146 { // one was replaced that was an insertion instead
153 nSPMin
= Min3( npDistance
[0], npDistance
[1], npDistance
[2] );
156 // calculate distance matrix
157 sal_Int32 j
= 0; // for all columns of the pattern, till limit is not reached
158 while ( (j
< nPatternLen
-1)
159 && nSPMin
<= (bSplitCount
? 2 * nLimit
: nLimit
) )
162 int nP
, nQ
, nR
, nPij
, d2
;
166 if ( bpPatIsWild
[j
] ) // '*' or '?' not escaped
167 nP
= 0; // could be replaced without penalty
170 if ( c
== '*' && bpPatIsWild
[j
] )
172 nQ
= 0; // insertion and deletion without penalty
177 nQ
= nInsQ0
; // usual weighting
181 // increase insert count to get from null string to pattern
182 npDistance
[0] = npDistance
[0] + nQ
;
183 nSPMin
= npDistance
[0];
184 int nReplacePos
= -1; // tristate flag
185 // for each pattern column run through the string
186 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++ )
188 int d1
= d2
; // WLD( X(i-1), Y(j-1) )
189 d2
= npDistance
[i
]; // WLD( X(i) , Y(j-1) )
190 if ( cString
[i
-1] == c
)
193 if ( nReplacePos
< 0 )
196 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
198 nReplacePos
= 0; // no replacement
203 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
204 // WLD( X(i) , Y(j-1) ) + q ,
205 // WLD( X(i-1), Y(j) ) + r )
206 npDistance
[i
] = Min3( d1
+ nPij
, d2
+ nQ
, npDistance
[i
-1] + nR
);
207 if ( npDistance
[i
] < nSPMin
)
208 nSPMin
= npDistance
[i
];
211 if ( nReplacePos
< 0 && nPij
&& npDistance
[i
] == d1
+ nPij
)
212 { // this position will be replaced
216 else if ( nReplacePos
> 0 && !nPij
)
218 // character is equal in string and pattern
220 // If from this point:
221 // * pattern and string have the same count of this
223 // * and character count is the same before this position
224 // then the replace was none.
226 // Scrambled letters are recognized here and the nRepS
227 // replacement is withdrawn, whereby the double limit kicks
231 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
233 { // one was replaced that was an insertion instead
241 if ( (nSPMin
<= nLimit
) && (npDistance
[nStringLen
] <= nLimit
) )
242 return npDistance
[nStringLen
];
247 if ( nRepS
&& nLenDiff
> 0 )
248 nRepS
-= nLenDiff
; // Inserts were counted
249 if ( (nSPMin
<= 2 * nLimit
)
250 && (npDistance
[nStringLen
] <= 2 * nLimit
)
251 && (nRepS
* nRepP0
<= nLimit
) )
252 return -npDistance
[nStringLen
];
259 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
260 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
261 void WLevDistance::CalcLPQR( int nX
, int nY
, int nZ
, bool bRelaxed
)
263 if ( nX
< 0 ) nX
= 0; // only positive values
264 if ( nY
< 0 ) nY
= 0;
265 if ( nZ
< 0 ) nZ
= 0;
266 if (0 == Min3( nX
, nY
, nZ
)) // at least one 0
269 nMax
= Max3( nX
, nY
, nZ
); // either 0 for three 0s or Max
270 if ( 0 == (nMid
= Mid3( nX
, nY
, nZ
)) ) // even two 0
271 nLimit
= nMax
; // either 0 or the only one >0
273 nLimit
= LCM( nMid
, nMax
);
275 else // all three of them are not 0
276 nLimit
= LCM( LCM( nX
, nY
), nZ
);
277 nRepP0
= ( nX
? nLimit
/ nX
: nLimit
+ 1 );
278 nInsQ0
= ( nY
? nLimit
/ nY
: nLimit
+ 1 );
279 nDelR0
= ( nZ
? nLimit
/ nZ
: nLimit
+ 1 );
280 bSplitCount
= bRelaxed
;
283 // greatest common divisor according to Euklid (chaindivision)
284 // special case: 0 plus anything produces 1
285 int WLevDistance::GCD( int a
, int b
)
301 // least common multiple : a * b / GCD(a,b)
302 int WLevDistance::LCM( int a
, int b
)
304 if ( a
> b
) // decrease overflow chance
305 return( (a
/ GCD(a
,b
)) * b
);
307 return( (b
/ GCD(a
,b
)) * a
);
310 // Minimum of three values
311 inline int WLevDistance::Min3( int x
, int y
, int z
)
314 return std::min(x
, z
);
316 return std::min(y
, z
);
319 // The value in the middle
320 int WLevDistance::Mid3( int x
, int y
, int z
)
322 int min
= Min3(x
,y
,z
);
324 return std::min(y
, z
);
326 return std::min(x
, z
);
328 return std::min(x
, y
);
331 // Maximum of three values
332 int WLevDistance::Max3( int x
, int y
, int z
)
335 return std::max(x
, z
);
337 return std::max(y
, z
);
340 // initialize data from CTOR
341 void WLevDistance::InitData( const sal_Unicode
* cPattern
)
343 cpPattern
= aPatMem
.GetcPtr();
344 bpPatIsWild
= aPatMem
.GetbPtr();
345 npDistance
= aDisMem
.GetPtr();
347 const sal_Unicode
* cp1
= cPattern
;
348 sal_Unicode
* cp2
= cpPattern
;
349 bool* bp
= bpPatIsWild
;
350 // copy pattern, count asterisks, escaped Jokers
353 if ( *cp1
== '\\' ) // maybe escaped
355 if ( *(cp1
+1) == '*' || *(cp1
+1) == '?' ) // next Joker?
362 else if ( *cp1
== '*' || *cp1
== '?' ) // Joker
375 WLevDistance::WLevDistance( const sal_Unicode
* cPattern
,
376 int nOtherX
, int nShorterY
, int nLongerZ
,
378 nPatternLen( Impl_WLD_StringLen(cPattern
) ),
379 aPatMem( nPatternLen
+ 1 ),
380 nArrayLen( nPatternLen
+ 1 ),
383 InitData( cPattern
);
384 CalcLPQR( nOtherX
, nShorterY
, nLongerZ
, bRelaxed
);
388 WLevDistance::WLevDistance( const WLevDistance
& rWLD
) :
389 nPatternLen( rWLD
.nPatternLen
),
390 aPatMem( nPatternLen
+ 1 ),
391 nArrayLen( nPatternLen
+ 1 ),
392 aDisMem( nArrayLen
),
393 nLimit( rWLD
.nLimit
),
394 nRepP0( rWLD
.nRepP0
),
395 nInsQ0( rWLD
.nInsQ0
),
396 nDelR0( rWLD
.nDelR0
),
397 nStars( rWLD
.nStars
),
398 bSplitCount( rWLD
.bSplitCount
)
400 cpPattern
= aPatMem
.GetcPtr();
401 bpPatIsWild
= aPatMem
.GetbPtr();
402 npDistance
= aDisMem
.GetPtr();
404 for ( i
=0; i
<nPatternLen
; i
++ )
406 cpPattern
[i
] = rWLD
.cpPattern
[i
];
407 bpPatIsWild
[i
] = rWLD
.bpPatIsWild
[i
];
413 WLevDistance::~WLevDistance()
417 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */