1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapable with backslash, "\*" or "\?"
29 WLD if WLD <= nLimit, else nLimit+1
33 -WLD if Replace and Insert and Delete <= nLimit
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
63 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
64 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
66 static sal_Int32
Impl_WLD_StringLen( const sal_Unicode
* pStr
)
68 const sal_Unicode
* pTempStr
= pStr
;
71 return static_cast<sal_Int32
>(pTempStr
-pStr
);
74 // Distance from string to pattern
75 int WLevDistance::WLD( const sal_Unicode
* cString
, sal_Int32 nStringLen
)
77 int nSPMin
= 0; // penalty point Minimum
78 int nRepS
= 0; // for SplitCount
80 // length difference between pattern and string
81 int nLenDiff
= nPatternLen
- nStars
- nStringLen
;
82 // more insertions or deletions necessary as the limit? Then leave
83 if ( (nLenDiff
* nInsQ0
> nLimit
)
84 || ((nStars
== 0) && (nLenDiff
* nDelR0
< -nLimit
)) )
87 // comparative String greater than instantaneous array
88 // -> adapt array size
89 if ( nStringLen
>= nArrayLen
)
91 // increase size much more to avoid reallocation
92 if ( nStringLen
< LEVDISDOUBLEBUF
)
93 nArrayLen
= 2 * nStringLen
;
95 nArrayLen
= nStringLen
+ 1;
96 npDistance
= aDisMem
.NewMem( nArrayLen
);
99 // Calculate start values of the second column (first pattern value).
100 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101 // therefore the minimum is 0
102 if ( nPatternLen
== 0 )
104 // Count of deletions to reach pattern
105 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
106 npDistance
[i
] = i
* nDelR0
;
108 else if ( cpPattern
[0] == '*' && bpPatIsWild
[0] )
110 // instead of a '*' you can fit in anything
111 for ( sal_Int32 i
=0; i
<= nStringLen
; i
++ )
119 if ( c
== '?' && bpPatIsWild
[0] )
120 nP
= 0; // a '?' could be any character.
122 // Minimum of replacement and deletion+insertion weighting
123 nP
= std::min({ nRepP0
, nRepP0
, nDelR0
+ nInsQ0
});
124 npDistance
[0] = nInsQ0
; // start with simple insert
125 npDistance
[1] = nInsQ0
;
126 npDistance
[2] = nInsQ0
;
127 int nReplacePos
= -1; // tristate flag
129 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++, nDelCnt
+= nDelR0
)
131 if ( cString
[i
-1] == c
)
132 nP
= 0; // Replace from this position is 0
133 // Deletions to match pattern + Replace
134 npDistance
[i
] = nDelCnt
+ nP
;
137 if ( nReplacePos
< 0 && nP
)
138 { // this position will be replaced
142 else if ( nReplacePos
> 0 && !nP
)
145 int nBalance
= levdisbalance( 0, i
-1, c
, cString
, nStringLen
);
147 { // one was replaced that was an insertion instead
154 nSPMin
= std::min({ npDistance
[0], npDistance
[1], npDistance
[2] });
157 // calculate distance matrix
158 sal_Int32 j
= 0; // for all columns of the pattern, till limit is not reached
159 while ( (j
< nPatternLen
-1)
160 && nSPMin
<= (bSplitCount
? 2 * nLimit
: nLimit
) )
163 int nP
, nQ
, nR
, nPij
, d2
;
167 if ( bpPatIsWild
[j
] ) // '*' or '?' not escaped
168 nP
= 0; // could be replaced without penalty
171 if ( c
== '*' && bpPatIsWild
[j
] )
173 nQ
= 0; // insertion and deletion without penalty
178 nQ
= nInsQ0
; // usual weighting
182 // increase insert count to get from null string to pattern
183 npDistance
[0] = npDistance
[0] + nQ
;
184 nSPMin
= npDistance
[0];
185 int nReplacePos
= -1; // tristate flag
186 // for each pattern column run through the string
187 for ( sal_Int32 i
=1; i
<= nStringLen
; i
++ )
189 int d1
= d2
; // WLD( X(i-1), Y(j-1) )
190 d2
= npDistance
[i
]; // WLD( X(i) , Y(j-1) )
191 if ( cString
[i
-1] == c
)
194 if ( nReplacePos
< 0 )
197 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
199 nReplacePos
= 0; // no replacement
204 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205 // WLD( X(i) , Y(j-1) ) + q ,
206 // WLD( X(i-1), Y(j) ) + r )
207 npDistance
[i
] = std::min({ d1
+ nPij
, d2
+ nQ
, npDistance
[i
-1] + nR
});
208 if ( npDistance
[i
] < nSPMin
)
209 nSPMin
= npDistance
[i
];
212 if ( nReplacePos
< 0 && nPij
&& npDistance
[i
] == d1
+ nPij
)
213 { // this position will be replaced
217 else if ( nReplacePos
> 0 && !nPij
)
219 // character is equal in string and pattern
221 // If from this point:
222 // * pattern and string have the same count of this
224 // * and character count is the same before this position
225 // then the replace was none.
227 // Scrambled letters are recognized here and the nRepS
228 // replacement is withdrawn, whereby the double limit kicks
232 int nBalance
= levdisbalance( j
, i
-1, c
, cString
, nStringLen
);
234 { // one was replaced that was an insertion instead
242 if ( (nSPMin
<= nLimit
) && (npDistance
[nStringLen
] <= nLimit
) )
243 return npDistance
[nStringLen
];
248 if ( nRepS
&& nLenDiff
> 0 )
249 nRepS
-= nLenDiff
; // Inserts were counted
250 if ( (nSPMin
<= 2 * nLimit
)
251 && (npDistance
[nStringLen
] <= 2 * nLimit
)
252 && (nRepS
* nRepP0
<= nLimit
) )
253 return -npDistance
[nStringLen
];
260 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
261 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
262 void WLevDistance::CalcLPQR( int nX
, int nY
, int nZ
, bool bRelaxed
)
264 if ( nX
< 0 ) nX
= 0; // only positive values
265 if ( nY
< 0 ) nY
= 0;
266 if ( nZ
< 0 ) nZ
= 0;
267 if (0 == std::min({ nX
, nY
, nZ
})) // at least one 0
270 nMax
= std::max({ nX
, nY
, nZ
}); // either 0 for three 0s or Max
271 if ( 0 == (nMid
= Mid3( nX
, nY
, nZ
)) ) // even two 0
272 nLimit
= nMax
; // either 0 or the only one >0
274 nLimit
= std::lcm( nMid
, nMax
);
276 else // all three of them are not 0
277 nLimit
= std::lcm(std::lcm(nX
, nY
), nZ
);
278 nRepP0
= ( nX
? nLimit
/ nX
: nLimit
+ 1 );
279 nInsQ0
= ( nY
? nLimit
/ nY
: nLimit
+ 1 );
280 nDelR0
= ( nZ
? nLimit
/ nZ
: nLimit
+ 1 );
281 bSplitCount
= bRelaxed
;
284 // The value in the middle
285 int WLevDistance::Mid3( int x
, int y
, int z
)
287 int min
= std::min({ x
, y
, z
});
289 return std::min(y
, z
);
291 return std::min(x
, z
);
293 return std::min(x
, y
);
296 // initialize data from CTOR
297 void WLevDistance::InitData( const sal_Unicode
* cPattern
)
299 cpPattern
= aPatMem
.GetcPtr();
300 bpPatIsWild
= aPatMem
.GetbPtr();
301 npDistance
= aDisMem
.GetPtr();
303 const sal_Unicode
* cp1
= cPattern
;
304 sal_Unicode
* cp2
= cpPattern
;
305 bool* bp
= bpPatIsWild
;
306 // copy pattern, count asterisks, escaped Jokers
309 if ( *cp1
== '\\' ) // maybe escaped
311 if ( *(cp1
+1) == '*' || *(cp1
+1) == '?' ) // next Joker?
318 else if ( *cp1
== '*' || *cp1
== '?' ) // Joker
331 WLevDistance::WLevDistance( const sal_Unicode
* cPattern
,
332 int nOtherX
, int nShorterY
, int nLongerZ
,
334 nPatternLen( Impl_WLD_StringLen(cPattern
) ),
335 aPatMem( nPatternLen
+ 1 ),
336 nArrayLen( nPatternLen
+ 1 ),
339 InitData( cPattern
);
340 CalcLPQR( nOtherX
, nShorterY
, nLongerZ
, bRelaxed
);
344 WLevDistance::WLevDistance( const WLevDistance
& rWLD
) :
345 nPatternLen( rWLD
.nPatternLen
),
346 aPatMem( nPatternLen
+ 1 ),
347 nArrayLen( nPatternLen
+ 1 ),
348 aDisMem( nArrayLen
),
349 nLimit( rWLD
.nLimit
),
350 nRepP0( rWLD
.nRepP0
),
351 nInsQ0( rWLD
.nInsQ0
),
352 nDelR0( rWLD
.nDelR0
),
353 nStars( rWLD
.nStars
),
354 bSplitCount( rWLD
.bSplitCount
)
356 cpPattern
= aPatMem
.GetcPtr();
357 bpPatIsWild
= aPatMem
.GetbPtr();
358 npDistance
= aDisMem
.GetPtr();
360 for ( i
=0; i
<nPatternLen
; i
++ )
362 cpPattern
[i
] = rWLD
.cpPattern
[i
];
363 bpPatIsWild
[i
] = rWLD
.bpPatIsWild
[i
];
369 WLevDistance::~WLevDistance()
373 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */