Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / search / levdis.cxx
blob5842abd1eef5c012eb9cc5d29b0401b5905c9c71
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
23 including wildcards
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapable with backslash, "\*" or "\?"
28 Return:
29 WLD if WLD <= nLimit, else nLimit+1
31 or, if bSplitCount:
32 WLD if WLD <= nLimit
33 -WLD if Replace and Insert and Delete <= nLimit
34 else nLimit+1
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
45 p else
47 Boundary conditions:
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
58 #include <algorithm>
60 #include "levdis.hxx"
62 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
63 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
65 static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
67 const sal_Unicode* pTempStr = pStr;
68 while( *pTempStr )
69 pTempStr++;
70 return static_cast<sal_Int32>(pTempStr-pStr);
73 // Distance from string to pattern
74 int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
76 int nSPMin = 0; // penalty point Minimum
77 int nRepS = 0; // for SplitCount
79 // length difference between pattern and string
80 int nLenDiff = nPatternLen - nStars - nStringLen;
81 // more insertions or deletions necessary as the limit? Then leave
82 if ( (nLenDiff * nInsQ0 > nLimit)
83 || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
84 return LEVDISBIG;
86 // comparative String greater than instantaneous array
87 // -> adapt array size
88 if ( nStringLen >= nArrayLen )
90 // increase size much more to avoid reallocation
91 if ( nStringLen < LEVDISDOUBLEBUF )
92 nArrayLen = 2 * nStringLen;
93 else
94 nArrayLen = nStringLen + 1;
95 npDistance = aDisMem.NewMem( nArrayLen );
98 // Calculate start values of the second column (first pattern value).
99 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
100 // therefore the minimum is 0
101 if ( nPatternLen == 0 )
103 // Count of deletions to reach pattern
104 for ( sal_Int32 i=0; i <= nStringLen; i++ )
105 npDistance[i] = i * nDelR0;
107 else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
109 // instead of a '*' you can fit in anything
110 for ( sal_Int32 i=0; i <= nStringLen; i++ )
111 npDistance[i] = 0;
113 else
115 sal_Unicode c;
116 int nP;
117 c = cpPattern[0];
118 if ( c == '?' && bpPatIsWild[0] )
119 nP = 0; // a '?' could be any character.
120 else
121 // Minimum of replacement and deletion+insertion weighting
122 nP = Min3( nRepP0, nRepP0, nDelR0 + nInsQ0 );
123 npDistance[0] = nInsQ0; // start with simple insert
124 npDistance[1] = nInsQ0;
125 npDistance[2] = nInsQ0;
126 int nReplacePos = -1; // tristate flag
127 int nDelCnt = 0;
128 for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
130 if ( cString[i-1] == c )
131 nP = 0; // Replace from this position is 0
132 // Deletions to match pattern + Replace
133 npDistance[i] = nDelCnt + nP;
134 if ( bSplitCount )
136 if ( nReplacePos < 0 && nP )
137 { // this position will be replaced
138 nRepS++;
139 nReplacePos = i;
141 else if ( nReplacePos > 0 && !nP )
143 // same count of c
144 int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
145 if ( !nBalance )
146 { // one was replaced that was an insertion instead
147 nRepS--;
148 nReplacePos = 0;
153 nSPMin = Min3( npDistance[0], npDistance[1], npDistance[2] );
156 // calculate distance matrix
157 sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
158 while ( (j < nPatternLen-1)
159 && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
161 sal_Unicode c;
162 int nP, nQ, nR, nPij, d2;
164 j++;
165 c = cpPattern[j];
166 if ( bpPatIsWild[j] ) // '*' or '?' not escaped
167 nP = 0; // could be replaced without penalty
168 else
169 nP = nRepP0;
170 if ( c == '*' && bpPatIsWild[j] )
172 nQ = 0; // insertion and deletion without penalty
173 nR = 0;
175 else
177 nQ = nInsQ0; // usual weighting
178 nR = nDelR0;
180 d2 = npDistance[0];
181 // increase insert count to get from null string to pattern
182 npDistance[0] = npDistance[0] + nQ;
183 nSPMin = npDistance[0];
184 int nReplacePos = -1; // tristate flag
185 // for each pattern column run through the string
186 for ( sal_Int32 i=1; i <= nStringLen; i++ )
188 int d1 = d2; // WLD( X(i-1), Y(j-1) )
189 d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
190 if ( cString[i-1] == c )
192 nPij = 0; // p(i,j)
193 if ( nReplacePos < 0 )
195 // same count of c
196 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
197 if ( !nBalance )
198 nReplacePos = 0; // no replacement
201 else
202 nPij = nP;
203 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
204 // WLD( X(i) , Y(j-1) ) + q ,
205 // WLD( X(i-1), Y(j) ) + r )
206 npDistance[i] = Min3( d1 + nPij, d2 + nQ, npDistance[i-1] + nR );
207 if ( npDistance[i] < nSPMin )
208 nSPMin = npDistance[i];
209 if ( bSplitCount )
211 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
212 { // this position will be replaced
213 nRepS++;
214 nReplacePos = i;
216 else if ( nReplacePos > 0 && !nPij )
218 // character is equal in string and pattern
220 // If from this point:
221 // * pattern and string have the same count of this
222 // character
223 // * and character count is the same before this position
224 // then the replace was none.
226 // Scrambled letters are recognized here and the nRepS
227 // replacement is withdrawn, whereby the double limit kicks
228 // in.
230 // Same count of c
231 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
232 if ( !nBalance )
233 { // one was replaced that was an insertion instead
234 nRepS--;
235 nReplacePos = 0;
241 if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
242 return npDistance[nStringLen];
243 else
245 if ( bSplitCount )
247 if ( nRepS && nLenDiff > 0 )
248 nRepS -= nLenDiff; // Inserts were counted
249 if ( (nSPMin <= 2 * nLimit)
250 && (npDistance[nStringLen] <= 2 * nLimit)
251 && (nRepS * nRepP0 <= nLimit) )
252 return -npDistance[nStringLen];
253 return LEVDISBIG;
255 return LEVDISBIG;
259 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
260 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
261 void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
263 if ( nX < 0 ) nX = 0; // only positive values
264 if ( nY < 0 ) nY = 0;
265 if ( nZ < 0 ) nZ = 0;
266 if (0 == Min3( nX, nY, nZ )) // at least one 0
268 int nMid, nMax;
269 nMax = Max3( nX, nY, nZ ); // either 0 for three 0s or Max
270 if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
271 nLimit = nMax; // either 0 or the only one >0
272 else // one is 0
273 nLimit = LCM( nMid, nMax );
275 else // all three of them are not 0
276 nLimit = LCM( LCM( nX, nY ), nZ );
277 nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
278 nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
279 nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
280 bSplitCount = bRelaxed;
283 // greatest common divisor according to Euklid (chaindivision)
284 // special case: 0 plus anything produces 1
285 int WLevDistance::GCD( int a, int b )
287 if ( !a || !b )
288 return 1;
289 if ( a < 0 ) a = -a;
290 if ( b < 0 ) b = -b;
293 if ( a > b )
294 a -= int(a / b) * b;
295 else
296 b -= int(b / a) * a;
297 } while ( a && b );
298 return( a ? a : b);
301 // least common multiple : a * b / GCD(a,b)
302 int WLevDistance::LCM( int a, int b )
304 if ( a > b ) // decrease overflow chance
305 return( (a / GCD(a,b)) * b );
306 else
307 return( (b / GCD(a,b)) * a );
310 // Minimum of three values
311 inline int WLevDistance::Min3( int x, int y, int z )
313 if ( x < y )
314 return std::min(x, z);
315 else
316 return std::min(y, z);
319 // The value in the middle
320 int WLevDistance::Mid3( int x, int y, int z )
322 int min = Min3(x,y,z);
323 if ( x == min )
324 return std::min(y, z);
325 else if ( y == min )
326 return std::min(x, z);
327 else // z == min
328 return std::min(x, y);
331 // Maximum of three values
332 int WLevDistance::Max3( int x, int y, int z )
334 if ( x > y )
335 return std::max(x, z);
336 else
337 return std::max(y, z);
340 // initialize data from CTOR
341 void WLevDistance::InitData( const sal_Unicode* cPattern )
343 cpPattern = aPatMem.GetcPtr();
344 bpPatIsWild = aPatMem.GetbPtr();
345 npDistance = aDisMem.GetPtr();
346 nStars = 0;
347 const sal_Unicode* cp1 = cPattern;
348 sal_Unicode* cp2 = cpPattern;
349 bool* bp = bpPatIsWild;
350 // copy pattern, count asterisks, escaped Jokers
351 while ( *cp1 )
353 if ( *cp1 == '\\' ) // maybe escaped
355 if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
357 cp1++; // skip '\\'
358 nPatternLen--;
360 *bp++ = false;
362 else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
364 if ( *cp1 == '*' )
365 nStars++;
366 *bp++ = true;
368 else
369 *bp++ = false;
370 *cp2++ = *cp1++;
372 *cp2 = '\0';
375 WLevDistance::WLevDistance( const sal_Unicode* cPattern,
376 int nOtherX, int nShorterY, int nLongerZ,
377 bool bRelaxed ) :
378 nPatternLen( Impl_WLD_StringLen(cPattern) ),
379 aPatMem( nPatternLen + 1 ),
380 nArrayLen( nPatternLen + 1 ),
381 aDisMem( nArrayLen )
383 InitData( cPattern );
384 CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
387 // CopyCTor
388 WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
389 nPatternLen( rWLD.nPatternLen ),
390 aPatMem( nPatternLen + 1 ),
391 nArrayLen( nPatternLen + 1 ),
392 aDisMem( nArrayLen ),
393 nLimit( rWLD.nLimit ),
394 nRepP0( rWLD.nRepP0 ),
395 nInsQ0( rWLD.nInsQ0 ),
396 nDelR0( rWLD.nDelR0 ),
397 nStars( rWLD.nStars ),
398 bSplitCount( rWLD.bSplitCount )
400 cpPattern = aPatMem.GetcPtr();
401 bpPatIsWild = aPatMem.GetbPtr();
402 npDistance = aDisMem.GetPtr();
403 sal_Int32 i;
404 for ( i=0; i<nPatternLen; i++ )
406 cpPattern[i] = rWLD.cpPattern[i];
407 bpPatIsWild[i] = rWLD.bpPatIsWild[i];
409 cpPattern[i] = '\0';
412 // DTor
413 WLevDistance::~WLevDistance()
417 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */