update emoji autocorrect entries from po-files
[LibreOffice.git] / i18npool / source / search / levdis.cxx
blobc0dc37a0ee8055b73c11709d5917aa0637130cf0
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
23 including wildcards
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapeable with backslash, "\*" or "\?"
28 Return:
29 WLD if WLD <= nLimit, else nLimit+1
31 or, if bSplitCount:
32 WLD if WLD <= nLimit
33 -WLD if Replace and Insert and Delete <= nLimit
34 else nLimit+1
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
45 p else
47 Boundary conditions:
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
58 #include <string.h>
60 #if defined( _MSC_VER )
61 #pragma warning(once: 4068)
62 #endif
64 #include "levdis.hxx"
66 #ifdef SOLARIS
67 #undef min
68 #endif
70 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
71 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
73 static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
75 const sal_Unicode* pTempStr = pStr;
76 while( *pTempStr )
77 pTempStr++;
78 return (sal_Int32)(pTempStr-pStr);
81 // Distance from string to pattern
82 int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
84 int nSPMin = 0; // penalty point Minimum
85 int nRepS = 0; // for SplitCount
87 // length difference between pattern and string
88 int nLenDiff = nPatternLen - nStars - nStringLen;
89 // more insertions or deletions necessary as the limit? Then leave
90 if ( (nLenDiff * nInsQ0 > nLimit)
91 || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
92 return LEVDISBIG;
94 // comparative String greater than instantaneous array
95 // -> adapt array size
96 if ( nStringLen >= nArrayLen )
98 // increase size much more to avoid reallocation
99 if ( nStringLen < LEVDISDOUBLEBUF )
100 nArrayLen = 2 * nStringLen;
101 else
102 nArrayLen = nStringLen + 1;
103 npDistance = aDisMem.NewMem( nArrayLen );
106 // Calculate start values of the second column (first pattern value).
107 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
108 // therefore the minimum is 0
109 if ( nPatternLen == 0 )
111 // Count of deletions to reach pattern
112 for ( sal_Int32 i=0; i <= nStringLen; i++ )
113 npDistance[i] = i * nDelR0;
115 else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
117 // instead of a '*' you can fit in anything
118 for ( sal_Int32 i=0; i <= nStringLen; i++ )
119 npDistance[i] = 0;
121 else
123 sal_Unicode c;
124 int nP;
125 c = cpPattern[0];
126 if ( c == '?' && bpPatIsWild[0] )
127 nP = 0; // a '?' could be any character.
128 else
129 // Minimum of replacement and deletion+insertion weighting
130 nP = Min3( nRepP0, nRepP0, nDelR0 + nInsQ0 );
131 npDistance[0] = nInsQ0; // start with simple insert
132 npDistance[1] = nInsQ0;
133 npDistance[2] = nInsQ0;
134 int nReplacePos = -1; // tristate flag
135 int nDelCnt = 0;
136 for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
138 if ( cString[i-1] == c )
139 nP = 0; // Replace from this position is 0
140 // Deletions to match pattern + Replace
141 npDistance[i] = nDelCnt + nP;
142 if ( bSplitCount )
144 if ( nReplacePos < 0 && nP )
145 { // this position will be replaced
146 nRepS++;
147 nReplacePos = i;
149 else if ( nReplacePos > 0 && !nP )
151 // same count of c
152 int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
153 if ( !nBalance )
154 { // one was replaced that was an insertion instead
155 nRepS--;
156 nReplacePos = 0;
161 nSPMin = Min3( npDistance[0], npDistance[1], npDistance[2] );
164 // calculate distance matrix
165 sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
166 while ( (j < nPatternLen-1)
167 && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
169 sal_Unicode c;
170 int nP, nQ, nR, nPij, d2;
172 j++;
173 c = cpPattern[j];
174 if ( bpPatIsWild[j] ) // '*' or '?' not escaped
175 nP = 0; // could be replaced without penalty
176 else
177 nP = nRepP0;
178 if ( c == '*' && bpPatIsWild[j] )
180 nQ = 0; // instertion and deletion without penalty
181 nR = 0;
183 else
185 nQ = nInsQ0; // usual weighting
186 nR = nDelR0;
188 d2 = npDistance[0];
189 // increase insert count to get from null string to pattern
190 npDistance[0] = npDistance[0] + nQ;
191 nSPMin = npDistance[0];
192 int nReplacePos = -1; // tristate flag
193 // for each pattern column run through the string
194 for ( sal_Int32 i=1; i <= nStringLen; i++ )
196 int d1 = d2; // WLD( X(i-1), Y(j-1) )
197 d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
198 if ( cString[i-1] == c )
200 nPij = 0; // p(i,j)
201 if ( nReplacePos < 0 )
203 // same count of c
204 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
205 if ( !nBalance )
206 nReplacePos = 0; // no replacement
209 else
210 nPij = nP;
211 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
212 // WLD( X(i) , Y(j-1) ) + q ,
213 // WLD( X(i-1), Y(j) ) + r )
214 npDistance[i] = Min3( d1 + nPij, d2 + nQ, npDistance[i-1] + nR );
215 if ( npDistance[i] < nSPMin )
216 nSPMin = npDistance[i];
217 if ( bSplitCount )
219 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
220 { // this position will be replaced
221 nRepS++;
222 nReplacePos = i;
224 else if ( nReplacePos > 0 && !nPij )
226 // character is equal in string and pattern
228 // If from this point:
229 // * pattern and string have the same count of this
230 // character
231 // * and character count is the same before this position
232 // then the replace was none.
234 // Scrambled letters are recognized here and the nRepS
235 // replacement is withdrawn, whereby the double limit kicks
236 // in.
238 // Same count of c
239 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
240 if ( !nBalance )
241 { // one was replaced that was an insertion instead
242 nRepS--;
243 nReplacePos = 0;
249 if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
250 return npDistance[nStringLen];
251 else
253 if ( bSplitCount )
255 if ( nRepS && nLenDiff > 0 )
256 nRepS -= nLenDiff; // Inserts were counted
257 if ( (nSPMin <= 2 * nLimit)
258 && (npDistance[nStringLen] <= 2 * nLimit)
259 && (nRepS * nRepP0 <= nLimit) )
260 return -npDistance[nStringLen];
261 return LEVDISBIG;
263 return LEVDISBIG;
267 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
268 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
269 int WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
271 if ( nX < 0 ) nX = 0; // only positive values
272 if ( nY < 0 ) nY = 0;
273 if ( nZ < 0 ) nZ = 0;
274 if (0 == Min3( nX, nY, nZ )) // at least one 0
276 int nMid, nMax;
277 nMax = Max3( nX, nY, nZ ); // either 0 for three 0s or Max
278 if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
279 nLimit = nMax; // either 0 or the only one >0
280 else // one is 0
281 nLimit = LCM( nMid, nMax );
283 else // all three of them are not 0
284 nLimit = LCM( LCM( nX, nY ), nZ );
285 nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
286 nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
287 nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
288 bSplitCount = bRelaxed;
289 return nLimit;
292 // greatest common divisior according to Euklid (chaindivision)
293 // special case: 0 plus anything produces 1
294 int WLevDistance::GCD( int a, int b )
296 if ( !a || !b )
297 return 1;
298 if ( a < 0 ) a = -a;
299 if ( b < 0 ) b = -b;
302 if ( a > b )
303 a -= int(a / b) * b;
304 else
305 b -= int(b / a) * a;
306 } while ( a && b );
307 return( a ? a : b);
310 // least common multiple : a * b / GCD(a,b)
311 int WLevDistance::LCM( int a, int b )
313 if ( a > b ) // decrease owerflow chance
314 return( (a / GCD(a,b)) * b );
315 else
316 return( (b / GCD(a,b)) * a );
319 // Minimum of three values
320 inline int WLevDistance::Min3( int x, int y, int z )
322 if ( x < y )
323 return( x < z ? x : z );
324 else
325 return( y < z ? y : z );
328 // The value in the middle
329 int WLevDistance::Mid3( int x, int y, int z )
331 int min = Min3(x,y,z);
332 if ( x == min )
333 return( y < z ? y : z);
334 else if ( y == min )
335 return( x < z ? x : z);
336 else // z == min
337 return( x < y ? x : y);
340 // Maximum of three values
341 int WLevDistance::Max3( int x, int y, int z )
343 if ( x > y )
344 return( x > z ? x : z );
345 else
346 return( y > z ? y : z );
349 // initialize data from CTOR
350 void WLevDistance::InitData( const sal_Unicode* cPattern )
352 cpPattern = aPatMem.GetcPtr();
353 bpPatIsWild = aPatMem.GetbPtr();
354 npDistance = aDisMem.GetPtr();
355 nStars = 0;
356 const sal_Unicode* cp1 = cPattern;
357 sal_Unicode* cp2 = cpPattern;
358 bool* bp = bpPatIsWild;
359 // copy pattern, count asterisks, escaped Jokers
360 while ( *cp1 )
362 if ( *cp1 == '\\' ) // maybe escaped
364 if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
366 cp1++; // skip '\\'
367 nPatternLen--;
369 *bp++ = false;
371 else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
373 if ( *cp1 == '*' )
374 nStars++;
375 *bp++ = true;
377 else
378 *bp++ = false;
379 *cp2++ = *cp1++;
381 *cp2 = '\0';
384 WLevDistance::WLevDistance( const sal_Unicode* cPattern,
385 int nOtherX, int nShorterY, int nLongerZ,
386 bool bRelaxed ) :
387 nPatternLen( Impl_WLD_StringLen(cPattern) ),
388 aPatMem( nPatternLen + 1 ),
389 nArrayLen( nPatternLen + 1 ),
390 aDisMem( nArrayLen )
392 InitData( cPattern );
393 CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
396 // CopyCTor
397 WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
398 nPatternLen( rWLD.nPatternLen ),
399 aPatMem( nPatternLen + 1 ),
400 nArrayLen( nPatternLen + 1 ),
401 aDisMem( nArrayLen ),
402 nLimit( rWLD.nLimit ),
403 nRepP0( rWLD.nRepP0 ),
404 nInsQ0( rWLD.nInsQ0 ),
405 nDelR0( rWLD.nDelR0 ),
406 nStars( rWLD.nStars ),
407 bSplitCount( rWLD.bSplitCount )
409 cpPattern = aPatMem.GetcPtr();
410 bpPatIsWild = aPatMem.GetbPtr();
411 npDistance = aDisMem.GetPtr();
412 sal_Int32 i;
413 for ( i=0; i<nPatternLen; i++ )
415 cpPattern[i] = rWLD.cpPattern[i];
416 bpPatIsWild[i] = rWLD.bpPatIsWild[i];
418 cpPattern[i] = '\0';
421 // DTor
422 WLevDistance::~WLevDistance()
426 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */