tdf#130857 qt weld: Implement QtInstanceWidget::get_text_height
[LibreOffice.git] / i18npool / source / search / levdis.cxx
blobdd9f8fbf587a4596fb305ac11d7bc2a6610faa25
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 Weighted Levenshtein Distance
23 including wildcards
24 '*' for any number (0 or more) of arbitrary characters
25 '?' for exactly one arbitrary character
26 escapable with backslash, "\*" or "\?"
28 Return:
29 WLD if WLD <= nLimit, else nLimit+1
31 or, if bSplitCount:
32 WLD if WLD <= nLimit
33 -WLD if Replace and Insert and Delete <= nLimit
34 else nLimit+1
36 Recursive definition of WLD:
38 WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39 WLD( X(i) , Y(j-1) ) + q ,
40 WLD( X(i-1), Y(j) ) + r )
42 X(i) := the first i characters of the word X
43 Y(j) := the first j characters of the word Y
44 p(i,j) := 0 if i-th character of X == j-th character of Y,
45 p else
47 Boundary conditions:
48 WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
49 WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
50 WLD( X(0), Y(0) ) := 0
52 Instead of recursions a dynamic algorithm is used.
54 See also: German computer magazine
55 c't 07/89 pages 192-208 and c't 03/94 pages 230-239
58 #include <algorithm>
59 #include <numeric>
61 #include "levdis.hxx"
63 #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
64 #define LEVDISDOUBLEBUF 2048 // no doubling atop this border
66 static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
68 const sal_Unicode* pTempStr = pStr;
69 while( *pTempStr )
70 pTempStr++;
71 return static_cast<sal_Int32>(pTempStr-pStr);
74 // Distance from string to pattern
75 int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
77 int nSPMin = 0; // penalty point Minimum
78 int nRepS = 0; // for SplitCount
80 // length difference between pattern and string
81 int nLenDiff = nPatternLen - nStars - nStringLen;
82 // more insertions or deletions necessary as the limit? Then leave
83 if ( (nLenDiff * nInsQ0 > nLimit)
84 || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
85 return LEVDISBIG;
87 // comparative String greater than instantaneous array
88 // -> adapt array size
89 if ( nStringLen >= nArrayLen )
91 // increase size much more to avoid reallocation
92 if ( nStringLen < LEVDISDOUBLEBUF )
93 nArrayLen = 2 * nStringLen;
94 else
95 nArrayLen = nStringLen + 1;
96 npDistance = aDisMem.NewMem( nArrayLen );
99 // Calculate start values of the second column (first pattern value).
100 // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101 // therefore the minimum is 0
102 if ( nPatternLen == 0 )
104 // Count of deletions to reach pattern
105 for ( sal_Int32 i=0; i <= nStringLen; i++ )
106 npDistance[i] = i * nDelR0;
108 else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
110 // instead of a '*' you can fit in anything
111 for ( sal_Int32 i=0; i <= nStringLen; i++ )
112 npDistance[i] = 0;
114 else
116 sal_Unicode c;
117 int nP;
118 c = cpPattern[0];
119 if ( c == '?' && bpPatIsWild[0] )
120 nP = 0; // a '?' could be any character.
121 else
122 // Minimum of replacement and deletion+insertion weighting
123 nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
124 npDistance[0] = nInsQ0; // start with simple insert
125 npDistance[1] = nInsQ0;
126 npDistance[2] = nInsQ0;
127 int nReplacePos = -1; // tristate flag
128 int nDelCnt = 0;
129 for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
131 if ( cString[i-1] == c )
132 nP = 0; // Replace from this position is 0
133 // Deletions to match pattern + Replace
134 npDistance[i] = nDelCnt + nP;
135 if ( bSplitCount )
137 if ( nReplacePos < 0 && nP )
138 { // this position will be replaced
139 nRepS++;
140 nReplacePos = i;
142 else if ( nReplacePos > 0 && !nP )
144 // same count of c
145 int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
146 if ( !nBalance )
147 { // one was replaced that was an insertion instead
148 nRepS--;
149 nReplacePos = 0;
154 nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
157 // calculate distance matrix
158 sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
159 while ( (j < nPatternLen-1)
160 && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
162 sal_Unicode c;
163 int nP, nQ, nR, nPij, d2;
165 j++;
166 c = cpPattern[j];
167 if ( bpPatIsWild[j] ) // '*' or '?' not escaped
168 nP = 0; // could be replaced without penalty
169 else
170 nP = nRepP0;
171 if ( c == '*' && bpPatIsWild[j] )
173 nQ = 0; // insertion and deletion without penalty
174 nR = 0;
176 else
178 nQ = nInsQ0; // usual weighting
179 nR = nDelR0;
181 d2 = npDistance[0];
182 // increase insert count to get from null string to pattern
183 npDistance[0] = npDistance[0] + nQ;
184 nSPMin = npDistance[0];
185 int nReplacePos = -1; // tristate flag
186 // for each pattern column run through the string
187 for ( sal_Int32 i=1; i <= nStringLen; i++ )
189 int d1 = d2; // WLD( X(i-1), Y(j-1) )
190 d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
191 if ( cString[i-1] == c )
193 nPij = 0; // p(i,j)
194 if ( nReplacePos < 0 )
196 // same count of c
197 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
198 if ( !nBalance )
199 nReplacePos = 0; // no replacement
202 else
203 nPij = nP;
204 // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205 // WLD( X(i) , Y(j-1) ) + q ,
206 // WLD( X(i-1), Y(j) ) + r )
207 npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
208 if ( npDistance[i] < nSPMin )
209 nSPMin = npDistance[i];
210 if ( bSplitCount )
212 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
213 { // this position will be replaced
214 nRepS++;
215 nReplacePos = i;
217 else if ( nReplacePos > 0 && !nPij )
219 // character is equal in string and pattern
221 // If from this point:
222 // * pattern and string have the same count of this
223 // character
224 // * and character count is the same before this position
225 // then the replace was none.
227 // Scrambled letters are recognized here and the nRepS
228 // replacement is withdrawn, whereby the double limit kicks
229 // in.
231 // Same count of c
232 int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
233 if ( !nBalance )
234 { // one was replaced that was an insertion instead
235 nRepS--;
236 nReplacePos = 0;
242 if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
243 return npDistance[nStringLen];
244 else
246 if ( bSplitCount )
248 if ( nRepS && nLenDiff > 0 )
249 nRepS -= nLenDiff; // Inserts were counted
250 if ( (nSPMin <= 2 * nLimit)
251 && (npDistance[nStringLen] <= 2 * nLimit)
252 && (nRepS * nRepP0 <= nLimit) )
253 return -npDistance[nStringLen];
254 return LEVDISBIG;
256 return LEVDISBIG;
260 // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
261 // from user values nOtherX, nShorterY, nLongerZ, bRelaxed
262 void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
264 if ( nX < 0 ) nX = 0; // only positive values
265 if ( nY < 0 ) nY = 0;
266 if ( nZ < 0 ) nZ = 0;
267 if (0 == std::min({ nX, nY, nZ })) // at least one 0
269 int nMid, nMax;
270 nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
271 if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
272 nLimit = nMax; // either 0 or the only one >0
273 else // one is 0
274 nLimit = std::lcm( nMid, nMax );
276 else // all three of them are not 0
277 nLimit = std::lcm(std::lcm(nX, nY), nZ);
278 nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
279 nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
280 nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
281 bSplitCount = bRelaxed;
284 // The value in the middle
285 int WLevDistance::Mid3( int x, int y, int z )
287 int min = std::min({ x, y, z });
288 if ( x == min )
289 return std::min(y, z);
290 else if ( y == min )
291 return std::min(x, z);
292 else // z == min
293 return std::min(x, y);
296 // initialize data from CTOR
297 void WLevDistance::InitData( const sal_Unicode* cPattern )
299 cpPattern = aPatMem.GetcPtr();
300 bpPatIsWild = aPatMem.GetbPtr();
301 npDistance = aDisMem.GetPtr();
302 nStars = 0;
303 const sal_Unicode* cp1 = cPattern;
304 sal_Unicode* cp2 = cpPattern;
305 bool* bp = bpPatIsWild;
306 // copy pattern, count asterisks, escaped Jokers
307 while ( *cp1 )
309 if ( *cp1 == '\\' ) // maybe escaped
311 if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
313 cp1++; // skip '\\'
314 nPatternLen--;
316 *bp++ = false;
318 else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
320 if ( *cp1 == '*' )
321 nStars++;
322 *bp++ = true;
324 else
325 *bp++ = false;
326 *cp2++ = *cp1++;
328 *cp2 = '\0';
331 WLevDistance::WLevDistance( const sal_Unicode* cPattern,
332 int nOtherX, int nShorterY, int nLongerZ,
333 bool bRelaxed ) :
334 nPatternLen( Impl_WLD_StringLen(cPattern) ),
335 aPatMem( nPatternLen + 1 ),
336 nArrayLen( nPatternLen + 1 ),
337 aDisMem( nArrayLen )
339 InitData( cPattern );
340 CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
343 // CopyCTor
344 WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
345 nPatternLen( rWLD.nPatternLen ),
346 aPatMem( nPatternLen + 1 ),
347 nArrayLen( nPatternLen + 1 ),
348 aDisMem( nArrayLen ),
349 nLimit( rWLD.nLimit ),
350 nRepP0( rWLD.nRepP0 ),
351 nInsQ0( rWLD.nInsQ0 ),
352 nDelR0( rWLD.nDelR0 ),
353 nStars( rWLD.nStars ),
354 bSplitCount( rWLD.bSplitCount )
356 cpPattern = aPatMem.GetcPtr();
357 bpPatIsWild = aPatMem.GetbPtr();
358 npDistance = aDisMem.GetPtr();
359 sal_Int32 i;
360 for ( i=0; i<nPatternLen; i++ )
362 cpPattern[i] = rWLD.cpPattern[i];
363 bpPatIsWild[i] = rWLD.bpPatIsWild[i];
365 cpPattern[i] = '\0';
368 // DTor
369 WLevDistance::~WLevDistance()
373 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */