1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
21 #define INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
23 #include <sal/types.h>
26 // Sensible default values for a user interface could be:
27 // LEVDISDEFAULT_XOTHER 2
28 // Maximum X replacements to match query, found data may be different by X
29 // characters from query.
30 // LEVDISDEFAULT_YSHORTER 1
31 // Maximum Y insertions to match query, found data may be Y characters
32 // shorter than query.
33 // LEVDISDEFAULT_ZLONGER 3
34 // Maximum Z deletions to match query, found data may be Z characters
36 // LEVDISDEFAULT_RELAXED TRUE
37 // Use relaxed SplitCount instead of mathematical WLD.
39 // Joker/wildcards ('?' and '*') of course do not count as
40 // replacement/insertion/deletion. At a '?' a replacement is not counted, for a
41 // '*' the found data may be any number of characters longer than the query.
43 // Strict mathematical WLD: EITHER maximum X replacements OR Y characters
44 // shorter OR Z characters longer.
45 // Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
46 // Z characters longer. Any combination of actions is valid.
48 // The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
49 // integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
51 // The corresponding internal default weigh values for these user interface
53 // LEVDISDEFAULTLIMIT 6
54 // Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
56 // Default nRepP0, weight of replacements.
58 // Default nInsQ0, weight of insertions.
60 // Default nDelR0, weight of deletions.
62 // The transformation of user input values to weighs is done using CalcLPQR().
63 // One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
64 // characters longer than pattern) then no character can be replaced any more.
65 // This can be circumvented by increasing nX or/and nZ, but of course with the
66 // side effect of being less strict then... or the other solution is to use
67 // relaxed SplitCount (see below), which is the default when using CalcLPQR().
69 // Attention: shorter = WLD.Insert, longer = WLD.Delete
71 // View and counting is always from data string to pattern, a deletion means
72 // that something is deleted from data to match pattern.
74 // Deletions weigh less in this example because usually less is known than is
75 // searched for. Replacements get middle weight, for example because of
76 // misspellings. Insertions are expensive.
78 // Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
79 // Allowed are maximum 4 replacements, no insertion, no deletion.
80 // Matches the user interface values X = 3, Y = 0, Z = 0
82 // bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
83 // of WLD() then isn't necessarily the Levenshtein-Distance, but can be
84 // negative (-WLD) if the WLD is greater than nLimit but single values are
86 // For the above default values that could mean: even if the found string is
87 // already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
88 // to reach pattern. Additionally, character swaps count as one replacement.
89 // Mathematically completely incorrect, but meets user expectations ;-)
91 // Explanation: in the real WLD all actions are withdrawn from a common 100%
92 // pool, if one gets all there's nothing left for others. With bSplitCount
93 // replacements have their own pool.
96 /** "Safe" memory allocation in ctor */
97 class WLevDisPatternMem
99 std::unique_ptr
<sal_Unicode
[]> cp
;
100 std::unique_ptr
<bool[]> bp
;
102 explicit WLevDisPatternMem( sal_Int32 s
)
103 : cp(new sal_Unicode
[s
])
108 sal_Unicode
* GetcPtr() const { return cp
.get(); }
109 bool* GetbPtr() const { return bp
.get(); }
112 class WLevDisDistanceMem
114 std::unique_ptr
<int[]> p
;
116 explicit WLevDisDistanceMem( size_t s
)
120 int* GetPtr() const { return p
.get(); }
121 int* NewMem( size_t s
)
123 p
.reset(new int[ s
<3 ? 3 : s
]);
128 /** Weighted Levenshtein Distance (WLD)
130 For a more detailed explanation see documentation in
131 i18npool/source/search/levdis.hxx
135 sal_Int32 nPatternLen
; ///< length of pattern
136 WLevDisPatternMem aPatMem
; ///< manage allocation of pattern array
137 sal_Unicode
* cpPattern
; ///< pointer to pattern array
138 bool* bpPatIsWild
; ///< pointer to bool array whether pattern is wildcard
139 sal_Int32 nArrayLen
; ///< length of distance array
140 WLevDisDistanceMem aDisMem
; ///< manage allocation of distance array
141 int* npDistance
; ///< pointer to distance array
142 int nLimit
; ///< WLD limit replacements/insertions/deletions
143 int nRepP0
; ///< replacement weigh
144 int nInsQ0
; ///< insertion weigh
145 int nDelR0
; ///< deletion weigh
146 int nStars
; ///< count of '*' wildcards in pattern
147 bool bSplitCount
; ///< if TRUE, Rep/Ins/Del are counted separately
149 void InitData( const sal_Unicode
* cPattern
);
150 static inline int Min3( int x
, int y
, int z
); ///< minimum value of 3 values
151 static int Mid3( int x
, int y
, int z
); ///< middle value of 3 values
152 static int Max3( int x
, int y
, int z
); ///< maximum value of 3 values
153 static int GCD( int a
, int b
); ///< Greatest Common Divisor
154 static int LCM( int a
, int b
); ///< Least Common Multiple
158 /** CTor with user input. Internally calls CalcLPQR().
160 After this, obtain the resulting limit using GetLimit().
162 @param bRelaxed the mathematically incorrect method is default (TRUE)
164 WLevDistance( const sal_Unicode
* cPattern
, int nOtherX
, int nShorterY
,
165 int nLongerZ
, bool bRelaxed
);
167 WLevDistance( const WLevDistance
& rWLD
);
170 /** Calculate the Weighted Levenshtein Distance from string to pattern. */
171 int WLD( const sal_Unicode
* cString
, sal_Int32 nStringLen
);
173 /** Calculate the internal weighs corresponding to the user input values.
174 @returns nLimit for later comparison with WLD()
176 void CalcLPQR( int nOtherX
, int nShorterY
, int nLongerZ
,
179 int GetLimit() const { return nLimit
; }
181 // Calculate current balance, keep this inline for performance reasons!
182 // c == cpPattern[jj] == cString[ii]
183 // First seek up to found place, if the balance is still equal there then
184 // also compare after the found place.
185 int levdisbalance(sal_Int32 jj
, sal_Int32 ii
, sal_Unicode c
, const sal_Unicode
* cString
, sal_Int32 nStringLen
)
193 for ( k
=0; k
< jj
; k
++ )
194 if ( cpPattern
[k
] == c
)
197 for ( k
=0; k
< ii
; k
++ )
198 if ( cString
[k
] == c
)
202 for ( k
=jj
+1; k
< nPatternLen
; k
++ )
203 if ( cpPattern
[k
] == c
)
205 for ( k
=ii
+1; k
< nStringLen
; k
++ )
206 if ( cString
[k
] == c
)
218 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */