Version 5.4.3.2, tag libreoffice-5.4.3.2
[LibreOffice.git] / external / hunspell / 0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch
blobff2530cfe23dcd14647c5ac21033c2796bc1633f
1 From cf0967951a25a2daa10a636092193af5c5497aa2 Mon Sep 17 00:00:00 2001
2 From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
3 Date: Fri, 10 Feb 2017 16:36:27 +0000
4 Subject: [PATCH 3/4] hoist string lowering from ngram to ngsuggest
6 only lower when we have to and reuse scratch buffers as
7 tolower destination
9 kcachegrind reports 830,529,143 -> 779,887,690 on
11 echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL
12 ---
13 src/hunspell/suggestmgr.cxx | 143 +++++++++++++++++++++++++++++---------------
14 1 file changed, 95 insertions(+), 48 deletions(-)
16 diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
17 index 54a474f..ea52707 100644
18 --- a/src/hunspell/suggestmgr.cxx
19 +++ b/src/hunspell/suggestmgr.cxx
20 @@ -1075,10 +1075,8 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
21 u8_u16(w_target, target);
24 - std::vector<w_char> w_entry;
25 std::string f;
26 std::vector<w_char> w_f;
27 - std::vector<w_char> w_target2;
29 for (size_t i = 0; i < rHMgr.size(); ++i) {
30 while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
31 @@ -1091,13 +1089,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
32 continue;
34 if (utf8) {
35 - w_entry.clear();
36 - u8_u16(w_entry, HENTRY_WORD(hp));
37 - sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
38 - leftcommonsubstring(w_word, w_entry);
39 + w_f.clear();
40 + u8_u16(w_f, HENTRY_WORD(hp));
42 + int leftcommon = leftcommonsubstring(w_word, w_f);
43 + if (low) {
44 + // lowering dictionary word
45 + mkallsmall_utf(w_f, langnum);
46 + }
47 + sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
48 } else {
49 - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
50 - leftcommonsubstring(word, HENTRY_WORD(hp));
51 + f.assign(HENTRY_WORD(hp));
53 + int leftcommon = leftcommonsubstring(word, f.c_str());
54 + if (low) {
55 + // lowering dictionary word
56 + mkallsmall(f, csconv);
57 + }
58 + sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
61 // check special pronounciation
62 @@ -1108,11 +1117,20 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
63 if (utf8) {
64 w_f.clear();
65 u8_u16(w_f, f);
66 - sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
67 - leftcommonsubstring(w_word, w_f);
69 + int leftcommon = leftcommonsubstring(w_word, w_f);
70 + if (low) {
71 + // lowering dictionary word
72 + mkallsmall_utf(w_f, langnum);
73 + }
74 + sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
75 } else {
76 - sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
77 - leftcommonsubstring(word, f.c_str());
78 + int leftcommon = leftcommonsubstring(word, f.c_str());
79 + if (low) {
80 + // lowering dictionary word
81 + mkallsmall(f, csconv);
82 + }
83 + sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
85 if (sc2 > sc)
86 sc = sc2;
87 @@ -1129,14 +1147,14 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
88 candidate = HENTRY_WORD(hp);
89 mkallcap(candidate, csconv);
91 - std::string target2 = phonet(candidate, *ph);
92 - w_target2.clear();
93 + f = phonet(candidate, *ph);
94 + w_f.clear();
95 if (utf8) {
96 - u8_u16(w_target2, target2);
97 - scphon = 2 * ngram(3, w_target, w_target2,
98 + u8_u16(w_f, f);
99 + scphon = 2 * ngram(3, w_target, w_f,
100 NGRAM_LONGER_WORSE);
101 } else {
102 - scphon = 2 * ngram(3, target, target2,
103 + scphon = 2 * ngram(3, target, f,
104 NGRAM_LONGER_WORSE);
107 @@ -1177,12 +1195,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
108 w_mw[k].l = '*';
109 w_mw[k].h = 0;
111 - thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
113 + if (low) {
114 + // lowering the '*'-masked word w_mw
115 + mkallsmall_utf(w_mw, langnum);
118 + thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH);
119 } else {
120 std::string mw = word;
121 for (int k = sp; k < n; k += 4)
122 mw[k] = '*';
123 - thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
125 + if (low) {
126 + // lowering mw, the '*'-masked copy of word
127 + mkallsmall(mw, csconv);
130 + thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH);
133 thresh = thresh / 3;
134 @@ -1210,7 +1240,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
135 return;
138 - std::vector<w_char> w_glst_word;
139 for (int i = 0; i < MAX_ROOTS; i++) {
140 if (roots[i]) {
141 struct hentry* rp = roots[i];
142 @@ -1225,15 +1254,26 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
144 for (int k = 0; k < nw; k++) {
145 if (utf8) {
146 - w_glst_word.clear();
147 - u8_u16(w_glst_word, glst[k].word);
148 - sc = ngram(n, w_word, w_glst_word,
149 - NGRAM_ANY_MISMATCH + low) +
150 - leftcommonsubstring(w_word, w_glst_word);
151 + w_f.clear();
152 + u8_u16(w_f, glst[k].word);
154 + int leftcommon = leftcommonsubstring(w_word, w_f);
155 + if (low) {
156 + // lowering dictionary word
157 + mkallsmall_utf(w_f, langnum);
160 + sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon;
161 } else {
162 - sc = ngram(n, word, glst[k].word,
163 - NGRAM_ANY_MISMATCH + low) +
164 - leftcommonsubstring(word, glst[k].word);
165 + f = glst[k].word;
167 + int leftcommon = leftcommonsubstring(word, f.c_str());
168 + if (low) {
169 + // lowering dictionary word
170 + mkallsmall(f, csconv);
173 + sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon;
176 if (sc > thresh) {
177 @@ -1318,19 +1358,37 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
178 w_gl.clear();
179 if (utf8) {
180 u8_u16(w_gl, gl);
181 - re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
182 - ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
183 + //w_gl is lowercase already at this point
184 + re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
185 + if (low) {
186 + w_f = w_word;
187 + // lowering a copy of w_word (the second ngram operand here)
188 + mkallsmall_utf(w_f, langnum);
189 + re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
190 + } else {
191 + re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
193 } else {
194 - re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
195 - ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
196 + //gl is lowercase already at this point
197 + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
198 + if (low) {
199 + f = word;
200 + // lowering a copy of word (the second ngram operand here)
201 + mkallsmall(f, csconv);
202 + re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
203 + } else {
204 + re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
208 int ngram_score, leftcommon_score;
209 if (utf8) {
210 - ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
211 + //w_gl is lowercase already at this point
212 + ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH);
213 leftcommon_score = leftcommonsubstring(w_word, w_gl);
214 } else {
215 - ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
216 + //gl is lowercase already at this point
217 + ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH);
218 leftcommon_score = leftcommonsubstring(word, gl.c_str());
220 gscore[i] =
221 @@ -1802,14 +1860,6 @@ int SuggestMgr::ngram(int n,
222 l2 = su2.size();
223 if (l2 == 0)
224 return 0;
225 - // lowering dictionary word
226 - const std::vector<w_char>* p_su2 = &su2;
227 - std::vector<w_char> su2_copy;
228 - if (opt & NGRAM_LOWERING) {
229 - su2_copy = su2;
230 - mkallsmall_utf(su2_copy, langnum);
231 - p_su2 = &su2_copy;
233 for (int j = 1; j <= n; j++) {
234 ns = 0;
235 for (int i = 0; i <= (l1 - j); i++) {
236 @@ -1817,7 +1867,7 @@ int SuggestMgr::ngram(int n,
237 for (int l = 0; l <= (l2 - j); l++) {
238 for (k = 0; k < j; k++) {
239 const w_char& c1 = su1[i + k];
240 - const w_char& c2 = (*p_su2)[l + k];
241 + const w_char& c2 = su2[l + k];
242 if ((c1.l != c2.l) || (c1.h != c2.h))
243 break;
245 @@ -1862,14 +1912,11 @@ int SuggestMgr::ngram(int n,
246 if (l2 == 0)
247 return 0;
248 l1 = s1.size();
249 - std::string t(s2);
250 - if (opt & NGRAM_LOWERING)
251 - mkallsmall(t, csconv);
252 for (int j = 1; j <= n; j++) {
253 ns = 0;
254 for (int i = 0; i <= (l1 - j); i++) {
255 - //t is haystack, s1[i..i+j) is needle
256 - if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
257 + //s2 is haystack, s1[i..i+j) is needle
258 + if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) {
259 ns++;
260 } else if (opt & NGRAM_WEIGHTED) {
261 ns--;
263 2.9.3