external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch

   1 From cf0967951a25a2daa10a636092193af5c5497aa2 Mon Sep 17 00:00:00 2001
   2 From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
   3 Date: Fri, 10 Feb 2017 16:36:27 +0000
   4 Subject: [PATCH 3/4] hoist string lowering from ngram to ngsuggest
   5
   6 Only lowercase strings when we have to, and reuse the existing scratch
   7 buffers as the destination of the lowercasing.
   8
   9 kcachegrind reports 830,529,143 -> 779,887,690 on
  10
  11 echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL
  12 ---
  13  src/hunspell/suggestmgr.cxx | 143 +++++++++++++++++++++++++++++---------------
  14  1 file changed, 95 insertions(+), 48 deletions(-)
  15
  16 diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
  17 index 54a474f..ea52707 100644
  18 --- a/src/hunspell/suggestmgr.cxx
  19 +++ b/src/hunspell/suggestmgr.cxx
  20 @@ -1075,10 +1075,8 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  21      u8_u16(w_target, target);
  22    }
  23
  24 -  std::vector<w_char> w_entry;
  25    std::string f;
  26    std::vector<w_char> w_f;
  27 -  std::vector<w_char> w_target2;
  28
  29    for (size_t i = 0; i < rHMgr.size(); ++i) {
  30      while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
  31 @@ -1091,13 +1089,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  32          continue;
  33
  34        if (utf8) {
  35 -        w_entry.clear();
  36 -        u8_u16(w_entry, HENTRY_WORD(hp));
  37 -        sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
  38 -             leftcommonsubstring(w_word, w_entry);
  39 +        w_f.clear();
  40 +        u8_u16(w_f, HENTRY_WORD(hp));
  41 +
  42 +        int leftcommon = leftcommonsubstring(w_word, w_f);
  43 +        if (low) {
  44 +          // lowering dictionary word
  45 +          mkallsmall_utf(w_f, langnum);
  46 +        }
  47 +        sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
  48        } else {
  49 -        sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
  50 -             leftcommonsubstring(word, HENTRY_WORD(hp));
  51 +        f.assign(HENTRY_WORD(hp));
  52 +
  53 +        int leftcommon = leftcommonsubstring(word, f.c_str());
  54 +        if (low) {
  55 +          // lowering dictionary word
  56 +          mkallsmall(f, csconv);
  57 +        }
  58 +        sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
  59        }
  60
  61        // check special pronounciation
  62 @@ -1108,11 +1117,20 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  63          if (utf8) {
  64            w_f.clear();
  65            u8_u16(w_f, f);
  66 -          sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
  67 -                leftcommonsubstring(w_word, w_f);
  68 +
  69 +          int leftcommon = leftcommonsubstring(w_word, w_f);
  70 +          if (low) {
  71 +            // lowering dictionary word
  72 +            mkallsmall_utf(w_f, langnum);
  73 +          }
  74 +          sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
  75          } else {
  76 -          sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
  77 -                leftcommonsubstring(word, f.c_str());
  78 +          int leftcommon = leftcommonsubstring(word, f.c_str());
  79 +          if (low) {
  80 +            // lowering dictionary word
  81 +            mkallsmall(f, csconv);
  82 +          }
  83 +          sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
  84          }
  85          if (sc2 > sc)
  86            sc = sc2;
  87 @@ -1129,14 +1147,14 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  88            candidate = HENTRY_WORD(hp);
  89            mkallcap(candidate, csconv);
  90          }
  91 -        std::string target2 = phonet(candidate, *ph);
  92 -        w_target2.clear();
  93 +        f = phonet(candidate, *ph);
  94 +        w_f.clear();
  95          if (utf8) {
  96 -          u8_u16(w_target2, target2);
  97 -          scphon = 2 * ngram(3, w_target, w_target2,
  98 +          u8_u16(w_f, f);
  99 +          scphon = 2 * ngram(3, w_target, w_f,
 100                               NGRAM_LONGER_WORSE);
 101          } else {
 102 -          scphon = 2 * ngram(3, target, target2,
 103 +          scphon = 2 * ngram(3, target, f,
 104                               NGRAM_LONGER_WORSE);
 105          }
 106        }
 107 @@ -1177,12 +1195,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
 108          w_mw[k].l = '*';
 109          w_mw[k].h = 0;
 110        }
 111 -      thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
 112 +
 113 +      if (low) {
 114 +        // lowering dictionary word
 115 +        mkallsmall_utf(w_mw, langnum);
 116 +      }
 117 +
 118 +      thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH);
 119      } else {
 120        std::string mw = word;
 121        for (int k = sp; k < n; k += 4)
 122          mw[k] = '*';
 123 -      thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
 124 +
 125 +      if (low) {
 126 +        // lowering dictionary word
 127 +        mkallsmall(mw, csconv);
 128 +      }
 129 +
 130 +      thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH);
 131      }
 132    }
 133    thresh = thresh / 3;
 134 @@ -1210,7 +1240,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
 135      return;
 136    }
 137
 138 -  std::vector<w_char> w_glst_word;
 139    for (int i = 0; i < MAX_ROOTS; i++) {
 140      if (roots[i]) {
 141        struct hentry* rp = roots[i];
 142 @@ -1225,15 +1254,26 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
 143
 144        for (int k = 0; k < nw; k++) {
 145          if (utf8) {
 146 -          w_glst_word.clear();
 147 -          u8_u16(w_glst_word, glst[k].word);
 148 -          sc = ngram(n, w_word, w_glst_word,
 149 -                     NGRAM_ANY_MISMATCH + low) +
 150 -               leftcommonsubstring(w_word, w_glst_word);
 151 +          w_f.clear();
 152 +          u8_u16(w_f, glst[k].word);
 153 +
 154 +          int leftcommon = leftcommonsubstring(w_word, w_f);
 155 +          if (low) {
 156 +            // lowering dictionary word
 157 +            mkallsmall_utf(w_f, langnum);
 158 +          }
 159 +
 160 +          sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon;
 161          } else {
 162 -          sc = ngram(n, word, glst[k].word,
 163 -                     NGRAM_ANY_MISMATCH + low) +
 164 -               leftcommonsubstring(word, glst[k].word);
 165 +          f = glst[k].word;
 166 +
 167 +          int leftcommon = leftcommonsubstring(word, f.c_str());
 168 +          if (low) {
 169 +            // lowering dictionary word
 170 +            mkallsmall(f, csconv);
 171 +          }
 172 +
 173 +          sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon;
 174          }
 175
 176          if (sc > thresh) {
 177 @@ -1318,19 +1358,37 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
 178        w_gl.clear();
 179        if (utf8) {
 180          u8_u16(w_gl, gl);
 181 -        re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
 182 -             ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
 183 +        //w_gl is lowercase already at this point
 184 +        re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 185 +        if (low) {
 186 +          w_f = w_word;
 187 +          // lowering dictionary word
 188 +          mkallsmall_utf(w_f, langnum);
 189 +          re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 190 +        } else {
 191 +          re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 192 +        }
 193        } else {
 194 -        re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
 195 -             ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
 196 +        //gl is lowercase already at this point
 197 +        re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 198 +        if (low) {
 199 +          f = word;
 200 +          // lowering dictionary word
 201 +          mkallsmall(f, csconv);
 202 +          re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 203 +        } else {
 204 +          re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
 205 +        }
 206        }
 207
 208        int ngram_score, leftcommon_score;
 209        if (utf8) {
 210 -        ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
 211 +        //w_gl is lowercase already at this point
 212 +        ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH);
 213          leftcommon_score = leftcommonsubstring(w_word, w_gl);
 214        } else {
 215 -        ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
 216 +        //gl is lowercase already at this point
 217 +        ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH);
 218          leftcommon_score = leftcommonsubstring(word, gl.c_str());
 219        }
 220        gscore[i] =
 221 @@ -1802,14 +1860,6 @@ int SuggestMgr::ngram(int n,
 222    l2 = su2.size();
 223    if (l2 == 0)
 224      return 0;
 225 -  // lowering dictionary word
 226 -  const std::vector<w_char>* p_su2 = &su2;
 227 -  std::vector<w_char> su2_copy;
 228 -  if (opt & NGRAM_LOWERING) {
 229 -    su2_copy = su2;
 230 -    mkallsmall_utf(su2_copy, langnum);
 231 -    p_su2 = &su2_copy;
 232 -  }
 233    for (int j = 1; j <= n; j++) {
 234      ns = 0;
 235      for (int i = 0; i <= (l1 - j); i++) {
 236 @@ -1817,7 +1867,7 @@ int SuggestMgr::ngram(int n,
 237        for (int l = 0; l <= (l2 - j); l++) {
 238          for (k = 0; k < j; k++) {
 239            const w_char& c1 = su1[i + k];
 240 -          const w_char& c2 = (*p_su2)[l + k];
 241 +          const w_char& c2 = su2[l + k];
 242            if ((c1.l != c2.l) || (c1.h != c2.h))
 243              break;
 244          }
 245 @@ -1862,14 +1912,11 @@ int SuggestMgr::ngram(int n,
 246    if (l2 == 0)
 247      return 0;
 248    l1 = s1.size();
 249 -  std::string t(s2);
 250 -  if (opt & NGRAM_LOWERING)
 251 -    mkallsmall(t, csconv);
 252    for (int j = 1; j <= n; j++) {
 253      ns = 0;
 254      for (int i = 0; i <= (l1 - j); i++) {
 255 -      //t is haystack, s1[i..i+j) is needle
 256 -      if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
 257 +      //s2 is haystack, s1[i..i+j) is needle
 258 +      if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) {
 259          ns++;
 260        } else if (opt & NGRAM_WEIGHTED) {
 261          ns--;
 262 --
 263 2.9.3
 264