libvspell/spell.cpp

   1 #include "config.h"             // -*- tab-width: 2 -*-
   2 #include <iterator>
   3 #include <algorithm>
   4 #include <sstream>
   5 #include <iostream>
   6 #include <fstream>
   7 #include "vspell.h"
   8 #include "sentence.h"
   9 #include "syllable.h"
  10 #include "pfs.h"
  11 #include <map>
  12 #include "propername.h"
  13 #include "keyboard.h"
  14 #include "shuffle.h"
  15 #include "penalty.h"
  16 #include "bellman.h"
  17 #include <math.h>
  18
  19 // stolen from glib
  20 typedef unsigned int guint32;
  21 typedef unsigned int guint;
  22 typedef guint32 gunichar;
  23 typedef char gchar;
  24 typedef unsigned char guchar;
  25 typedef long glong;
  26 typedef signed int gssize;
  27 #define UTF8_COMPUTE(Char, Mask, Len)                                     \
  28   if (Char < 128)                                                                                                       \
  29     {                                                                                                                                                   \
  30       Len                                                               = 1;                                                              \
  31       Mask                                                              = 0x7f;                                                   \
  32     }                                                                                                                                                   \
  33   else if ((Char & 0xe0) == 0xc0)                                             \
  34     {                                                                                                                                                   \
  35       Len                                                               = 2;                                                              \
  36       Mask                                                              = 0x1f;                                                   \
  37     }                                                                                                                                                   \
  38   else if ((Char & 0xf0) == 0xe0)                                             \
  39     {                                                                                                                                                   \
  40       Len                                                               = 3;                                                              \
  41       Mask                                                              = 0x0f;                                                   \
  42     }                                                                                                                                                   \
  43   else if ((Char & 0xf8) == 0xf0)                                             \
  44     {                                                                                                                                                   \
  45       Len                                                               = 4;                                                              \
  46       Mask                                                              = 0x07;                                                   \
  47     }                                                                                                                                                   \
  48   else if ((Char & 0xfc) == 0xf8)                                             \
  49     {                                                                                                                                                   \
  50       Len                                                               = 5;                                                              \
  51       Mask                                                              = 0x03;                                                   \
  52     }                                                                                                                                                   \
  53   else if ((Char & 0xfe) == 0xfc)                                             \
  54     {                                                                                                                                                   \
  55       Len                                                               = 6;                                                              \
  56       Mask                                                              = 0x01;                                                   \
  57     }                                                                                                                                                   \
  58   else                                                                                                                                          \
  59     Len                                                                         = -1;
  60
  61 #define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  62   (Result) = (Chars)[0] & (Mask);                                                                                       \
  63   for ((Count) = 1; (Count) < (Len); ++(Count))                         \
  64     {                                                                                                                                                                                                   \
  65       if (((Chars)[(Count)] & 0xc0) != 0x80)                                    \
  66                                 {                                                                                                                                                                                       \
  67                                         (Result) = (gunichar)-1;                                                                                \
  68                                         break;                                                                                                                                                  \
  69                                 }                                                                                                                                                                                       \
  70       (Result) <<= 6;                                                                                                                                   \
  71       (Result) |= ((Chars)[(Count)] & 0x3f);                                    \
  72     }
  73
  74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
  75 static const gchar utf8_skip_data[256] = {
  76   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  77   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  78   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  79   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  80   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  81   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  82   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  83   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
  84 };
  85
  86 static const gchar * const g_utf8_skip = utf8_skip_data;
  87 static int
  88 g_unichar_to_utf8 (gunichar c,
  89                    gchar   *outbuf)
  90 {
  91   guint len = 0;
  92   int first;
  93   int i;
  94
  95   if (c < 0x80)
  96     {
  97       first = 0;
  98       len = 1;
  99     }
 100   else if (c < 0x800)
 101     {
 102       first = 0xc0;
 103       len = 2;
 104     }
 105   else if (c < 0x10000)
 106     {
 107       first = 0xe0;
 108       len = 3;
 109     }
 110    else if (c < 0x200000)
 111     {
 112       first = 0xf0;
 113       len = 4;
 114     }
 115   else if (c < 0x4000000)
 116     {
 117       first = 0xf8;
 118       len = 5;
 119     }
 120   else
 121     {
 122       first = 0xfc;
 123       len = 6;
 124     }
 125
 126   if (outbuf)
 127     {
 128       for (i = len - 1; i > 0; --i)
 129         {
 130           outbuf[i] = (c & 0x3f) | 0x80;
 131           c >>= 6;
 132         }
 133       outbuf[0] = c | first;
 134     }
 135
 136   return len;
 137 }
 138
 139 static gunichar
 140 g_utf8_get_char (const gchar *p)
 141 {
 142   int i, mask = 0, len;
 143   gunichar result;
 144   unsigned char c = (unsigned char) *p;
 145
 146   UTF8_COMPUTE (c, mask, len);
 147   if (len == -1)
 148     return (gunichar)-1;
 149   UTF8_GET (result, p, i, mask, len);
 150
 151   return result;
 152 }
 153
 154 static glong
 155 g_utf8_strlen (const gchar *p,
 156                gssize       max)
 157 {
 158   glong len = 0;
 159   const gchar *start = p;
 160   //g_return_val_if_fail (p != NULL || max == 0, 0);
 161         if (!(p != NULL || max == 0))
 162                 return 0;
 163
 164   if (max < 0)
 165     {
 166       while (*p)
 167         {
 168           p = g_utf8_next_char (p);
 169           ++len;
 170         }
 171     }
 172   else
 173     {
 174       if (max == 0 || !*p)
 175         return 0;
 176
 177       p = g_utf8_next_char (p);
 178
 179       while (p - start < max && *p)
 180         {
 181           ++len;
 182           p = g_utf8_next_char (p);
 183         }
 184
 185       /* only do the last len increment if we got a complete
 186        * char (don't count partial chars)
 187        */
 188       if (p - start == max)
 189         ++len;
 190     }
 191
 192   return len;
 193 }
 194
 195 using namespace std;
 196
 197 /*
 198
 199 The process:
 200 1. Sentence segmentation. (sentences_split)
 201 2. Separate "words" by spaces. (tokenize)
 202 3. Punctuation separation. (tokenize/tokenize_punctuation)
 203 4. Foreign/Abbreviation detection.
 204 5. Proper name detection.
 205 6. Generalization (into class e.g. number_class, foreign_class ...). Try to
 206 generalize all capitalized words.
 207 6* Syllable checking. (check1)
 208 7. Find all possible (misspelled) words. (**) (get_all_words)
 209 8. "pre-separate" sentence into phrases.
 210 9. Word segmentation. (**)
 211 10. Find the best segmentation. (segment_best)
 212 10* Word checking. (check2)
 213
 214 */
 215 /*
 216         namespace Spell {
 217 */
 218 bool VSpell::init()
 219 {
 220         dic_init();
 221
 222         cerr << "Loading dictionary... ";
 223         warch.load("wordlist");
 224         cerr << "done" << endl;
 225         cerr << "Loading ngram... ";
 226         File f("ngram","rt");
 227         if (!f.error()) {
 228                 get_ngram().read(f);
 229                 cerr << "done" << endl;
 230         } else
 231                 cerr << "Ngram loading error. The result may be incorrect" << endl;
 232         cerr << "Loading syngram... ";
 233         File ff("syngram","rt");
 234         if (!ff.error()) {
 235                 get_syngram().read(ff);
 236                 cerr << "done" << endl;
 237         }       else
 238                 cerr << "Syllable Ngram loading error. The result may be incorrect" << endl;
 239         sarch.set_blocked(true);
 240         return true;
 241 }
 242
 243 bool VSpell::check(const char *utf8_pp)
 244 {
 245         utf8_text = utf8_pp;
 246
 247         syllables.clear();
 248
 249         bool run = false;
 250
 251         do {
 252                 string pp = viet_to_viscii_force(utf8_text.c_str());
 253                 vector<string> pps;
 254                 sentences_split(pp,pps);
 255                 unsigned pps_i,pps_len = pps.size();
 256                 unsigned offset = 0;
 257                 text = pp;
 258
 259                 for (pps_i = 0;pps_i < pps_len;pps_i ++) {
 260                         Text *t = text_factory.create(this);
 261                         t->offset = offset;
 262                         t->length = pps[pps_i].size();
 263                         run = !t->sentence_check(pps[pps_i].c_str());
 264                         delete t;
 265                         if (run)
 266                                 break;
 267                         offset += pps[pps_i].size();
 268                 }
 269         } while (run);
 270         return true;
 271 }
 272
 273 void VSpell::replace(unsigned from,unsigned size,const char *s)
 274 {
 275         const char *p = utf8_text.c_str();
 276         unsigned i;
 277         for (i = 0;i < from && *p;i ++)
 278                 p = g_utf8_next_char(p);
 279         unsigned from1 = p - utf8_text.c_str();
 280         for (i = 0;i < size && *p;i ++)
 281                 p = g_utf8_next_char(p);
 282         unsigned to1 = p - utf8_text.c_str();
 283         utf8_text.replace(from1,to1-from1,s);
 284
 285         // remove separators in the range, adjust other seps
 286         int n = separators.size();
 287         if (n) {
 288                 int newsize = g_utf8_strlen(s,-1);
 289                 i = n;
 290                 do {
 291                         i --;
 292                         if (separators[i] >= from) {
 293                                 if (separators[i] < from+size)
 294                                         separators.erase(separators.begin()+i);
 295                                 else
 296                                         separators[i] += newsize-size;
 297                         }
 298                 } while (i > 0);
 299         }
 300 }
 301
 302 void VSpell::add_separators(const std::vector<unsigned> &seps)
 303 {
 304         copy(seps.begin(),seps.end(),back_inserter(separators));
 305 }
 306
 307 void VSpell::add_word(const char *s)
 308 {
 309         istringstream is(s);
 310         strid_string toks;
 311         string ss;
 312         while (is >> ss) {
 313                 toks += sarch[ss];
 314         }
 315         words.insert(toks);
 316 }
 317
 318 bool Text::sentence_check(const char *pp)
 319 {
 320         // preprocess
 321         st.set(pp);
 322         st.standardize();
 323         st.tokenize();
 324
 325         if (!st.get_syllable_count())   // nothing to do but crash ;)
 326                 return true;
 327
 328         // syllable checking
 329         if (!syllable_check() && !ui_syllable_check())
 330                 return false;
 331
 332         //w.construct(st);
 333         set<WordEntry> wes;
 334         WordStateFactories factories;
 335         ExactWordStateFactory exact;
 336         LowerWordStateFactory lower;
 337         UpperWordStateFactory upper;
 338         FuzzyWordStateFactory fuzzy;
 339         factories.push_back(&exact);
 340         factories.push_back(&lower);
 341         factories.push_back(&upper);
 342         factories.push_back(&fuzzy);
 343         w.pre_construct(st,wes,factories);
 344         mark_proper_name(st,wes);
 345         apply_separators(wes);
 346         w.post_construct(wes);
 347         //cerr << w << endl;
 348
 349         Path path;
 350         WordDAG dagw(&w);
 351         WordDAG2 *dagw2;
 352         DAG *dag = &dagw;
 353         if (vspell->get_trigram()) {
 354                 dagw2 = new WordDAG2(&dagw);
 355                 dag = dagw2;
 356         }
 357         PenaltyDAG pdag(dag,vspell->get_penalty());
 358         if (vspell->get_penalty())
 359                 dag = &pdag;
 360
 361         if (vspell->get_normalization()) {
 362                 Bellman pfs;
 363                 pfs.search(*dag,path);
 364         } else {
 365                 PFS pfs;
 366                 pfs.search(*dag,path);
 367         }
 368         if (vspell->get_trigram()) {
 369                 dagw2->demangle(path);
 370                 delete dagw2;
 371         }
 372         seg.resize(path.size()-2);
 373         // don't copy head/tail
 374         copy(path.begin()+1,path.end()-1,seg.begin());
 375         seg.we = w.we;
 376         //cerr << seg << endl;
 377
 378         // word checking
 379         if (!word_check() && !ui_word_check())
 380                 return false;
 381
 382         return true;                                                                    // done
 383 }
 384
 385 bool Text::syllable_check(int i)
 386 {
 387         if (vspell->in_dict(st[i].get_id()))
 388                 return true;
 389
 390         if (sarch.in_dict(st[i].get_cid())) {
 391                 Syllable syl;                                                   // diacritic check
 392                 if (syl.parse(sarch[st[i].get_cid()])) {
 393                         string s = get_lowercased_syllable(syl.to_str());
 394                         if (get_lowercased_syllable(sarch[st[i].get_id()]) == s)
 395                                 return true;
 396                 }
 397         }
 398         return false;
 399 }
 400
 401 bool Text::syllable_check()
 402 {
 403         int i,n = st.get_syllable_count();
 404
 405         suggestions.clear();
 406
 407         for (i = 0;i < n;i ++) {
 408                 if (syllable_check(i))
 409                         continue;
 410
 411                 Suggestion _s;
 412                 _s.id = i;
 413                 suggestions.push_back(_s);
 414         }
 415         return suggestions.empty();
 416 }
 417
 418 bool Text::word_check()
 419 {
 420         int i,n = seg.size();
 421
 422         suggestions.clear();
 423
 424         for (i = 0;i < n;i ++) {
 425                 vector<strid> sylls;
 426                 strid_string sylls2;
 427                 int ii,len = seg[i].node->get_syllable_count();
 428                 seg[i].node->get_syllables(sylls);
 429
 430                 // case-sensitive comparation.
 431                 sylls2.resize(len);
 432                 bool subok = true;
 433                 for (ii = 0;ii < len;ii ++) {
 434                         sylls2[ii] = st[seg[i].pos+ii].get_id();
 435                         if (subok && st[seg[i].pos+ii].get_cid() != sylls[ii])
 436                                 subok = false;
 437                 }
 438
 439                 // in user dict
 440                 bool ok = vspell->in_dict(sylls2);
 441
 442                 if (!ok) {
 443                         strid_string sylls3;
 444                         sylls3.resize(sylls.size());
 445                         for (ii = 0;ii < sylls3.size();ii ++)
 446                                 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
 447
 448                         if (vspell->get_strict_word_checking()) {
 449                                 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
 450                                 if (!subok &&
 451                                                 (is_all_capitalized_word(sylls2) ||
 452                                                  (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
 453                                         subok = true;
 454
 455                                 ok = subok;
 456                         } else {
 457                                 string s;
 458                                 BranchNode *branch = warch.get_root();
 459                                 for (ii = 0;ii < len && branch;ii ++)
 460                                         branch = branch->get_branch(st[seg[i].pos+ii].get_cid());
 461
 462                                 if (branch) {
 463                                         LeafNode* leaf;
 464                                         leaf = branch->get_leaf(sarch["<mainleaf>"]);
 465                                         sylls.clear();
 466                                         leaf->get_syllables(sylls);
 467                                         strid_string sylls3;
 468                                         sylls3.resize(sylls.size());
 469                                         for (ii = 0;ii < sylls3.size();ii ++)
 470                                                 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
 471
 472                                         ok = sylls2 == sylls3;
 473
 474                                         // don't care if the "true" word is lower-cased and the original one is valid upper-cased
 475                                         if (!ok &&
 476                                                         (is_all_capitalized_word(sylls2) ||
 477                                                          (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
 478                                                 ok = true;
 479                                 }
 480                         }
 481                 }
 482
 483                 if (!ok) {
 484                         Suggestion _s;
 485                         _s.id = i;
 486                         suggestions.push_back(_s);
 487                         continue;
 488                 }
 489
 490                 // all syllable are syntatically valid
 491         }
 492         return suggestions.empty();
 493 }
 494
 495 unsigned Text::pos_from_syllable(const Suggestion &s)
 496 {
 497         return offset+st[s.id].start;
 498 }
 499
 500 unsigned Text::pos_from_word(const Suggestion &s)
 501 {
 502         //return offset+st[seg[s.id]].start;
 503         return 0;
 504 }
 505
 506 void Text::replace(unsigned from,unsigned size,const char *s)
 507 {
 508         vspell->replace(from+offset,size,s);
 509 }
 510
 511 string Text::substr(unsigned from,unsigned size)
 512 {
 513         const string &utf8_text = vspell->get_utf8_text();
 514         const char *p = utf8_text.c_str();
 515         unsigned i;
 516         from += offset;
 517         for (i = 0;i < from && *p;i ++)
 518                 p = g_utf8_next_char(p);
 519         unsigned from1 = p - utf8_text.c_str();
 520         for (i = 0;i < size && *p;i ++)
 521                 p = g_utf8_next_char(p);
 522         unsigned to1 = p - utf8_text.c_str();
 523         return utf8_text.substr(from1,to1-from1);
 524 }
 525
 526 void Text::apply_separators(set<WordEntry> &wes)
 527 {
 528         vector<unsigned> seps;
 529         //set<unsigned> seps;
 530
 531         get_separators(seps);
 532         sort(seps.begin(),seps.end());
 533         //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
 534         int sep = 0;
 535         int i,n = st.get_syllable_count();
 536
 537         for (i = 0;i < n-1 && sep < seps.size();i ++) {
 538                 int p = offset+st[i].start+strlen(sarch[st[i].get_id()]);
 539                 if (p <= seps[sep] && seps[sep] <= offset+st[i+1].start) {
 540                         apply_separator(wes,i);
 541                         sep ++;
 542                 }
 543         }
 544 }
 545
 546 void Text::get_separators(vector<unsigned> &v)
 547 {
 548         const vector<unsigned> &vv = vspell->get_separators();
 549
 550         int i,n = vv.size();
 551         for (i = 0;i < n;i ++)
 552                 if (vv[i] >= offset && vv[i] < offset+length)
 553                         v.push_back(vv[i]);
 554 }
 555 /*
 556         }
 557 */
 558
 559 static unsigned char viscii_str[] = {
 560         0xe1,0xe0,0xe4,0xe3,0xd5,
 561         0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
 562         0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
 563         0xe9,0xe8,0xeb,0xa8,0xa9,
 564         0xea,0xaa,0xab,0xac,0xad,
 565         0xae,0xed,0xec,0xef,0xee,0xb8,
 566         0xf3,0xf2,0xf6,0xf5,0xf7,
 567         0xf4,0xaf,0xb0,0xb1,0xb2,
 568         0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
 569         0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
 570         0xdf,0xd1,0xd7,0xd8,0xe6,
 571         0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
 572         0xf0,
 573         0xc1,0xc0,0xc4,0xc3,0x80,
 574         0xc2,0x84,0x85,0x86,0x06,0x87,
 575         0xc5,0x81,0x82,0x02,0x05,0x83,
 576         0xc9,0xc8,0xcb,0x88,0x89,
 577         0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
 578         0xcd,0xcc,0x9b,0xce,0x98,
 579         0xd3,0xd2,0x99,0xa0,0x9a,
 580         0xd4,0x8f,0x90,0x91,0x92,
 581         0x93,0xb4,0x95,0x96,0x97,0xb3,
 582         0x94,0xda,0xd9,0x9c,0x9d,0x9e,
 583         0xbf,0xba,0xbb,0xbc,0xff,0xb9,
 584         0xdd,0x9f,0x14,0x19,0x1e,
 585         0xd0,
 586         0
 587 };
 588 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
 589
 590 static gunichar unicode_str[] = {
 591          225, 224,7843, 227,7841,
 592          226,7845,7847,7849,7851,7853,
 593          259,7855,7857,7859,7861,7863,
 594          233, 232,7867,7869,7865,
 595          234,7871,7873,7875,7877,
 596         7879, 237, 236,7881, 297,7883,
 597          243, 242,7887, 245,7885,
 598          244,7889,7891,7893,7895,
 599         7897, 417,7899,7901,7903,7905,
 600         7907, 250, 249,7911, 361,7909,
 601          432,7913,7915,7917,7919,
 602         7921, 253,7923,7927,7929,7925,
 603          273,
 604          193, 192,7842, 195,7840,
 605          194,7844,7846,7848,7850,7852,
 606          258,7854,7856,7858,7860,7862,
 607          201, 200,7866,7868,7864,
 608          202,7870,7872,7874,7876,7878,
 609          205, 204,7880, 296,7882,
 610          211, 210,7886, 213,7884,
 611          212,7888,7890,7892,7894,
 612         7896, 416,7898,7900,7902,7904,
 613         7906, 218, 217,7910, 360,7908,
 614          431,7912,7914,7916,7918,7920,
 615          221,7922,7926,7928,7924,
 616          272,
 617         0
 618 };
 619
 620 static map<unsigned char,gunichar> viscii_utf8_map;
 621 static map<gunichar,unsigned char> utf8_viscii_map;
 622
 623 void viet_init()
 624 {
 625         for (unsigned i = 0;viscii_str[i];i ++) {
 626                 viscii_utf8_map[viscii_str[i]] = unicode_str[i];
 627                 utf8_viscii_map[unicode_str[i]] = viscii_str[i];
 628         }
 629 }
 630
 631 bool viet_utf8_to_viscii(const char *in,char *out) // pre-allocated
 632 {
 633         const gchar *p = in;
 634         gunichar ch;
 635         while ((ch = g_utf8_get_char(p)) != 0) {
 636                 p = g_utf8_next_char(p);
 637                 if (ch < 128) {
 638                         *out++ = ch;
 639                         continue;
 640                 }
 641
 642                 map<gunichar,unsigned char>::iterator iter;
 643                 iter = utf8_viscii_map.find(ch);
 644                 if (iter != utf8_viscii_map.end())
 645                         *out++ = (char)iter->second;
 646                 else {
 647                         fprintf(stderr,"Warning: unexpected unicode character %d",ch);
 648                         return false;
 649                 }
 650         }
 651         *out = 0;
 652         return true;
 653 }
 654
 655 bool viet_utf8_to_viscii_force(const char *in,char *out)
 656 {
 657         const gchar *p = in;
 658         gunichar ch;
 659         while ((ch = g_utf8_get_char(p)) != 0) {
 660                 p = g_utf8_next_char(p);
 661                 if (ch < 128) {
 662                         *out++ = ch;
 663                         continue;
 664                 }
 665
 666                 map<gunichar,unsigned char>::iterator iter;
 667                 iter = utf8_viscii_map.find(ch);
 668                 if (iter != utf8_viscii_map.end())
 669                         *out++ = (char)iter->second;
 670                 else {
 671                         fprintf(stderr,"Warning: unexpected unicode character %d",ch);
 672                         *out += 'z';
 673                 }
 674         }
 675         *out = 0;
 676         return true;
 677 }
 678
 679 void viet_viscii_to_utf8(const char *in,char *out) // pre-allocated
 680 {
 681         unsigned char *p = (unsigned char*)in;
 682         unsigned char ch;
 683         while ((ch = *p) != 0) {
 684                 p++;
 685                 if (ch < 128 && ch >= 32) {
 686                         *out++ = ch;
 687                         continue;
 688                 }
 689
 690                 map<unsigned char,gunichar>::iterator iter;
 691                 iter = viscii_utf8_map.find(ch);
 692                 if (iter != viscii_utf8_map.end()) {
 693                         g_unichar_to_utf8(iter->second,out);
 694                         out = g_utf8_next_char(out);
 695                 } else {
 696                         *out++ = ch;                                                    // fall-back case
 697                 }
 698
 699         }
 700         *out = 0;
 701 }
 702
 703 static char buffer[6000];
 704 char* viet_to_viscii(const char *in)
 705 {
 706         if (g_utf8_strlen(in,-1) >= 1000)
 707                 return "";
 708         if (viet_utf8_to_viscii(in,buffer))
 709                 return buffer;
 710         else
 711                 return NULL;
 712 }
 713
 714 char* viet_to_viscii_force(const char *in)
 715 {
 716         if (g_utf8_strlen(in,-1) >= 1000)
 717                 return "";
 718         viet_utf8_to_viscii_force(in,buffer);
 719         return buffer;
 720 }
 721
 722 char* viet_to_utf8(const char *in)
 723 {
 724         if (strlen(in) >= 1000)
 725                 return "";
 726         viet_viscii_to_utf8(in,buffer);
 727         return buffer;
 728 }
 729
 730 /*
 731 static char_traits_strid::char_type*
 732 char_traits_strid::copy(char_traits_strid::char_type* __s1,
 733                  const char_traits_strid::char_type* __s2,
 734                  size_t __n)
 735 {
 736         return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
 737 }
 738
 739 */
 740
 741 bool get_case_syllable_candidates(const char *input,Candidates &output, float v)
 742 {
 743         // There are only two acceptable cases:
 744         // 1. The first character is upper case, the rest is lower
 745         // 2. All are either lower or upper
 746         // if there is some upper case character without following one of these cases, then it's fault.
 747         // also, if there is a uppercase word in dictionary, then add it.
 748
 749         uint i,n = strlen(input);
 750         // check for upper-case chars
 751         for (i = n-1;i >= 0;i --)
 752                 if (viet_toupper(input[i]) == input[i])
 753                         break;
 754
 755         if (i <= 0 || n < 2)                                    // ignore if the only upper char is the first one.
 756                 return false;
 757
 758         string s;
 759
 760         s = input;
 761         s[0] = viet_toupper(s[0]);
 762         for (i = 1;i < n;i ++)
 763                 s[i] = viet_tolower(s[i]);
 764         if (s != string(input))
 765                 output.insert(s,v);
 766
 767         s = input;
 768         for (i = 0;i < n;i ++)
 769                 s[i] = viet_tolower(s[i]);
 770         if (s != string(input))
 771                 output.insert(s,v);
 772
 773         s = input;
 774         for (i = 0;i < n;i ++)
 775                 s[i] = viet_toupper(s[i]);
 776         if (s != string(input))
 777                 output.insert(s,v);
 778         return true;
 779 }
 780
 781 void get_phonetic_syllable_candidates(const char *input,Candidates &output,float v)
 782 {
 783         vector<confusion_set>& confusion_sets = get_confusion_sets();
 784         int i,j,m,n = confusion_sets.size();
 785         bool ret = false;
 786         set<Syllable> syllset,syllset2;
 787         Syllable _syll;
 788
 789         _syll.parse(input);
 790         syllset2.insert(_syll);
 791         while (!syllset2.empty()) {
 792                 const Syllable sy = *syllset2.begin();
 793                 syllset2.erase(syllset2.begin());
 794
 795                 if (syllset.find(sy) != syllset.end())
 796                         continue;                                                               // we already matched&applied this syllable
 797
 798                 //cerr << sy << endl;
 799                 syllset.insert(sy);
 800
 801                 vector<Syllable> sylls;
 802                 // match & apply
 803                 for (i = 0;i < n;i ++) {
 804                         m = confusion_sets[i].size();
 805                         for (j = 0;j < m;j ++)
 806                                 if (confusion_sets[i][j].match(sy))
 807                                         break;
 808                         if (j < m) {
 809                                 for (j = 0;j < m;j ++)
 810                                         confusion_sets[i][j].apply(sy,sylls);
 811                         }
 812                 }
 813                 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
 814         }
 815
 816         // move to _nodes
 817         //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
 818         set<Syllable>::iterator iter;
 819         for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
 820                 string s = iter->to_std_str();
 821                 string ss = get_lowercased_syllable(s);
 822                 //cerr << s << endl;
 823                 if (sarch.in_dict(sarch[s]) ||
 824                                 sarch.in_dict(sarch[ss]))
 825                         output.insert(iter->to_str(),v+1);
 826         }
 827 }
 828
 829 void get_syllable_candidates(const char *input,Candidates &output,float v)
 830 {
 831         Syllable syll;
 832         string s,s2;
 833         set<string> s3;
 834         set<string>::iterator s3i;
 835
 836         get_phonetic_syllable_candidates(input,output,v);
 837
 838         KeyRecover keyr;
 839         keyr.init(input);
 840         while (keyr.step(s)) {
 841                 s2 = get_std_syllable(s);
 842                 if (s2 != s && syll.parse(s2.c_str()))
 843                         output.insert(syll.to_str(),v);
 844                 s3.clear();
 845                 im_recover(s.c_str(),s3);
 846                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 847                         s2 = get_std_syllable(*s3i);
 848                         if (s2 != *s3i && syll.parse(s2.c_str()))
 849                                 output.insert(syll.to_str(),v);
 850                 }
 851         }
 852         keyr.done();
 853
 854         // SpaceInserter
 855         uint i,n = strlen(input);
 856         for (i = 1;i < n;i ++) {
 857                 s = string(input).substr(0,i);
 858                 s2 = string(input).substr(i);
 859                 if (syll.parse(s.c_str())) {
 860                         s = syll.to_str();
 861                         if (syll.parse(s2.c_str())) {
 862                                 output.insert(s + string(" ") + syll.to_str(),v);
 863                         }
 864                 }
 865         }
 866
 867         CharInserter inserter;
 868         inserter.init(input);
 869         while (inserter.step(s)) {
 870                 s2 = get_std_syllable(s);
 871                 if (s2 != s && syll.parse(s2.c_str()))
 872                         output.insert(syll.to_str(),v);
 873                 s3.clear();
 874                 im_recover(s.c_str(),s3);
 875                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 876                         s2 = get_std_syllable(*s3i);
 877                         if (s2 != *s3i && syll.parse(s2.c_str()))
 878                                 output.insert(syll.to_str(),v);
 879                 }
 880         }
 881         inserter.done();
 882
 883         CharEraser eraser;
 884         eraser.init(input);
 885         while (eraser.step(s)) {
 886                 s2 = get_std_syllable(s);
 887                 if (s2 != s && syll.parse(s2.c_str()))
 888                         output.insert(syll.to_str(),v);
 889                 s3.clear();
 890                 im_recover(s.c_str(),s3);
 891                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 892                         s2 = get_std_syllable(*s3i);
 893                         if (s2 != *s3i && syll.parse(s2.c_str()))
 894                                 output.insert(syll.to_str(),v);
 895                 }
 896         }
 897         eraser.done();
 898
 899         CharTransposer transposer;
 900         transposer.init(input);
 901         while (transposer.step(s)) {
 902                 s2 = get_std_syllable(s);
 903                 if (s2 != s && syll.parse(s2.c_str()))
 904                         output.insert(syll.to_str(),v);
 905                 s3.clear();
 906                 im_recover(s.c_str(),s3);
 907                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 908                         s2 = get_std_syllable(*s3i);
 909                         if (s2 != *s3i && syll.parse(s2.c_str()))
 910                                 output.insert(syll.to_str(),v);
 911                 }
 912         }
 913         transposer.done();
 914 }
 915
 916 void get_left_syllable_candidates(const char *input,const char *left,Candidates &output)
 917 {
 918         // merge
 919         string s;
 920         s = string(left)+string(input);
 921         get_syllable_candidates(s.c_str(),output,10);
 922
 923         // cut one char from input
 924         if (strlen(input) > 1) {
 925                 s = string(input+1);
 926                 get_syllable_candidates(s.c_str(),output,10);
 927         }
 928
 929         // insert one char from left to input
 930         if (strlen(left)) {
 931                 s = string(" ") + string(input);
 932                 s[0] = left[strlen(left)-1];
 933                 get_syllable_candidates(s.c_str(),output,10);
 934         }
 935 }
 936
 937 void get_right_syllable_candidates(const char *input,const char *right,Candidates &output)
 938 {
 939         // merge
 940         string s;
 941         s = string(input)+string(right);
 942         get_syllable_candidates(s.c_str(),output,10);
 943
 944         // cut one char from input
 945         if (strlen(input) > 1) {
 946                 s = string(input);
 947                 s.resize(strlen(input)-1);
 948                 get_syllable_candidates(s.c_str(),output,10);
 949         }
 950
 951         // insert one char from right to input
 952         if (strlen(right)) {
 953                 s = string(input)+string(" ");
 954                 s[s.size()-1] = right[0];
 955                 get_syllable_candidates(s.c_str(),output,10);
 956         }
 957 }
 958
 959
 960
 961 void Candidates::insert(const std::string &s,float f)
 962 {
 963         Candidate c;
 964         c.candidate = s;
 965         c.priority = f;
 966         set<Candidate>::iterator iter = candidates.find(c);
 967         if (iter != candidates.end()) {
 968                 if (iter->priority < c.priority)
 969                         candidates.erase(iter);
 970                 else
 971                         return;
 972         }
 973         candidates.insert(c);
 974 }
 975
 976 bool Candidates::CandidateComparator::operator()(const std::string &s1,const std::string &s2)
 977 {
 978         set<Candidate>::iterator i1,i2;
 979         Candidate c1,c2;
 980         c1.candidate = s1;
 981         c2.candidate = s2;
 982         i1 = c.candidates.find(c1);
 983         i2 = c.candidates.find(c2);
 984         if (i1->priority != i2->priority)
 985                 return i1->priority > i2->priority;
 986         float f1,f2;
 987         VocabIndex v;
 988         v = Vocab_None;
 989         f1 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s1))],&v);
 990         f2 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s2))],&v);
 991         //cerr << f1 << "<>" << f2 << endl;
 992         return f1 > f2; // we want reverse order
 993 }
 994
 995 void Candidates::get_list(std::vector<std::string> &v)
 996 {
 997         uint i,n = candidates.size();
 998         v.resize(n);
 999         set<Candidate>::iterator iter;
1000         n = 0;
1001         for (iter = candidates.begin();iter != candidates.end();++iter)
1002                 if (!get_case_syllable_candidates(iter->candidate.c_str(),*this,iter->priority)) {
1003                         if (sarch.in_dict(get_std_syllable(get_lowercased_syllable(iter->candidate)))) {
1004                                 v[n++] = iter->candidate;
1005                         }
1006                 }
1007         v.resize(n);
1008         sort(v.begin(),v.end(),CandidateComparator(*this));
1009 }