libvspell/spell.cpp

   1 #include "config.h"             // -*- tab-width: 2 -*-
   2 #include <iterator>
   3 #include <algorithm>
   4 #include <sstream>
   5 #include <iostream>
   6 #include <fstream>
   7 #include "vspell.h"
   8 #include "sentence.h"
   9 #include "syllable.h"
  10 #include "pfs.h"
  11 #include <map>
  12 #include "propername.h"
  13 #include "keyboard.h"
  14 #include "shuffle.h"
  15 #include "penalty.h"
  16 #include "bellman.h"
  17 #include <math.h>
  18
  19 // stolen from glib
  // Local stand-ins for the glib basic types used by the UTF-8 helpers in this
  // file (presumably so the code builds without glib itself -- no glib header
  // is included above).
  20 typedef unsigned int guint32;
  21 typedef unsigned int guint;
  // A Unicode code point; (gunichar)-1 is used below as the "invalid" marker.
  22 typedef guint32 gunichar;
  23 typedef char gchar;
  24 typedef unsigned char guchar;
  25 typedef long glong;
  // NOTE: declared here as a plain signed int.
  26 typedef signed int gssize;
  // UTF8_COMPUTE(Char, Mask, Len): classify the lead byte `Char` of a UTF-8
  // sequence.
  //   Len  <- length of the sequence in bytes (1..6), or -1 when `Char`
  //           cannot start a sequence (Mask is left untouched in that case).
  //   Mask <- bit mask selecting the payload bits carried by the lead byte.
  // The macro expands to a bare if/else chain (it is not wrapped in
  // do { } while (0)) and its arguments are evaluated several times, so pass
  // plain variables and use it as a statement of its own.
  27 #define UTF8_COMPUTE(Char, Mask, Len)                                     \
  28   if (Char < 128)                                                                                                       \
  29     {                                                                                                                                                   \
  30       Len                                                               = 1;                                                              \
  31       Mask                                                              = 0x7f;                                                   \
  32     }                                                                                                                                                   \
  33   else if ((Char & 0xe0) == 0xc0)                                             \
  34     {                                                                                                                                                   \
  35       Len                                                               = 2;                                                              \
  36       Mask                                                              = 0x1f;                                                   \
  37     }                                                                                                                                                   \
  38   else if ((Char & 0xf0) == 0xe0)                                             \
  39     {                                                                                                                                                   \
  40       Len                                                               = 3;                                                              \
  41       Mask                                                              = 0x0f;                                                   \
  42     }                                                                                                                                                   \
  43   else if ((Char & 0xf8) == 0xf0)                                             \
  44     {                                                                                                                                                   \
  45       Len                                                               = 4;                                                              \
  46       Mask                                                              = 0x07;                                                   \
  47     }                                                                                                                                                   \
  48   else if ((Char & 0xfc) == 0xf8)                                             \
  49     {                                                                                                                                                   \
  50       Len                                                               = 5;                                                              \
  51       Mask                                                              = 0x03;                                                   \
  52     }                                                                                                                                                   \
  53   else if ((Char & 0xfe) == 0xfc)                                             \
  54     {                                                                                                                                                   \
  55       Len                                                               = 6;                                                              \
  56       Mask                                                              = 0x01;                                                   \
  57     }                                                                                                                                                   \
  58   else                                                                                                                                          \
  59     Len                                                                         = -1;
  60
  // UTF8_GET(Result, Chars, Count, Mask, Len): decode one code point.
  //   Chars  - pointer to the first byte of the sequence.
  //   Mask   - payload mask of the lead byte, Len - sequence length in bytes
  //            (the values UTF8_COMPUTE produces).
  //   Count  - scratch loop variable, clobbered.
  //   Result <- the decoded code point, or (gunichar)-1 as soon as a trailing
  //            byte is not of the form 10xxxxxx (decoding stops there, so a
  //            NUL terminator inside a truncated sequence is never stepped over).
  // Every trailing byte contributes its low six bits.
  61 #define UTF8_GET(Result, Chars, Count, Mask, Len)                       \
  62   (Result) = (Chars)[0] & (Mask);                                                                                       \
  63   for ((Count) = 1; (Count) < (Len); ++(Count))                         \
  64     {                                                                                                                                                                                                   \
  65       if (((Chars)[(Count)] & 0xc0) != 0x80)                                    \
  66                                 {                                                                                                                                                                                       \
  67                                         (Result) = (gunichar)-1;                                                                                \
  68                                         break;                                                                                                                                                  \
  69                                 }                                                                                                                                                                                       \
  70       (Result) <<= 6;                                                                                                                                   \
  71       (Result) |= ((Chars)[(Count)] & 0x3f);                                    \
  72     }
  73
  // g_utf8_next_char(p): pointer to the character following the one at `p`.
  // The step is looked up from the lead byte only; the trailing bytes are not
  // inspected, so on a truncated sequence the result can lie beyond the
  // string's NUL terminator.
  74 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
  // Sequence length in bytes for every possible lead byte value.
  75 static const gchar utf8_skip_data[256] = {
  // 0x00-0xBF: ASCII and stray continuation bytes advance by one byte.
  76   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  77   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  78   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  79   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  80   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  81   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  // 0xC0-0xDF: two-byte sequences.
  82   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  // 0xE0-0xEF: 3, 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 1.
  83   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
  84 };
  85
  // Pointer alias through which g_utf8_next_char reads the table.
  86 static const gchar * const g_utf8_skip = utf8_skip_data;
  87 static int
  88 g_unichar_to_utf8 (gunichar c,
  89                    gchar   *outbuf)
  90 {
  91   guint len = 0;
  92   int first;
  93   int i;
  94
  95   if (c < 0x80)
  96     {
  97       first = 0;
  98       len = 1;
  99     }
 100   else if (c < 0x800)
 101     {
 102       first = 0xc0;
 103       len = 2;
 104     }
 105   else if (c < 0x10000)
 106     {
 107       first = 0xe0;
 108       len = 3;
 109     }
 110    else if (c < 0x200000)
 111     {
 112       first = 0xf0;
 113       len = 4;
 114     }
 115   else if (c < 0x4000000)
 116     {
 117       first = 0xf8;
 118       len = 5;
 119     }
 120   else
 121     {
 122       first = 0xfc;
 123       len = 6;
 124     }
 125
 126   if (outbuf)
 127     {
 128       for (i = len - 1; i > 0; --i)
 129         {
 130           outbuf[i] = (c & 0x3f) | 0x80;
 131           c >>= 6;
 132         }
 133       outbuf[0] = c | first;
 134     }
 135
 136   return len;
 137 }
 138
 139 static gunichar
 140 g_utf8_get_char (const gchar *p)
 141 {
 142   int i, mask = 0, len;
 143   gunichar result;
 144   unsigned char c = (unsigned char) *p;
 145
 146   UTF8_COMPUTE (c, mask, len);
 147   if (len == -1)
 148     return (gunichar)-1;
 149   UTF8_GET (result, p, i, mask, len);
 150
 151   return result;
 152 }
 153
 154 static glong
 155 g_utf8_strlen (const gchar *p,
 156                gssize       max)
 157 {
 158   glong len = 0;
 159   const gchar *start = p;
 160   //g_return_val_if_fail (p != NULL || max == 0, 0);
 161         if (!(p != NULL || max == 0))
 162                 return 0;
 163
 164   if (max < 0)
 165     {
 166       while (*p)
 167         {
 168           p = g_utf8_next_char (p);
 169           ++len;
 170         }
 171     }
 172   else
 173     {
 174       if (max == 0 || !*p)
 175         return 0;
 176
 177       p = g_utf8_next_char (p);
 178
 179       while (p - start < max && *p)
 180         {
 181           ++len;
 182           p = g_utf8_next_char (p);
 183         }
 184
 185       /* only do the last len increment if we got a complete
 186        * char (don't count partial chars)
 187        */
 188       if (p - start == max)
 189         ++len;
 190     }
 191
 192   return len;
 193 }
 194
 195 using namespace std;
 196
 197 /*
 198
 199 The process:
 200 1. Sentence segmentation. (sentences_split)
 201 2. Separate "words" by spaces. (tokenize)
 202 3. Punctuation separation. (tokenize/tokenize_punctuation)
 203 4. Foreign/Abbreviation detection.
 204 5. Proper name detection.
 205 6. Generalization (into class e.g. number_class, foreign_class ...). Try to
 206 generalize all capitalized words.
 207 6* Syllable checking. (check1)
 208 7. Find all possible (misspelled) words. (**) (get_all_words)
 209 8. "pre-separate" sentence into phrases.
 210 9. Word segmentation. (**)
 211 10. Find the best segmentation. (segment_best)
 212 10* Word checking. (check2)
 213
 214 */
 215 /*
 216         namespace Spell {
 217 */
 218 bool VSpell::init()
 219 {
 220         dic_init();
 221
 222         cerr << "Loading dictionary... ";
 223         warch.load("wordlist");
 224         cerr << "done" << endl;
 225         cerr << "Loading ngram... ";
 226         File f("ngram","rt");
 227         if (!f.error()) {
 228                 get_ngram().read(f);
 229                 cerr << "done" << endl;
 230         } else
 231                 cerr << "Ngram loading error. The result may be incorrect" << endl;
 232         cerr << "Loading syngram... ";
 233         File ff("syngram","rt");
 234         if (!ff.error()) {
 235                 get_syngram().read(ff);
 236                 cerr << "done" << endl;
 237         }       else
 238                 cerr << "Syllable Ngram loading error. The result may be incorrect" << endl;
 239         sarch.set_blocked(true);
 240         return true;
 241 }
 242
 243 bool VSpell::check(const char *utf8_pp)
 244 {
 245         utf8_text = utf8_pp;
 246
 247         syllables.clear();
 248
 249         bool run = false;
 250
 251         do {
 252                 string pp = viet_to_viscii_force(utf8_text.c_str());
 253                 vector<string> pps;
 254                 sentences_split(pp,pps);
 255                 unsigned pps_i,pps_len = pps.size();
 256                 unsigned offset = 0;
 257                 text = pp;
 258
 259                 for (pps_i = 0;pps_i < pps_len;pps_i ++) {
 260                         Text *t = text_factory.create(this);
 261                         t->offset = offset;
 262                         t->length = pps[pps_i].size();
 263                         run = !t->sentence_check(pps[pps_i].c_str());
 264                         delete t;
 265                         if (run)
 266                                 break;
 267                         offset += pps[pps_i].size();
 268                 }
 269         } while (run);
 270         return true;
 271 }
 272
 273 void VSpell::replace(unsigned from,unsigned size,const char *s)
 274 {
 275         const char *p = utf8_text.c_str();
 276         unsigned i;
 277         for (i = 0;i < from && *p;i ++)
 278                 p = g_utf8_next_char(p);
 279         unsigned from1 = p - utf8_text.c_str();
 280         for (i = 0;i < size && *p;i ++)
 281                 p = g_utf8_next_char(p);
 282         unsigned to1 = p - utf8_text.c_str();
 283         utf8_text.replace(from1,to1-from1,s);
 284
 285         // remove separators in the range, adjust other seps
 286         int n = separators.size();
 287         if (n) {
 288                 int newsize = g_utf8_strlen(s,-1);
 289                 i = n;
 290                 do {
 291                         i --;
 292                         if (separators[i] >= from) {
 293                                 if (separators[i] < from+size)
 294                                         separators.erase(separators.begin()+i);
 295                                 else
 296                                         separators[i] += newsize-size;
 297                         }
 298                 } while (i > 0);
 299         }
 300 }
 301
 302 void VSpell::add_separators(const std::vector<unsigned> &seps)
 303 {
 304         copy(seps.begin(),seps.end(),back_inserter(separators));
 305 }
 306
 307 void VSpell::add_word(const char *s)
 308 {
 309         istringstream is(s);
 310         strid_string toks;
 311         string ss;
 312         while (is >> ss) {
 313                 toks += sarch[ss];
 314         }
 315         words.insert(toks);
 316 }
 317
 318 bool Text::sentence_check(const char *pp)
 319 {
 320         // preprocess
 321         st.set(pp);
 322         st.standardize();
 323         st.tokenize();
 324
 325         if (!st.get_syllable_count())   // nothing to do but crash ;)
 326                 return true;
 327
 328         // syllable checking
 329         if (!syllable_check() && !ui_syllable_check())
 330                 return false;
 331
 332         //w.construct(st);
 333         set<WordEntry> wes;
 334         WordStateFactories factories;
 335         ExactWordStateFactory exact;
 336         LowerWordStateFactory lower;
 337         UpperWordStateFactory upper;
 338         FuzzyWordStateFactory fuzzy;
 339         factories.push_back(&exact);
 340         factories.push_back(&lower);
 341         factories.push_back(&upper);
 342         factories.push_back(&fuzzy);
 343         w.pre_construct(st,wes,factories);
 344         mark_proper_name(st,wes);
 345         apply_separators(wes);
 346         w.post_construct(wes);
 347         //cerr << w << endl;
 348
 349         WordDAG dagw(&w);
 350         DAG *dag = &dagw;
 351
 352         WordDAG2 *dagw2;
 353         if (vspell->get_trigram()) {
 354                 dagw2 = new WordDAG2(&dagw);
 355                 dag = dagw2;
 356         }
 357
 358         Penalty2DAG p2dag(dag,vspell->get_penalty2());
 359         if (vspell->get_penalty2()) {
 360                 // with penalty2dag, we have to do non-fuzzy segmentation first
 361                 // to feed Penalty2DAG::syllable_weights
 362                 Segmentation p2seg;
 363                 penalty2_construct(p2seg);
 364                 p2dag.set_syllable_weights(p2seg);
 365                 dag = &p2dag;
 366         }
 367
 368         PenaltyDAG pdag(dag,vspell->get_penalty());
 369         if (vspell->get_penalty()) {
 370                 dag = &pdag;
 371         }
 372
 373         Path path;
 374         if (vspell->get_normalization()) {
 375                 Bellman pfs;
 376                 pfs.search(*dag,path);
 377         } else {
 378                 PFS pfs;
 379                 pfs.search(*dag,path);
 380         }
 381
 382         if (vspell->get_trigram()) {
 383                 dagw2->demangle(path);
 384                 delete dagw2;
 385         }
 386
 387         seg.resize(path.size()-2);
 388         // don't copy head/tail
 389         copy(path.begin()+1,path.end()-1,seg.begin());
 390         seg.we = w.we;
 391         //cerr << seg << endl;
 392
 393         // word checking
 394         if (!word_check() && !ui_word_check())
 395                 return false;
 396
 397         return true;                                                                    // done
 398 }
 399
 400 void Text::penalty2_construct(Segmentation &seg)
 401 {
 402         WordStateFactories factories;
 403         ExactWordStateFactory exact;
 404         LowerWordStateFactory lower;
 405         UpperWordStateFactory upper;
 406         //FuzzyWordStateFactory fuzzy;
 407
 408         factories.push_back(&exact);
 409         factories.push_back(&lower);
 410         factories.push_back(&upper);
 411         //factories.push_back(&fuzzy);
 412         Lattice lattice;
 413         set<WordEntry> wes;
 414         lattice.pre_construct(st,wes,factories);
 415         mark_proper_name(st,wes);
 416         //apply_separators(wes);
 417         lattice.post_construct(wes);
 418
 419         WordDAG dagw(&lattice);
 420         DAG *dag = &dagw;
 421
 422         WordDAG2 *dagw2;
 423         if (vspell->get_trigram()) {
 424                 dagw2 = new WordDAG2(&dagw);
 425                 dag = dagw2;
 426         }
 427
 428         Path path;
 429         if (vspell->get_normalization()) {
 430                 Bellman pfs;
 431                 pfs.search(*dag,path);
 432         } else {
 433                 PFS pfs;
 434                 pfs.search(*dag,path);
 435         }
 436
 437         if (vspell->get_trigram()) {
 438                 dagw2->demangle(path);
 439                 delete dagw2;
 440         }
 441
 442         seg.resize(path.size()-2);
 443         // don't copy head/tail
 444         copy(path.begin()+1,path.end()-1,seg.begin());
 445         seg.we = lattice.we;
 446 }
 447
 448
 449 bool Text::syllable_check(int i)
 450 {
 451         if (vspell->in_dict(st[i].get_id()))
 452                 return true;
 453
 454         if (sarch.in_dict(st[i].get_cid())) {
 455                 Syllable syl;                                                   // diacritic check
 456                 if (syl.parse(sarch[st[i].get_cid()])) {
 457                         string s = get_lowercased_syllable(syl.to_str());
 458                         if (get_lowercased_syllable(sarch[st[i].get_id()]) == s)
 459                                 return true;
 460                 }
 461         }
 462         return false;
 463 }
 464
 465 bool Text::syllable_check()
 466 {
 467         int i,n = st.get_syllable_count();
 468
 469         suggestions.clear();
 470
 471         for (i = 0;i < n;i ++) {
 472                 if (syllable_check(i))
 473                         continue;
 474
 475                 Suggestion _s;
 476                 _s.id = i;
 477                 suggestions.push_back(_s);
 478         }
 479         return suggestions.empty();
 480 }
 481
 482 bool Text::word_check()
 483 {
 484         int i,n = seg.size();
 485
 486         suggestions.clear();
 487
 488         for (i = 0;i < n;i ++) {
 489                 vector<strid> sylls;
 490                 strid_string sylls2;
 491                 int ii,len = seg[i].node->get_syllable_count();
 492                 seg[i].node->get_syllables(sylls);
 493
 494                 // case-sensitive comparation.
 495                 sylls2.resize(len);
 496                 bool subok = true;
 497                 for (ii = 0;ii < len;ii ++) {
 498                         sylls2[ii] = st[seg[i].pos+ii].get_id();
 499                         if (subok && st[seg[i].pos+ii].get_cid() != sylls[ii])
 500                                 subok = false;
 501                 }
 502
 503                 // in user dict
 504                 bool ok = vspell->in_dict(sylls2);
 505
 506                 if (!ok) {
 507                         strid_string sylls3;
 508                         sylls3.resize(sylls.size());
 509                         for (ii = 0;ii < sylls3.size();ii ++)
 510                                 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
 511
 512                         if (vspell->get_strict_word_checking()) {
 513                                 // don't care if the "true" word is lower-cased and the original one is valid upper-cased
 514                                 if (!subok &&
 515                                                 (is_all_capitalized_word(sylls2) ||
 516                                                  (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
 517                                         subok = true;
 518
 519                                 ok = subok;
 520                         } else {
 521                                 string s;
 522                                 BranchNode *branch = warch.get_root();
 523                                 LeafNode* leaf;
 524                                 for (ii = 0;ii < len && branch;ii ++)
 525                                         branch = branch->get_branch(st[seg[i].pos+ii].get_cid());
 526
 527                                 if (branch && (leaf = branch->get_leaf(sarch["<mainleaf>"])) != NULL) {
 528                                         sylls.clear();
 529                                         leaf->get_syllables(sylls);
 530                                         strid_string sylls3;
 531                                         sylls3.resize(sylls.size());
 532                                         for (ii = 0;ii < sylls3.size();ii ++)
 533                                                 sylls3[ii] = sarch[get_unstd_syllable(sarch[sylls[ii]])];
 534
 535                                         ok = sylls2 == sylls3;
 536
 537                                         // don't care if the "true" word is lower-cased and the original one is valid upper-cased
 538                                         if (!ok &&
 539                                                         (is_all_capitalized_word(sylls2) ||
 540                                                          (is_first_capitalized_word(sylls2) && is_lower_cased_word(sylls3))))
 541                                                 ok = true;
 542                                 }
 543                         }
 544                 }
 545
 546                 if (!ok) {
 547                         Suggestion _s;
 548                         _s.id = i;
 549                         suggestions.push_back(_s);
 550                         continue;
 551                 }
 552
 553                 // all syllable are syntatically valid
 554         }
 555         return suggestions.empty();
 556 }
 557
 558 unsigned Text::pos_from_syllable(const Suggestion &s)
 559 {
 560         return offset+st[s.id].start;
 561 }
 562
 563 unsigned Text::pos_from_word(const Suggestion &s)
 564 {
 565         //return offset+st[seg[s.id]].start;
 566         return 0;
 567 }
 568
 569 void Text::replace(unsigned from,unsigned size,const char *s)
 570 {
 571         vspell->replace(from+offset,size,s);
 572 }
 573
 574 int Text::utf8_pos(unsigned from)
 575 {
 576         const string &utf8_text = vspell->get_utf8_text();
 577         const char *p = utf8_text.c_str();
 578         const char *op = p;
 579         from += offset;
 580         for (int i = 0;i < from && *p;i ++)
 581                 p = g_utf8_next_char(p);
 582         return (int)(p - op);
 583 }
 584
 585 string Text::substr(unsigned from,unsigned size)
 586 {
 587         const string &utf8_text = vspell->get_utf8_text();
 588         const char *p = utf8_text.c_str();
 589         unsigned i;
 590         from += offset;
 591         for (i = 0;i < from && *p;i ++)
 592                 p = g_utf8_next_char(p);
 593         unsigned from1 = p - utf8_text.c_str();
 594         for (i = 0;i < size && *p;i ++)
 595                 p = g_utf8_next_char(p);
 596         unsigned to1 = p - utf8_text.c_str();
 597         return utf8_text.substr(from1,to1-from1);
 598 }
 599
 600 void Text::apply_separators(set<WordEntry> &wes)
 601 {
 602         vector<unsigned> seps;
 603         //set<unsigned> seps;
 604
 605         get_separators(seps);
 606         sort(seps.begin(),seps.end());
 607         //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
 608         int sep = 0;
 609         int i,n = st.get_syllable_count();
 610
 611         for (i = 0;i < n-1 && sep < seps.size();i ++) {
 612                 int p = offset+st[i].start+strlen(sarch[st[i].get_id()]);
 613                 if (p <= seps[sep] && seps[sep] <= offset+st[i+1].start) {
 614                         apply_separator(wes,i);
 615                         sep ++;
 616                 }
 617         }
 618 }
 619
 620 void Text::get_separators(vector<unsigned> &v)
 621 {
 622         const vector<unsigned> &vv = vspell->get_separators();
 623
 624         int i,n = vv.size();
 625         for (i = 0;i < n;i ++)
 626                 if (vv[i] >= offset && vv[i] < offset+length)
 627                         v.push_back(vv[i]);
 628 }
 629 /*
 630         }
 631 */
 632
 // Single-byte codes of the accented Vietnamese letters (VISCII, going by the
 // names of the conversion functions below): all lower-case letters first,
 // then the upper-case ones, in the order spelled out in the comment after
 // the table.  Entry i here corresponds to entry i of unicode_str.  The final
 // 0 marks the end of the table; viet_init() stops at it.
 // Note that a few upper-case letters use codes below 0x20
 // (0x02, 0x05, 0x06, 0x14, 0x19, 0x1e).
 633 static unsigned char viscii_str[] = {
 634         0xe1,0xe0,0xe4,0xe3,0xd5,
 635         0xe2,0xa4,0xa5,0xa6,0xe7,0xa7,
 636         0xe5,0xa1,0xa2,0xc6,0xc7,0xa3,
 637         0xe9,0xe8,0xeb,0xa8,0xa9,
 638         0xea,0xaa,0xab,0xac,0xad,
 639         0xae,0xed,0xec,0xef,0xee,0xb8,
 640         0xf3,0xf2,0xf6,0xf5,0xf7,
 641         0xf4,0xaf,0xb0,0xb1,0xb2,
 642         0xb5,0xbd,0xbe,0xb6,0xb7,0xde,
 643         0xfe,0xfa,0xf9,0xfc,0xfb,0xf8,
 644         0xdf,0xd1,0xd7,0xd8,0xe6,
 645         0xf1,0xfd,0xcf,0xd6,0xdb,0xdc,
 646         0xf0,
 // upper-case letters start here
 647         0xc1,0xc0,0xc4,0xc3,0x80,
 648         0xc2,0x84,0x85,0x86,0x06,0x87,
 649         0xc5,0x81,0x82,0x02,0x05,0x83,
 650         0xc9,0xc8,0xcb,0x88,0x89,
 651         0xca,0x8a,0x8b,0x8c,0x8d,0x8e,
 652         0xcd,0xcc,0x9b,0xce,0x98,
 653         0xd3,0xd2,0x99,0xa0,0x9a,
 654         0xd4,0x8f,0x90,0x91,0x92,
 655         0x93,0xb4,0x95,0x96,0x97,0xb3,
 656         0x94,0xda,0xd9,0x9c,0x9d,0x9e,
 657         0xbf,0xba,0xbb,0xbc,0xff,0xb9,
 658         0xdd,0x9f,0x14,0x19,0x1e,
 659         0xd0,
 660         0
 661 };
 662 //"a'a`a?a~a.a^a^'a^`a^?a^~a^.a(a('a(`a(?a(~a(.e'e`e?e~e.e^e^'e^`e^?e^~e^.i'i`i?i~i.o'o`o?o~o.o^o^'o^`o^?o^~o^.o+o+'o+`o+?o+~o+.u'u`u?u~u.u+u+'u+`u+?u+~u+.y'y`y?y~y.ddA'A`A?A~A.A^A^'A^`A^?A^~A^.A(A('A(`A(?A(~A(.E'E`E?E~E.E^E^'E^`E^?E^~E^.I'I`I?I~I.O'O`O?O~O.O^O^'O^`O^?O^~O^.O+O+'O+`O+?O+~O+.U'U`U?U~U.U+U+'U+`U+?U+~U+.Y'Y`Y?Y~Y.DD";
 663
 // Unicode code points (decimal) of the same letters as viscii_str, in the
 // same order: entry i here is the code point of the byte viscii_str[i].
 // Lower-case letters come first, upper-case letters second; the final 0
 // closes the table.
 664 static gunichar unicode_str[] = {
 665          225, 224,7843, 227,7841,
 666          226,7845,7847,7849,7851,7853,
 667          259,7855,7857,7859,7861,7863,
 668          233, 232,7867,7869,7865,
 669          234,7871,7873,7875,7877,
 670         7879, 237, 236,7881, 297,7883,
 671          243, 242,7887, 245,7885,
 672          244,7889,7891,7893,7895,
 673         7897, 417,7899,7901,7903,7905,
 674         7907, 250, 249,7911, 361,7909,
 675          432,7913,7915,7917,7919,
 676         7921, 253,7923,7927,7929,7925,
 677          273,
 // upper-case letters start here
 678          193, 192,7842, 195,7840,
 679          194,7844,7846,7848,7850,7852,
 680          258,7854,7856,7858,7860,7862,
 681          201, 200,7866,7868,7864,
 682          202,7870,7872,7874,7876,7878,
 683          205, 204,7880, 296,7882,
 684          211, 210,7886, 213,7884,
 685          212,7888,7890,7892,7894,
 686         7896, 416,7898,7900,7902,7904,
 687         7906, 218, 217,7910, 360,7908,
 688          431,7912,7914,7916,7918,7920,
 689          221,7922,7926,7928,7924,
 690          272,
 691         0
 692 };
 693
 // Lookup tables between the single-byte codes of viscii_str and the code
 // points of unicode_str, one per direction.  Both are empty until
 // viet_init() has filled them.
 694 static map<unsigned char,gunichar> viscii_utf8_map;
 695 static map<gunichar,unsigned char> utf8_viscii_map;
 696
 697 void viet_init()
 698 {
 699         for (unsigned i = 0;viscii_str[i];i ++) {
 700                 viscii_utf8_map[viscii_str[i]] = unicode_str[i];
 701                 utf8_viscii_map[unicode_str[i]] = viscii_str[i];
 702         }
 703 }
 704
 705 bool viet_utf8_to_viscii(const char *in,char *out) // pre-allocated
 706 {
 707         const gchar *p = in;
 708         gunichar ch;
 709         while ((ch = g_utf8_get_char(p)) != 0) {
 710                 p = g_utf8_next_char(p);
 711                 if (ch < 128) {
 712                         *out++ = ch;
 713                         continue;
 714                 }
 715
 716                 map<gunichar,unsigned char>::iterator iter;
 717                 iter = utf8_viscii_map.find(ch);
 718                 if (iter != utf8_viscii_map.end())
 719                         *out++ = (char)iter->second;
 720                 else {
 721                         fprintf(stderr,"Warning: unexpected unicode character %d",ch);
 722                         return false;
 723                 }
 724         }
 725         *out = 0;
 726         return true;
 727 }
 728
 729 bool viet_utf8_to_viscii_force(const char *in,char *out)
 730 {
 731         const gchar *p = in;
 732         gunichar ch;
 733         while ((ch = g_utf8_get_char(p)) != 0) {
 734                 p = g_utf8_next_char(p);
 735                 if (ch < 128) {
 736                         *out++ = ch;
 737                         continue;
 738                 }
 739
 740                 map<gunichar,unsigned char>::iterator iter;
 741                 iter = utf8_viscii_map.find(ch);
 742                 if (iter != utf8_viscii_map.end())
 743                         *out++ = (char)iter->second;
 744                 else {
 745                         fprintf(stderr,"Warning: unexpected unicode character %d",ch);
 746                         *out += 'z';
 747                 }
 748         }
 749         *out = 0;
 750         return true;
 751 }
 752
 753 void viet_viscii_to_utf8(const char *in,char *out) // pre-allocated
 754 {
 755         unsigned char *p = (unsigned char*)in;
 756         unsigned char ch;
 757         while ((ch = *p) != 0) {
 758                 p++;
 759                 if (ch < 128 && ch >= 32) {
 760                         *out++ = ch;
 761                         continue;
 762                 }
 763
 764                 map<unsigned char,gunichar>::iterator iter;
 765                 iter = viscii_utf8_map.find(ch);
 766                 if (iter != viscii_utf8_map.end()) {
 767                         g_unichar_to_utf8(iter->second,out);
 768                         out = g_utf8_next_char(out);
 769                 } else {
 770                         *out++ = ch;                                                    // fall-back case
 771                 }
 772
 773         }
 774         *out = 0;
 775 }
 776
 777 static char buffer[6000];
 778 char* viet_to_viscii(const char *in)
 779 {
 780         if (g_utf8_strlen(in,-1) >= 1000)
 781                 return "";
 782         if (viet_utf8_to_viscii(in,buffer))
 783                 return buffer;
 784         else
 785                 return NULL;
 786 }
 787
 788 char* viet_to_viscii_force(const char *in)
 789 {
 790         if (g_utf8_strlen(in,-1) >= 1000)
 791                 return "";
 792         viet_utf8_to_viscii_force(in,buffer);
 793         return buffer;
 794 }
 795
 796 char* viet_to_utf8(const char *in)
 797 {
 798         if (strlen(in) >= 1000)
 799                 return "";
 800         viet_viscii_to_utf8(in,buffer);
 801         return buffer;
 802 }
 803
 804 /*
 805 static char_traits_strid::char_type*
 806 char_traits_strid::copy(char_traits_strid::char_type* __s1,
 807                  const char_traits_strid::char_type* __s2,
 808                  size_t __n)
 809 {
 810         return static_cast<char_type*>(memcpy(__s1, __s2, __n*sizeof(char_type)));
 811 }
 812
 813 */
 814
 815 bool get_case_syllable_candidates(const char *input,Candidates &output, float v)
 816 {
 817         // There are only two acceptable cases:
 818         // 1. The first character is upper case, the rest is lower
 819         // 2. All are either lower or upper
 820         // if there is some upper case character without following one of these cases, then it's fault.
 821         // also, if there is a uppercase word in dictionary, then add it.
 822
 823         uint i,n = strlen(input);
 824         // check for upper-case chars
 825         for (i = n-1;i >= 0;i --)
 826                 if (viet_toupper(input[i]) == input[i])
 827                         break;
 828
 829         if (i <= 0 || n < 2)                                    // ignore if the only upper char is the first one.
 830                 return false;
 831
 832         string s;
 833
 834         s = input;
 835         s[0] = viet_toupper(s[0]);
 836         for (i = 1;i < n;i ++)
 837                 s[i] = viet_tolower(s[i]);
 838         if (s != string(input))
 839                 output.insert(s,v);
 840
 841         s = input;
 842         for (i = 0;i < n;i ++)
 843                 s[i] = viet_tolower(s[i]);
 844         if (s != string(input))
 845                 output.insert(s,v);
 846
 847         s = input;
 848         for (i = 0;i < n;i ++)
 849                 s[i] = viet_toupper(s[i]);
 850         if (s != string(input))
 851                 output.insert(s,v);
 852         return true;
 853 }
 854
 855 void get_phonetic_syllable_candidates(const char *input,Candidates &output,float v)
 856 {
 857         vector<confusion_set>& confusion_sets = get_confusion_sets();
 858         int i,j,m,n = confusion_sets.size();
 859         bool ret = false;
 860         set<Syllable> syllset,syllset2;
 861         Syllable _syll;
 862
 863         _syll.parse(input);
 864         syllset2.insert(_syll);
 865         while (!syllset2.empty()) {
 866                 const Syllable sy = *syllset2.begin();
 867                 syllset2.erase(syllset2.begin());
 868
 869                 if (syllset.find(sy) != syllset.end())
 870                         continue;                                                               // we already matched&applied this syllable
 871
 872                 //cerr << sy << endl;
 873                 syllset.insert(sy);
 874
 875                 vector<Syllable> sylls;
 876                 // match & apply
 877                 for (i = 0;i < n;i ++) {
 878                         m = confusion_sets[i].size();
 879                         for (j = 0;j < m;j ++)
 880                                 if (confusion_sets[i][j].match(sy))
 881                                         break;
 882                         if (j < m) {
 883                                 for (j = 0;j < m;j ++)
 884                                         confusion_sets[i][j].apply(sy,sylls);
 885                         }
 886                 }
 887                 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
 888         }
 889
 890         // move to _nodes
 891         //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
 892         set<Syllable>::iterator iter;
 893         for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
 894                 string s = iter->to_std_str();
 895                 string ss = get_lowercased_syllable(s);
 896                 //cerr << s << endl;
 897                 if (sarch.in_dict(sarch[s]) ||
 898                                 sarch.in_dict(sarch[ss]))
 899                         output.insert(iter->to_str(),v+1);
 900         }
 901 }
 902
 903 void get_syllable_candidates(const char *input,Candidates &output,float v)
 904 {
 905         Syllable syll;
 906         string s,s2;
 907         set<string> s3;
 908         set<string>::iterator s3i;
 909
 910
 911         // bo dau sai vi tri
 912         if (syll.parse(input) &&
 913                         syll.to_str() != string(input))
 914                 output.insert(syll.to_str(),v+10);
 915
 916         get_phonetic_syllable_candidates(input,output,v);
 917
 918         KeyRecover keyr;
 919         keyr.init(input);
 920         while (keyr.step(s)) {
 921                 s2 = get_std_syllable(s);
 922                 if (s2 != s && syll.parse(s2.c_str()))
 923                         output.insert(syll.to_str(),v);
 924                 s3.clear();
 925                 im_recover(s.c_str(),s3);
 926                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 927                         s2 = get_std_syllable(*s3i);
 928                         if (s2 != *s3i && syll.parse(s2.c_str()))
 929                                 output.insert(syll.to_str(),v);
 930                 }
 931         }
 932         keyr.done();
 933
 934         // SpaceInserter
 935         uint i,n = strlen(input);
 936         for (i = 1;i < n;i ++) {
 937                 s = string(input).substr(0,i);
 938                 s2 = string(input).substr(i);
 939                 if (syll.parse(s.c_str())) {
 940                         s = syll.to_str();
 941                         if (syll.parse(s2.c_str())) {
 942                                 output.insert(s + string(" ") + syll.to_str(),v);
 943                         }
 944                 }
 945         }
 946
 947         CharInserter inserter;
 948         inserter.init(input);
 949         while (inserter.step(s)) {
 950                 s2 = get_std_syllable(s);
 951                 if (s2 != s && syll.parse(s2.c_str()))
 952                         output.insert(syll.to_str(),v);
 953                 s3.clear();
 954                 im_recover(s.c_str(),s3);
 955                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 956                         s2 = get_std_syllable(*s3i);
 957                         if (s2 != *s3i && syll.parse(s2.c_str()))
 958                                 output.insert(syll.to_str(),v);
 959                 }
 960         }
 961         inserter.done();
 962
 963         CharEraser eraser;
 964         eraser.init(input);
 965         while (eraser.step(s)) {
 966                 s2 = get_std_syllable(s);
 967                 if (s2 != s && syll.parse(s2.c_str()))
 968                         output.insert(syll.to_str(),v);
 969                 s3.clear();
 970                 im_recover(s.c_str(),s3);
 971                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 972                         s2 = get_std_syllable(*s3i);
 973                         if (s2 != *s3i && syll.parse(s2.c_str()))
 974                                 output.insert(syll.to_str(),v);
 975                 }
 976         }
 977         eraser.done();
 978
 979         CharTransposer transposer;
 980         transposer.init(input);
 981         while (transposer.step(s)) {
 982                 s2 = get_std_syllable(s);
 983                 if (s2 != s && syll.parse(s2.c_str()))
 984                         output.insert(syll.to_str(),v);
 985                 s3.clear();
 986                 im_recover(s.c_str(),s3);
 987                 for (s3i = s3.begin(); s3i != s3.end(); ++ s3i) {
 988                         s2 = get_std_syllable(*s3i);
 989                         if (s2 != *s3i && syll.parse(s2.c_str()))
 990                                 output.insert(syll.to_str(),v);
 991                 }
 992         }
 993         transposer.done();
 994 }
 995
 996 void get_left_syllable_candidates(const char *input,const char *left,Candidates &output)
 997 {
 998         // merge
 999         string s;
1000         s = string(left)+string(input);
1001         get_syllable_candidates(s.c_str(),output,10);
1002
1003         // cut one char from input
1004         if (strlen(input) > 1) {
1005                 s = string(input+1);
1006                 get_syllable_candidates(s.c_str(),output,10);
1007         }
1008
1009         // insert one char from left to input
1010         if (strlen(left)) {
1011                 s = string(" ") + string(input);
1012                 s[0] = left[strlen(left)-1];
1013                 get_syllable_candidates(s.c_str(),output,10);
1014         }
1015 }
1016
1017 void get_right_syllable_candidates(const char *input,const char *right,Candidates &output)
1018 {
1019         // merge
1020         string s;
1021         s = string(input)+string(right);
1022         get_syllable_candidates(s.c_str(),output,10);
1023
1024         // cut one char from input
1025         if (strlen(input) > 1) {
1026                 s = string(input);
1027                 s.resize(strlen(input)-1);
1028                 get_syllable_candidates(s.c_str(),output,10);
1029         }
1030
1031         // insert one char from right to input
1032         if (strlen(right)) {
1033                 s = string(input)+string(" ");
1034                 s[s.size()-1] = right[0];
1035                 get_syllable_candidates(s.c_str(),output,10);
1036         }
1037 }
1038
1039
1040
1041 void Candidates::insert(const std::string &s,float f)
1042 {
1043         Candidate c;
1044         c.candidate = s;
1045         c.priority = f;
1046         set<Candidate>::iterator iter = candidates.find(c);
1047         if (iter != candidates.end()) {
1048                 if (iter->priority < c.priority)
1049                         candidates.erase(iter);
1050                 else
1051                         return;
1052         }
1053         candidates.insert(c);
1054 }
1055
1056 bool Candidates::CandidateComparator::operator()(const std::string &s1,const std::string &s2)
1057 {
1058         set<Candidate>::iterator i1,i2;
1059         Candidate c1,c2;
1060         c1.candidate = s1;
1061         c2.candidate = s2;
1062         i1 = c.candidates.find(c1);
1063         i2 = c.candidates.find(c2);
1064         if (i1->priority != i2->priority)
1065                 return i1->priority > i2->priority;
1066         float f1,f2;
1067         VocabIndex v;
1068         v = Vocab_None;
1069         f1 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s1))],&v);
1070         f2 = -get_syngram().wordProb(sarch[get_std_syllable(get_lowercased_syllable(s2))],&v);
1071         //cerr << f1 << "<>" << f2 << endl;
1072         return f1 > f2; // we want reverse order
1073 }
1074
1075 void Candidates::get_list(std::vector<std::string> &v)
1076 {
1077         set<Candidate>::iterator iter;
1078
1079         iter = candidates.begin();
1080         while (iter != candidates.end()) {
1081                 if (!is_valid_word_form(iter->candidate.c_str())) {
1082                         get_case_syllable_candidates(iter->candidate.c_str(),*this,iter->priority);
1083                         candidates.erase(iter++);
1084                 } else
1085                         ++iter;
1086         }
1087
1088         v.resize(candidates.size());
1089         uint n = 0;
1090         for (iter = candidates.begin();iter != candidates.end(); ++iter)
1091                 if (sarch.in_dict(get_std_syllable(get_lowercased_syllable(iter->candidate))))
1092                         v[n++] = iter->candidate;
1093         v.resize(n);
1094         sort(v.begin(),v.end(),CandidateComparator(*this));
1095 }