libvspell/words.cpp

   1 #include "config.h"                                                     // -*- tab-width: 2 -*-
   2 #include "spell.h"
   3 #include <iterator>
   4 #include <algorithm>
   5 #include <sstream>
   6 #include <iostream>
   7 #include <fstream>
   8 #include <boost/format.hpp>
   9 #include <set>
  10 #include "syllable.h"
  11 #include "propername.h"
  12 #ifndef _SArray_cc_
  13 #include <libsrilm/SArray.cc>
  14 #endif
  15
  16
  17 using namespace std;
  18
  19 WordState::WordState(const WordState &ws):dnode(ws.dnode),sent(ws.sent),fuzid(ws.fuzid),pos(ws.pos),len(ws.len)
  20 {
  21 }
  22
  23 void WordState::add_word(set<WordEntry> &we,LeafNode*leaf)
  24 {
  25         WordEntry e;
  26         e.pos = pos;
  27         e.len = len;
  28         e.fuzid = fuzid;
  29         e.node = leaf;
  30         //cerr << "Add " << e << endl;
  31         we.insert(e);
  32 }
  33
  34 void WordState::collect_words(set<WordEntry> &we)
  35 {
  36         LeafNode *leaf = dnode.node->get_leaf(sarch["<mainleaf>"]);
  37         if (leaf)
  38                 add_word(we,leaf);
  39 }
  40
  41 void UpperWordState::collect_words(set<WordEntry> &we)
  42 {
  43         LeafNode *leaf = dnode.node->get_leaf(sarch["<caseleaf>"]);
  44         if (leaf)
  45                 add_word(we,leaf);
  46 }
  47
  48 void WordState::get_first(WordStates &states,uint _pos)
  49 {
  50         dnode.node = warch.get_root();
  51         pos = _pos;
  52         fuzid = 0;
  53         len = 0;
  54         get_next(states);
  55 }
  56
  57 void ExactWordState::get_next(WordStates &states)
  58 {
  59         uint i = pos+len;
  60         BranchNode *branch = dnode.node->get_branch(sent[i].get_cid());
  61         if (branch == NULL) {
  62                 delete this;
  63                 return;
  64         }
  65         //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
  66         states.push_back(this);
  67         // change the info
  68         dnode = branch;
  69         len ++;
  70 }
  71
  72 void LowerWordState::get_next(WordStates &states)
  73 {
  74         string s1,s2;
  75         uint i = pos+len;
  76         s1 = get_sarch()[sent[i].get_cid()];
  77         s2 = get_lowercased_syllable(s1);
  78         BranchNode *branch = dnode.node->get_branch(get_sarch()[s2]);
  79         if (branch == NULL) {
  80                 delete this;
  81                 return;
  82         }
  83         //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
  84         states.push_back(this);
  85         // change the info
  86         dnode = branch;
  87         len ++;
  88         if (s1 != s2)
  89                 fuzid |= 1 << (i-pos);
  90 }
  91
  92 void FuzzyWordState::get_next(WordStates &states)
  93 {
  94         vector<confusion_set>& confusion_sets = get_confusion_sets();
  95         int i,j,m,n = confusion_sets.size();
  96         bool ret = false;
  97         set<Syllable> syllset,syllset2;
  98         Syllable _syll;
  99         uint _i = pos+len;
 100         string s1 = get_sarch()[sent[_i].get_cid()];
 101
 102         _syll.parse(s1.c_str());
 103
 104         syllset2.insert(_syll);
 105         while (!syllset2.empty()) {
 106                 const Syllable sy = *syllset2.begin();
 107                 syllset2.erase(syllset2.begin());
 108
 109                 if (syllset.find(sy) != syllset.end())
 110                         continue;                                                               // we already matched&applied this syllable
 111
 112                 //cerr << sy << endl;
 113                 syllset.insert(sy);
 114
 115
 116                 vector<Syllable> sylls;
 117                 // match & apply
 118                 for (i = 0;i < n;i ++) {
 119                         m = confusion_sets[i].size();
 120                         for (j = 0;j < m;j ++)
 121                                 if (confusion_sets[i][j].match(sy)) {
 122                                         //cerr << "Match " << i << " " << j << endl;
 123                                         break;
 124                                 }
 125                         if (j < m) {
 126                                 for (j = 0;j < m;j ++) {
 127                                         confusion_sets[i][j].apply(sy,sylls);
 128                                         //cerr << "Apply " << i << " " << j << endl;
 129                                 }
 130                         }
 131                 }
 132                 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
 133         }
 134
 135         // move to _nodes
 136         //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
 137         set<Syllable>::iterator iter;
 138         for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
 139                 //cerr << iter->to_std_str() << endl;
 140                 string str = get_lowercased_syllable(iter->to_std_str());
 141                 BranchNode::const_np_range range = dnode.node->get_nodes().equal_range(get_sarch()[str]);
 142                 BranchNode::node_map::const_iterator pnode;
 143                 for (pnode = range.first;pnode != range.second;++pnode)
 144                         if (!pnode->second->is_leaf()) {
 145                                 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
 146                                 FuzzyWordState *s = new FuzzyWordState(*this);
 147
 148                                 // change the info
 149                                 s->dnode.node = (BranchNode*)pnode->second.get();
 150                                 s->len++;
 151                                 if (s1 != str)
 152                                         s->fuzid |= 1 << (_i-s->pos);
 153                                 states.push_back(s);
 154                                 //cerr << nodes[ii] << endl;
 155                         }
 156         }
 157
 158         delete this;
 159 }
 160
 161 /**
 162         Find all possible words. The whole process is divided into two
 163         phases. The first one is pre_construct(), which creates base words
 164         and store in set<WordEntry>. The second one is post_construct(),
 165         which does the rest. Using pre_construct(),post_construct() directly
 166         give chances to modify the lattice creation.
 167
 168         \param sent specify the input sentence
 169         \param w must be cleared before calling this function
 170 */
 171 void Lattice::construct(const Sentence &sent)
 172 {
 173         set<WordEntry> wes;
 174         WordStateFactories factories;
 175         ExactWordStateFactory exact;
 176         LowerWordStateFactory lower;
 177         UpperWordStateFactory upper;
 178         FuzzyWordStateFactory fuzzy;
 179         factories.push_back(&exact);
 180         factories.push_back(&lower);
 181         factories.push_back(&upper);
 182         factories.push_back(&fuzzy);
 183         pre_construct(sent,wes,factories);
 184         mark_proper_name(sent,wes);
 185         post_construct(wes);
 186 }
 187
 188 /**
 189         The first phase of lattice creation. Create all possible words to we.
 190  */
 191
 192 void Lattice::pre_construct(const Sentence &sent,set<WordEntry> &we,const WordStateFactories &f)
 193 {
 194         Lattice &w = *this;
 195         int i,n,ii,nn,k,nnn,iii;
 196         int fi,fn = f.size();
 197
 198         //cerr << "construct\n";
 199
 200         w.st = &sent;
 201
 202         WordStates states1;
 203         WordStates states2;
 204         states1.reserve(10);
 205         states2.reserve(10);
 206
 207         n = sent.get_syllable_count();
 208
 209         for (i = 0;i < n;i ++) {
 210
 211                 //cerr << *this << endl;
 212
 213                         states2.clear();
 214
 215                 // new states
 216                 for (fi = 0;fi < fn;fi ++)
 217                         f[fi]->create_new(states2,i,sent);
 218
 219                 // move old states to new states
 220                 nn = states1.size();
 221                 for (ii = 0;ii < nn;ii ++)
 222                         // state1[ii].get_next() have to delete itself if necessary.
 223                         states1[ii]->get_next(states2);
 224
 225                 // get completed words
 226                 nn = states2.size();
 227                 for (ii = 0;ii < nn;ii ++)
 228                         states2[ii]->collect_words(we);
 229
 230                 states1.swap(states2);
 231         }
 232
 233         //cerr << *this << endl;
 234
 235         nn = states1.size();
 236         for (ii = 0;ii < nn;ii ++)
 237                 delete states1[ii];
 238         //cerr << *this << endl;
 239
 240 }
 241
 242 /**
 243          The second phase of lattice creation.
 244  */
 245
 246 bool we_pos_cmp(const WordEntry &w1,const WordEntry &w2)
 247 {
 248         return w1.pos < w2.pos;
 249 }
 250
 251 void Lattice::post_construct(set<WordEntry> &we)
 252 {
 253         Lattice &w = *this;
 254         unsigned i,n,k;
 255
 256         n = st->get_syllable_count();
 257
 258         uint max = 0,len = 0;
 259         while (max < n) {
 260
 261                 // find out how far we can go from head (pos -> max, len -> len)
 262                 set<int> traces;
 263                 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr;
 264                 set<WordEntry>::iterator iter;
 265                 traces.insert(max);
 266                 while (!traces.empty()) {
 267                         uint ii, nn = 0;
 268                         WordEntry fake_entry;
 269                         fake_entry.pos = *traces.begin();
 270                         traces.erase(traces.begin());
 271                         pr = equal_range(we.begin(),we.end(),fake_entry,we_pos_cmp);
 272                         for (iter = pr.first;iter != pr.second; ++iter) {
 273                                 if (iter->pos+iter->len < n)
 274                                         traces.insert(iter->pos+iter->len);
 275                                 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr2;
 276                                 set<WordEntry>::iterator iter2;
 277                                 WordEntry fake_entry2;
 278                                 fake_entry2.pos = iter->pos+iter->len;
 279                                 pr2 = equal_range(we.begin(),we.end(),fake_entry2,we_pos_cmp);
 280                                 if (pr2.first == pr2.second && len < iter->pos+iter->len) {
 281                                         max = iter->pos;
 282                                         len = iter->pos+iter->len;
 283                                 }
 284                         }
 285                 }
 286
 287                 // Yee, we're coming tail!
 288                 if (len >= n)
 289                         break;
 290
 291                 // make sure that one can go from source to destination, without gaps.
 292                 WordEntry e;
 293                 e.pos = len;
 294                 e.len = 1;
 295                 e.fuzid = 0;
 296                 // if one starts with a cardinal number, then mark it number_id
 297                 string s = get_sarch()[(*st)[max].get_cid()];
 298                 //cerr << "Consider " << s;
 299                 if (strchr("0123456789",s[1]) != NULL)
 300                         e.node = get_special_node(NUMBER_ID);
 301                 else {
 302                         int iiii,nnnn = s.size();
 303                         for (iiii = 0;iiii < nnnn;iiii ++)
 304                                 if (viet_ispunct(s[i]))
 305                                         break;
 306                         if (iiii < nnnn)
 307                                 e.node = get_special_node(PUNCT_ID);
 308                         else
 309                                 e.node = get_special_node(UNK_ID);
 310                 }
 311                 max = len+1;
 312                 len = max;
 313                 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
 314                 we.insert(e);
 315         }
 316
 317         //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
 318         // copy to real _we
 319         n = we.size();
 320         w.we = boost::shared_ptr<WordEntries>(new WordEntries(n));
 321         //w.we->resize(n);
 322         copy(we.begin(),we.end(),w.we->begin());
 323         for (i = 0;i < n;i ++) {
 324                 (*w.we)[i].id = i;
 325         }
 326         // build Lattice structure
 327         w.construct();
 328 }
 329
 330 /**
 331          Post-process after initialize core member of Lattice
 332  */
 333
 334 void Lattice::construct()
 335 {
 336         int i_we,n_we = we->size();
 337
 338         for (i_we = 0;i_we < n_we;i_we ++) {
 339                 //cerr << "=" << i_we << endl;
 340                 add((*we)[i_we]);
 341         }
 342 }
 343 /**
 344          Dump a Lattice
 345  */
 346 ostream& operator << (ostream &os, const Lattice &w)
 347 {
 348         int i,n;
 349
 350         if (!!w.we) {
 351                 n = w.we->size();
 352                 for (i = 0;i < n;i ++)
 353                         os << (*w.we)[i] << endl;
 354         }
 355         /*
 356         n = w.wi.size();
 357         for (i = 0;i < n;i ++) {
 358                 int ii,nn = w.wi[i].we.size();
 359                 if (nn) cerr << ">" << i << " ";
 360                 for (ii = 0;ii < nn;ii ++)
 361                         cerr << w.wi[i].we[ii] << " ";
 362                 if (nn)
 363                         cerr << endl;
 364                 nn = w.wi[i].fuzzy_map.size();
 365                 if (nn) cerr << "<" << i << " ";
 366                 for (ii = 0;ii < nn;ii ++)
 367                         cerr << w.wi[i].fuzzy_map[ii] << " ";
 368                 if (nn)
 369                         cerr << endl;
 370         }
 371         */
 372         /*
 373         int i, nn = w.get_word_count();
 374         for (i = 0;i < nn;i ++) {
 375                 int nnn = w.get_len(i);
 376                 for (int ii = 0;ii < nnn;ii ++) {
 377                         int nnnn = w.get_fuzzy_count(i,ii);
 378                         if (w.get_we_exact(i,ii))
 379                                 os << *w.get_we_exact(i,ii) << endl;
 380                         for (int iii = 0;iii < nnnn;iii ++)
 381                                 os << w.get_we_fuzzy(i,ii,iii) << endl;
 382                 }
 383         }
 384         */
 385         return os;
 386 }
 387
 388 Lattice::~Lattice()
 389 {
 390         /*
 391         int pos,nr_pos = get_word_count();
 392         for (pos = 0;pos < nr_pos;pos ++) {
 393                 int len,nr_len = get_len(pos);
 394                 for (len = 0;len < nr_len;len ++)
 395                         delete (*(*this)[pos])[len];
 396                 delete (*this)[pos];
 397         }
 398         */
 399 }
 400
 401 std::ostream& operator << (std::ostream &os,const WordEntry &we)
 402 {
 403         using namespace boost;
 404         os << format("%d %d %x %d") % we.pos % we.len % we.fuzid % we.id << we.node;
 405         return os;
 406 }
 407
 408
 409 /**
 410          Construct a Lattice based on another Lattice.
 411          Keep only exact matches.
 412  */
 413
 414 void Lattice::based_on(const Lattice &w)
 415 {
 416         vector<WordInfos> &me = wi;
 417         we = w.we;
 418         st = w.st;
 419
 420         me.resize(w.get_word_count());
 421         int i,n = we->size();
 422         for (i = 0;i < n;i ++)
 423                 if ((*we)[i].fuzid == 0)
 424                         add((*we)[i]);
 425 }
 426
 427 void Lattice::add(WordEntry &w)
 428 {
 429         vector<WordInfos> &me = wi;
 430         if (me.size() <= w.pos)
 431                 me.resize(w.pos+1);
 432
 433         if (w.fuzid) {
 434                 for (unsigned int j = 0;j < w.len;j ++)
 435                         if (w.fuzid & (1 << j)) {
 436                                 if (me.size() <= j+w.pos)
 437                                         me.resize(j+w.pos+1);
 438                                 me[j+w.pos].fuzzy_map.push_back(&w);
 439                         }
 440         }
 441
 442         WordInfos &wis = me[w.pos];
 443         wis.we.push_back(&w);
 444 }
 445
 446 WordInfos::WordInfos()
 447 {
 448 }
 449
 450 unsigned int Lattice::get_len(unsigned int p) const
 451 {
 452         const WordEntryRefs &we = get_we(p);
 453         return we.size();
 454 }
 455
 456 void apply_separator(std::set<WordEntry> &wes,int i)
 457 {
 458         set<WordEntry>::iterator iter;
 459         //cerr << "Separator applying after " << i << endl;
 460         for (iter = wes.begin();iter != wes.end(); ++iter) {
 461                 //cerr << *iter << endl;
 462                 if (iter->pos <= i) {
 463                         if (iter->pos+iter->len-1 >= i+1)       { // a word that across the separator
 464                                 //cerr << "Remove " << *iter << endl;
 465                                 wes.erase(iter);
 466                         }
 467                 } else
 468                                 break;                                                          // because it's sorted, we don't need  to go farther
 469         }
 470 }