Fixed leaks in LM::operator[](const char *) and LM::clear_oov()
[vspell.git] / libvspell / words.cpp
blob56d601e814e444c3162d3cebfce58525c4a6df7c
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "spell.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
10 #include "syllable.h"
11 #include "propername.h"
14 using namespace std;
16 WordState::WordState(const WordState &ws):dnode(ws.dnode),sent(ws.sent),fuzid(ws.fuzid),pos(ws.pos),len(ws.len)
20 void WordState::add_word(set<WordEntry> &we,LeafNNode*leaf)
22 WordEntry e;
23 e.pos = pos;
24 e.len = len;
25 e.fuzid = fuzid;
26 e.node = leaf;
27 //cerr << "Add " << e << endl;
28 we.insert(e);
31 void WordState::collect_words(set<WordEntry> &we)
33 LeafNNode *leaf = dnode.node->get_leaf(sarch["<mainleaf>"]);
34 if (leaf)
35 add_word(we,leaf);
38 void UpperWordState::collect_words(set<WordEntry> &we)
40 LeafNNode *leaf = dnode.node->get_leaf(sarch["<caseleaf>"]);
41 if (leaf)
42 add_word(we,leaf);
45 void WordState::get_first(WordStates &states,uint _pos)
47 dnode.node = warch.get_root();
48 pos = _pos;
49 fuzid = 0;
50 len = 0;
51 get_next(states);
54 void ExactWordState::get_next(WordStates &states)
56 uint i = pos+len;
57 BranchNNode *branch = dnode.node->get_branch(sent[i].get_cid());
58 if (branch == NULL) {
59 delete this;
60 return;
62 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
63 states.push_back(this);
64 // change the info
65 dnode = branch;
66 len ++;
69 void LowerWordState::get_next(WordStates &states)
71 string s1,s2;
72 uint i = pos+len;
73 s1 = get_ngram()[sent[i].get_cid()];
74 s2 = get_lowercased_syllable(s1);
75 BranchNNode *branch = dnode.node->get_branch(get_ngram()[s2]);
76 if (branch == NULL) {
77 delete this;
78 return;
80 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
81 states.push_back(this);
82 // change the info
83 dnode = branch;
84 len ++;
85 if (s1 != s2)
86 fuzid |= 1 << (i-pos);
89 void FuzzyWordState::get_next(WordStates &states)
91 vector<confusion_set>& confusion_sets = get_confusion_sets();
92 int i,j,m,n = confusion_sets.size();
93 bool ret = false;
94 set<Syllable> syllset,syllset2;
95 Syllable _syll;
96 uint _i = pos+len;
97 string s1 = get_ngram()[sent[_i].get_cid()];
99 _syll.parse(s1.c_str());
101 syllset2.insert(_syll);
102 while (!syllset2.empty()) {
103 const Syllable sy = *syllset2.begin();
104 syllset2.erase(syllset2.begin());
106 if (syllset.find(sy) != syllset.end())
107 continue; // we already matched&applied this syllable
109 //cerr << sy << endl;
110 syllset.insert(sy);
113 vector<Syllable> sylls;
114 // match & apply
115 for (i = 0;i < n;i ++) {
116 m = confusion_sets[i].size();
117 for (j = 0;j < m;j ++)
118 if (confusion_sets[i][j].match(sy)) {
119 //cerr << "Match " << i << " " << j << endl;
120 break;
122 if (j < m) {
123 for (j = 0;j < m;j ++) {
124 confusion_sets[i][j].apply(sy,sylls);
125 //cerr << "Apply " << i << " " << j << endl;
129 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
132 // move to _nodes
133 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
134 set<Syllable>::iterator iter;
135 for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
136 //cerr << iter->to_std_str() << endl;
137 string str = get_lowercased_syllable(iter->to_std_str());
138 BranchNNode::const_np_range range = dnode.node->get_nodes().equal_range(get_ngram()[str]);
139 BranchNNode::node_map::const_iterator pnode;
140 for (pnode = range.first;pnode != range.second;++pnode)
141 if (!pnode->second->is_leaf()) {
142 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
143 FuzzyWordState *s = new FuzzyWordState(*this);
145 // change the info
146 s->dnode.node = (BranchNNode*)pnode->second.get();
147 s->len++;
148 if (s1 != str)
149 s->fuzid |= 1 << (_i-s->pos);
150 states.push_back(s);
151 //cerr << nodes[ii] << endl;
155 delete this;
159 Find all possible words. The whole process is divided into two
160 phases. The first one is pre_construct(), which creates base words
161 and store in set<WordEntry>. The second one is post_construct(),
162 which does the rest. Using pre_construct(),post_construct() directly
163 give chances to modify the lattice creation.
165 \param sent specify the input sentence
166 \param w must be cleared before calling this function
168 void Lattice::construct(const boost::shared_ptr<const Sentence> &sent)
170 set<WordEntry> wes;
171 WordStateFactories factories;
172 ExactWordStateFactory exact;
173 LowerWordStateFactory lower;
174 UpperWordStateFactory upper;
175 FuzzyWordStateFactory fuzzy;
176 factories.push_back(&exact);
177 factories.push_back(&lower);
178 factories.push_back(&upper);
179 factories.push_back(&fuzzy);
180 pre_construct(sent,wes,factories);
181 mark_proper_name(*sent,wes);
182 post_construct(wes);
186 The first phase of lattice creation. Create all possible words to we.
189 void Lattice::pre_construct(const boost::shared_ptr<const Sentence> &st,set<WordEntry> &we,const WordStateFactories &f)
191 Lattice &w = *this;
192 int i,n,ii,nn,k,nnn,iii;
193 int fi,fn = f.size();
195 //cerr << "construct\n";
197 w.st = st;
199 WordStates states1;
200 WordStates states2;
201 states1.reserve(10);
202 states2.reserve(10);
204 n = st->get_syllable_count();
206 for (i = 0;i < n;i ++) {
208 //cerr << *this << endl;
210 states2.clear();
212 // new states
213 for (fi = 0;fi < fn;fi ++)
214 f[fi]->create_new(states2,i,*st);
216 // move old states to new states
217 nn = states1.size();
218 for (ii = 0;ii < nn;ii ++)
219 // state1[ii].get_next() have to delete itself if necessary.
220 states1[ii]->get_next(states2);
222 // get completed words
223 nn = states2.size();
224 for (ii = 0;ii < nn;ii ++)
225 states2[ii]->collect_words(we);
227 states1.swap(states2);
230 //cerr << *this << endl;
232 nn = states1.size();
233 for (ii = 0;ii < nn;ii ++)
234 delete states1[ii];
235 //cerr << *this << endl;
240 The second phase of lattice creation.
243 bool we_pos_cmp(const WordEntry &w1,const WordEntry &w2)
245 return w1.pos < w2.pos;
248 void Lattice::post_construct(set<WordEntry> &we)
250 Lattice &w = *this;
251 unsigned i,n,k;
253 n = st->get_syllable_count();
255 uint max = 0,len = 0;
256 while (max < n) {
258 // find out how far we can go from head (pos -> max, len -> len)
259 set<int> traces;
260 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr;
261 set<WordEntry>::iterator iter;
262 traces.insert(max);
263 while (!traces.empty()) {
264 uint ii, nn = 0;
265 WordEntry fake_entry;
266 fake_entry.pos = *traces.begin();
267 traces.erase(traces.begin());
268 pr = equal_range(we.begin(),we.end(),fake_entry,we_pos_cmp);
269 for (iter = pr.first;iter != pr.second; ++iter) {
270 if (iter->pos+iter->len < n)
271 traces.insert(iter->pos+iter->len);
272 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr2;
273 set<WordEntry>::iterator iter2;
274 WordEntry fake_entry2;
275 fake_entry2.pos = iter->pos+iter->len;
276 pr2 = equal_range(we.begin(),we.end(),fake_entry2,we_pos_cmp);
277 if (pr2.first == pr2.second && len < iter->pos+iter->len) {
278 max = iter->pos;
279 len = iter->pos+iter->len;
284 // Yee, we're coming tail!
285 if (len >= n)
286 break;
288 // make sure that one can go from source to destination, without gaps.
289 WordEntry e;
290 e.pos = len;
291 e.len = 1;
292 e.fuzid = 0;
293 // if one starts with a cardinal number, then mark it number_id
294 string s = get_ngram()[(*st)[max].get_cid()];
295 //cerr << "Consider " << s;
296 if (strchr("0123456789",s[1]) != NULL)
297 e.node = get_special_node(NUMBER_ID);
298 else {
299 int iiii,nnnn = s.size();
300 for (iiii = 0;iiii < nnnn;iiii ++)
301 if (viet_ispunct((unsigned char)s[iiii]))
302 break;
303 if (iiii < nnnn)
304 e.node = get_special_node(PUNCT_ID);
305 else
306 e.node = get_special_node(UNK_ID);
308 max = len+1;
309 len = max;
310 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
311 we.insert(e);
314 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
315 // copy to real _we
316 n = we.size();
317 w.we = boost::shared_ptr<WordEntries>(new WordEntries(n));
318 //w.we->resize(n);
319 copy(we.begin(),we.end(),w.we->begin());
320 for (i = 0;i < n;i ++) {
321 (*w.we)[i].id = i;
323 // build Lattice structure
324 w.construct();
328 Post-process after initialize core member of Lattice
331 void Lattice::construct()
333 int i_we,n_we = we->size();
335 for (i_we = 0;i_we < n_we;i_we ++) {
336 //cerr << "=" << i_we << endl;
337 add((*we)[i_we]);
341 Dump a Lattice
343 ostream& operator << (ostream &os, const Lattice &w)
345 int i,n;
347 if (!!w.we) {
348 n = w.we->size();
349 for (i = 0;i < n;i ++)
350 os << (*w.we)[i] << endl;
354 n = w.wi.size();
355 for (i = 0;i < n;i ++) {
356 int ii,nn = w.wi[i].we.size();
357 if (nn) cerr << ">" << i << " ";
358 for (ii = 0;ii < nn;ii ++)
359 cerr << w.wi[i].we[ii] << " ";
360 if (nn)
361 cerr << endl;
362 nn = w.wi[i].fuzzy_map.size();
363 if (nn) cerr << "<" << i << " ";
364 for (ii = 0;ii < nn;ii ++)
365 cerr << w.wi[i].fuzzy_map[ii] << " ";
366 if (nn)
367 cerr << endl;
371 int i, nn = w.get_word_count();
372 for (i = 0;i < nn;i ++) {
373 int nnn = w.get_len(i);
374 for (int ii = 0;ii < nnn;ii ++) {
375 int nnnn = w.get_fuzzy_count(i,ii);
376 if (w.get_we_exact(i,ii))
377 os << *w.get_we_exact(i,ii) << endl;
378 for (int iii = 0;iii < nnnn;iii ++)
379 os << w.get_we_fuzzy(i,ii,iii) << endl;
383 return os;
386 istream& operator >> (istream &is, Lattice &w)
388 int i,n;
389 w.we = boost::shared_ptr<WordEntries>(new WordEntries);
391 string s;
392 while (getline(is,s)) {
393 if (s.empty())
394 break;
395 istringstream iss(s);
396 w.we->push_back(WordEntry());
397 iss >> w.we->back();
398 if (w.we->back().node.node == NULL) {
399 cerr << "Invalid WordEntry!" << endl;
400 w.we->pop_back();
403 n = w.we->size();
404 for (i = 0;i < n;i ++) {
405 (*w.we)[i].id = i;
407 w.construct();
408 boost::shared_ptr<Sentence> st(new Sentence(w));
409 w.st = st;
410 return is;
413 Lattice::~Lattice()
416 int pos,nr_pos = get_word_count();
417 for (pos = 0;pos < nr_pos;pos ++) {
418 int len,nr_len = get_len(pos);
419 for (len = 0;len < nr_len;len ++)
420 delete (*(*this)[pos])[len];
421 delete (*this)[pos];
426 std::ostream& operator << (std::ostream &os,const WordEntry &we)
428 os << boost::format("%d %d %04x ") % we.pos % we.len % we.fuzid << we.node;
429 return os;
432 std::istream& operator >> (std::istream &is,WordEntry &we)
434 is >> we.pos >> we.len >> hex >> we.fuzid >> dec >> we.node;
435 return is;
439 Construct a Lattice based on another Lattice.
440 Keep only exact matches.
443 void Lattice::based_on(const Lattice &w)
445 vector<WordInfos> &me = wi;
446 we = w.we;
447 st = w.st;
449 me.resize(w.get_word_count());
450 int i,n = we->size();
451 for (i = 0;i < n;i ++)
452 if ((*we)[i].fuzid == 0)
453 add((*we)[i]);
456 void Lattice::add(WordEntry &w)
458 vector<WordInfos> &me = wi;
459 if (me.size() <= w.pos)
460 me.resize(w.pos+1);
462 if (w.fuzid) {
463 for (unsigned int j = 0;j < w.len;j ++)
464 if (w.fuzid & (1 << j)) {
465 if (me.size() <= j+w.pos)
466 me.resize(j+w.pos+1);
467 me[j+w.pos].fuzzy_map.push_back(&w);
471 WordInfos &wis = me[w.pos];
472 wis.we.push_back(&w);
475 WordInfos::WordInfos()
479 unsigned int Lattice::get_len(unsigned int p) const
481 const WordEntryRefs &we = get_we(p);
482 return we.size();
485 void apply_separator(std::set<WordEntry> &wes,int i)
487 set<WordEntry>::iterator iter;
488 //cerr << "Separator applying after " << i << endl;
489 for (iter = wes.begin();iter != wes.end(); ++iter) {
490 //cerr << *iter << endl;
491 if (iter->pos <= i) {
492 if (iter->pos+iter->len-1 >= i+1) { // a word that across the separator
493 //cerr << "Remove " << *iter << endl;
494 wes.erase(iter);
496 } else
497 break; // because it's sorted, we don't need to go farther