softcount: tolerate zero ngrams
[vspell.git] / libvspell / words.cpp
blobc3a6c69a2ce1aa39b7778d6c68417f2298f9e2cb
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "spell.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <cassert>
9 #include <boost/format.hpp>
10 #include <set>
11 #include "syllable.h"
12 #include "propername.h"
15 using namespace std;
17 WordState::WordState(const WordState &ws):dnode(ws.dnode),sent(ws.sent),fuzid(ws.fuzid),pos(ws.pos),len(ws.len)
21 void WordState::add_word(set<WordEntry> &we,LeafNNode*leaf)
23 WordEntry e;
25 assert(leaf);
27 e.pos = pos;
28 e.len = len;
29 e.fuzid = fuzid;
30 e.node = leaf;
31 //cerr << "Add " << e << endl;
32 we.insert(e);
35 void WordState::collect_words(set<WordEntry> &we)
37 assert(dnode.node);
38 LeafNNode *leaf = dnode.node->get_leaf(sarch["<mainleaf>"]);
39 if (leaf)
40 add_word(we,leaf);
43 void UpperWordState::collect_words(set<WordEntry> &we)
45 assert(dnode.node);
46 LeafNNode *leaf = dnode.node->get_leaf(sarch["<caseleaf>"]);
47 if (leaf)
48 add_word(we,leaf);
51 void WordState::get_first(WordStates &states,uint _pos)
53 dnode.node = warch.get_root();
54 assert(dnode.node);
55 pos = _pos;
56 fuzid = 0;
57 len = 0;
58 get_next(states);
61 void ExactWordState::get_next(WordStates &states)
63 uint i = pos+len;
64 assert(dnode.node);
65 BranchNNode *branch = dnode.node->get_branch(sent[i].get_cid());
66 if (branch == NULL) {
67 delete this;
68 return;
70 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
71 states.push_back(this);
72 // change the info
73 dnode = branch;
74 len ++;
77 void LowerWordState::get_next(WordStates &states)
79 string s1,s2;
80 uint i = pos+len;
81 s1 = get_ngram()[sent[i].get_cid()];
82 s2 = get_lowercased_syllable(s1);
83 assert(dnode.node);
84 BranchNNode *branch = dnode.node->get_branch(get_ngram()[s2]);
85 if (branch == NULL) {
86 delete this;
87 return;
89 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
90 states.push_back(this);
91 // change the info
92 dnode = branch;
93 len ++;
94 if (s1 != s2)
95 fuzid |= 1 << (i-pos);
98 void FuzzyWordState::get_next(WordStates &states)
100 vector<confusion_set>& confusion_sets = get_confusion_sets();
101 int i,j,m,n = confusion_sets.size();
102 bool ret = false;
103 set<Syllable> syllset,syllset2;
104 Syllable _syll;
105 uint _i = pos+len;
106 string s1 = get_ngram()[sent[_i].get_cid()];
108 assert(dnode.node);
109 _syll.parse(s1.c_str());
111 syllset2.insert(_syll);
112 while (!syllset2.empty()) {
113 const Syllable sy = *syllset2.begin();
114 syllset2.erase(syllset2.begin());
116 if (syllset.find(sy) != syllset.end())
117 continue; // we already matched&applied this syllable
119 //cerr << sy << endl;
120 syllset.insert(sy);
123 vector<Syllable> sylls;
124 // match & apply
125 for (i = 0;i < n;i ++) {
126 m = confusion_sets[i].size();
127 for (j = 0;j < m;j ++)
128 if (confusion_sets[i][j].match(sy)) {
129 //cerr << "Match " << i << " " << j << endl;
130 break;
132 if (j < m) {
133 for (j = 0;j < m;j ++) {
134 confusion_sets[i][j].apply(sy,sylls);
135 //cerr << "Apply " << i << " " << j << endl;
139 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
142 // move to _nodes
143 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
144 set<Syllable>::iterator iter;
145 for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
146 //cerr << iter->to_std_str() << endl;
147 string str = get_lowercased_syllable(iter->to_std_str());
148 BranchNNode::const_np_range range = dnode.node->get_nodes().equal_range(get_ngram()[str]);
149 BranchNNode::node_map::const_iterator pnode;
150 for (pnode = range.first;pnode != range.second;++pnode)
151 if (!pnode->second->is_leaf()) {
152 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
153 FuzzyWordState *s = new FuzzyWordState(*this);
155 // change the info
156 s->dnode.node = (BranchNNode*)pnode->second.get();
157 assert(s->dnode.node);
158 s->len++;
159 if (s1 != str)
160 s->fuzid |= 1 << (_i-s->pos);
161 states.push_back(s);
162 //cerr << nodes[ii] << endl;
166 delete this;
170 Find all possible words. The whole process is divided into two
171 phases. The first one is pre_construct(), which creates base words
172 and store in set<WordEntry>. The second one is post_construct(),
173 which does the rest. Using pre_construct(),post_construct() directly
174 give chances to modify the lattice creation.
176 \param sent specify the input sentence
177 \param w must be cleared before calling this function
179 void Lattice::construct(const boost::shared_ptr<const Sentence> &sent)
181 set<WordEntry> wes;
182 WordStateFactories factories;
183 ExactWordStateFactory exact;
184 LowerWordStateFactory lower;
185 UpperWordStateFactory upper;
186 FuzzyWordStateFactory fuzzy;
187 factories.push_back(&exact);
188 factories.push_back(&lower);
189 factories.push_back(&upper);
190 factories.push_back(&fuzzy);
191 pre_construct(sent,wes,factories);
192 mark_proper_name(*sent,wes);
193 post_construct(wes);
197 The first phase of lattice creation. Create all possible words to we.
200 void Lattice::pre_construct(const boost::shared_ptr<const Sentence> &st,set<WordEntry> &we,const WordStateFactories &f)
202 Lattice &w = *this;
203 int i,n,ii,nn,k,nnn,iii;
204 int fi,fn = f.size();
206 //cerr << "construct\n";
208 w.st = st;
210 WordStates states1;
211 WordStates states2;
212 states1.reserve(10);
213 states2.reserve(10);
215 n = st->get_syllable_count();
217 for (i = 0;i < n;i ++) {
219 //cerr << *this << endl;
221 states2.clear();
223 // new states
224 for (fi = 0;fi < fn;fi ++)
225 f[fi]->create_new(states2,i,*st);
227 // move old states to new states
228 nn = states1.size();
229 for (ii = 0;ii < nn;ii ++)
230 // state1[ii].get_next() have to delete itself if necessary.
231 states1[ii]->get_next(states2);
233 // get completed words
234 nn = states2.size();
235 for (ii = 0;ii < nn;ii ++)
236 states2[ii]->collect_words(we);
238 states1.swap(states2);
241 //cerr << *this << endl;
243 nn = states1.size();
244 for (ii = 0;ii < nn;ii ++)
245 delete states1[ii];
246 //cerr << *this << endl;
251 The second phase of lattice creation.
254 bool we_pos_cmp(const WordEntry &w1,const WordEntry &w2)
256 return w1.pos < w2.pos;
259 void Lattice::post_construct(set<WordEntry> &we)
261 Lattice &w = *this;
262 unsigned i,n,k;
264 n = st->get_syllable_count();
266 uint max = 0,len = 0;
267 while (max < n) {
269 // find out how far we can go from head (pos -> max, len -> len)
270 set<int> traces;
271 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr;
272 set<WordEntry>::iterator iter;
273 traces.insert(max);
274 while (!traces.empty()) {
275 uint ii, nn = 0;
276 WordEntry fake_entry;
277 fake_entry.pos = *traces.begin();
278 traces.erase(traces.begin());
279 pr = equal_range(we.begin(),we.end(),fake_entry,we_pos_cmp);
280 for (iter = pr.first;iter != pr.second; ++iter) {
281 if (iter->pos+iter->len < n)
282 traces.insert(iter->pos+iter->len);
283 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr2;
284 set<WordEntry>::iterator iter2;
285 WordEntry fake_entry2;
286 fake_entry2.pos = iter->pos+iter->len;
287 pr2 = equal_range(we.begin(),we.end(),fake_entry2,we_pos_cmp);
288 if (pr2.first == pr2.second && len < iter->pos+iter->len) {
289 max = iter->pos;
290 len = iter->pos+iter->len;
295 // Yee, we're coming tail!
296 if (len >= n)
297 break;
299 // make sure that one can go from source to destination, without gaps.
300 WordEntry e;
301 e.pos = len;
302 e.len = 1;
303 e.fuzid = 0;
304 // if one starts with a cardinal number, then mark it number_id
305 string s = get_ngram()[(*st)[max].get_cid()];
306 //cerr << "Consider " << s;
307 if (strchr("0123456789",s[1]) != NULL)
308 e.node = get_special_node(NUMBER_ID);
309 else {
310 int iiii,nnnn = s.size();
311 for (iiii = 0;iiii < nnnn;iiii ++)
312 if (viet_ispunct((unsigned char)s[iiii]))
313 break;
314 if (iiii < nnnn)
315 e.node = get_special_node(PUNCT_ID);
316 else
317 e.node = get_special_node(UNK_ID);
319 max = len+1;
320 len = max;
321 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
322 we.insert(e);
325 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
326 // copy to real _we
327 n = we.size();
328 w.we = boost::shared_ptr<WordEntries>(new WordEntries(n));
329 //w.we->resize(n);
330 copy(we.begin(),we.end(),w.we->begin());
331 for (i = 0;i < n;i ++) {
332 (*w.we)[i].id = i;
334 // build Lattice structure
335 w.construct();
339 Post-process after initialize core member of Lattice
342 void Lattice::construct()
344 int i_we,n_we = we->size();
346 for (i_we = 0;i_we < n_we;i_we ++) {
347 //cerr << "=" << i_we << endl;
348 add((*we)[i_we]);
352 Dump a Lattice
354 ostream& operator << (ostream &os, const Lattice &w)
356 int i,n;
358 if (!!w.we) {
359 n = w.we->size();
360 for (i = 0;i < n;i ++)
361 os << (*w.we)[i] << endl;
365 n = w.wi.size();
366 for (i = 0;i < n;i ++) {
367 int ii,nn = w.wi[i].we.size();
368 if (nn) cerr << ">" << i << " ";
369 for (ii = 0;ii < nn;ii ++)
370 cerr << w.wi[i].we[ii] << " ";
371 if (nn)
372 cerr << endl;
373 nn = w.wi[i].fuzzy_map.size();
374 if (nn) cerr << "<" << i << " ";
375 for (ii = 0;ii < nn;ii ++)
376 cerr << w.wi[i].fuzzy_map[ii] << " ";
377 if (nn)
378 cerr << endl;
382 int i, nn = w.get_word_count();
383 for (i = 0;i < nn;i ++) {
384 int nnn = w.get_len(i);
385 for (int ii = 0;ii < nnn;ii ++) {
386 int nnnn = w.get_fuzzy_count(i,ii);
387 if (w.get_we_exact(i,ii))
388 os << *w.get_we_exact(i,ii) << endl;
389 for (int iii = 0;iii < nnnn;iii ++)
390 os << w.get_we_fuzzy(i,ii,iii) << endl;
394 return os;
397 istream& operator >> (istream &is, Lattice &w)
399 int i,n;
400 w.we = boost::shared_ptr<WordEntries>(new WordEntries);
402 string s;
403 while (getline(is,s)) {
404 if (s.empty())
405 break;
406 istringstream iss(s);
407 w.we->push_back(WordEntry());
408 iss >> w.we->back();
409 if (w.we->back().node.node == NULL) {
410 cerr << "Invalid WordEntry: " << s << endl;
411 w.we->pop_back();
414 n = w.we->size();
415 for (i = 0;i < n;i ++) {
416 (*w.we)[i].id = i;
418 w.construct();
419 boost::shared_ptr<Sentence> st(new Sentence(w));
420 w.st = st;
421 return is;
424 Lattice::~Lattice()
427 int pos,nr_pos = get_word_count();
428 for (pos = 0;pos < nr_pos;pos ++) {
429 int len,nr_len = get_len(pos);
430 for (len = 0;len < nr_len;len ++)
431 delete (*(*this)[pos])[len];
432 delete (*this)[pos];
437 std::ostream& operator << (std::ostream &os,const WordEntry &we)
439 os << boost::format("%d %d %04x ") % we.pos % we.len % we.fuzid << we.node;
440 return os;
443 std::istream& operator >> (std::istream &is,WordEntry &we)
445 is >> we.pos >> we.len >> hex >> we.fuzid >> dec >> we.node;
446 return is;
450 Construct a Lattice based on another Lattice.
451 Keep only exact matches.
454 void Lattice::based_on(const Lattice &w)
456 vector<WordInfos> &me = wi;
457 we = w.we;
458 st = w.st;
460 me.resize(w.get_word_count());
461 int i,n = we->size();
462 for (i = 0;i < n;i ++)
463 if ((*we)[i].fuzid == 0)
464 add((*we)[i]);
467 void Lattice::add(WordEntry &w)
469 vector<WordInfos> &me = wi;
470 if (me.size() <= w.pos)
471 me.resize(w.pos+1);
473 if (w.fuzid) {
474 for (unsigned int j = 0;j < w.len;j ++)
475 if (w.fuzid & (1 << j)) {
476 if (me.size() <= j+w.pos)
477 me.resize(j+w.pos+1);
478 me[j+w.pos].fuzzy_map.push_back(&w);
482 WordInfos &wis = me[w.pos];
483 wis.we.push_back(&w);
486 WordInfos::WordInfos()
490 unsigned int Lattice::get_len(unsigned int p) const
492 const WordEntryRefs &we = get_we(p);
493 return we.size();
496 void apply_separator(std::set<WordEntry> &wes,int i)
498 set<WordEntry>::iterator iter;
499 //cerr << "Separator applying after " << i << endl;
500 for (iter = wes.begin();iter != wes.end(); ++iter) {
501 //cerr << *iter << endl;
502 if (iter->pos <= i) {
503 if (iter->pos+iter->len-1 >= i+1) { // a word that across the separator
504 //cerr << "Remove " << *iter << endl;
505 wes.erase(iter);
507 } else
508 break; // because it's sorted, we don't need to go farther