fixed bugs in Text::penalty2_construct(), Penalty2DAG::set_syllable_weights()
[vspell.git] / libvspell / words.cpp
blobaed46d39851dd99b7cae439567f210e0ac5ccc23
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "spell.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
10 #include "syllable.h"
11 #include "propername.h"
12 #ifndef _SArray_cc_
13 #include <libsrilm/SArray.cc>
14 #endif
17 using namespace std;
19 WordState::WordState(const WordState &ws):dnode(ws.dnode),sent(ws.sent),fuzid(ws.fuzid),pos(ws.pos),len(ws.len)
23 void WordState::add_word(set<WordEntry> &we,LeafNode*leaf)
25 WordEntry e;
26 e.pos = pos;
27 e.len = len;
28 e.fuzid = fuzid;
29 e.node = leaf;
30 //cerr << "Add " << e << endl;
31 we.insert(e);
34 void WordState::collect_words(set<WordEntry> &we)
36 LeafNode *leaf = dnode.node->get_leaf(sarch["<mainleaf>"]);
37 if (leaf)
38 add_word(we,leaf);
41 void UpperWordState::collect_words(set<WordEntry> &we)
43 LeafNode *leaf = dnode.node->get_leaf(sarch["<caseleaf>"]);
44 if (leaf)
45 add_word(we,leaf);
48 void WordState::get_first(WordStates &states,uint _pos)
50 dnode.node = warch.get_root();
51 pos = _pos;
52 fuzid = 0;
53 len = 0;
54 get_next(states);
57 void ExactWordState::get_next(WordStates &states)
59 uint i = pos+len;
60 BranchNode *branch = dnode.node->get_branch(sent[i].get_cid());
61 if (branch == NULL) {
62 delete this;
63 return;
65 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
66 states.push_back(this);
67 // change the info
68 dnode = branch;
69 len ++;
72 void LowerWordState::get_next(WordStates &states)
74 string s1,s2;
75 uint i = pos+len;
76 s1 = get_sarch()[sent[i].get_cid()];
77 s2 = get_lowercased_syllable(s1);
78 BranchNode *branch = dnode.node->get_branch(get_sarch()[s2]);
79 if (branch == NULL) {
80 delete this;
81 return;
83 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
84 states.push_back(this);
85 // change the info
86 dnode = branch;
87 len ++;
88 if (s1 != s2)
89 fuzid |= 1 << (i-pos);
92 void FuzzyWordState::get_next(WordStates &states)
94 vector<confusion_set>& confusion_sets = get_confusion_sets();
95 int i,j,m,n = confusion_sets.size();
96 bool ret = false;
97 set<Syllable> syllset,syllset2;
98 Syllable _syll;
99 uint _i = pos+len;
100 string s1 = get_sarch()[sent[_i].get_cid()];
102 _syll.parse(s1.c_str());
104 syllset2.insert(_syll);
105 while (!syllset2.empty()) {
106 const Syllable sy = *syllset2.begin();
107 syllset2.erase(syllset2.begin());
109 if (syllset.find(sy) != syllset.end())
110 continue; // we already matched&applied this syllable
112 //cerr << sy << endl;
113 syllset.insert(sy);
116 vector<Syllable> sylls;
117 // match & apply
118 for (i = 0;i < n;i ++) {
119 m = confusion_sets[i].size();
120 for (j = 0;j < m;j ++)
121 if (confusion_sets[i][j].match(sy)) {
122 //cerr << "Match " << i << " " << j << endl;
123 break;
125 if (j < m) {
126 for (j = 0;j < m;j ++) {
127 confusion_sets[i][j].apply(sy,sylls);
128 //cerr << "Apply " << i << " " << j << endl;
132 copy(sylls.begin(),sylls.end(), inserter(syllset2,syllset2.begin()));
135 // move to _nodes
136 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
137 set<Syllable>::iterator iter;
138 for (iter = syllset.begin();iter != syllset.end(); ++ iter) {
139 //cerr << iter->to_std_str() << endl;
140 string str = get_lowercased_syllable(iter->to_std_str());
141 BranchNode::const_np_range range = dnode.node->get_nodes().equal_range(get_sarch()[str]);
142 BranchNode::node_map::const_iterator pnode;
143 for (pnode = range.first;pnode != range.second;++pnode)
144 if (!pnode->second->is_leaf()) {
145 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
146 FuzzyWordState *s = new FuzzyWordState(*this);
148 // change the info
149 s->dnode.node = (BranchNode*)pnode->second.get();
150 s->len++;
151 if (s1 != str)
152 s->fuzid |= 1 << (_i-s->pos);
153 states.push_back(s);
154 //cerr << nodes[ii] << endl;
158 delete this;
162 Find all possible words. The whole process is divided into two
163 phases. The first one is pre_construct(), which creates base words
164 and store in set<WordEntry>. The second one is post_construct(),
165 which does the rest. Using pre_construct(),post_construct() directly
166 give chances to modify the lattice creation.
168 \param sent specify the input sentence
169 \param w must be cleared before calling this function
171 void Lattice::construct(const Sentence &sent)
173 set<WordEntry> wes;
174 WordStateFactories factories;
175 ExactWordStateFactory exact;
176 LowerWordStateFactory lower;
177 UpperWordStateFactory upper;
178 FuzzyWordStateFactory fuzzy;
179 factories.push_back(&exact);
180 factories.push_back(&lower);
181 factories.push_back(&upper);
182 factories.push_back(&fuzzy);
183 pre_construct(sent,wes,factories);
184 mark_proper_name(sent,wes);
185 post_construct(wes);
189 The first phase of lattice creation. Create all possible words to we.
192 void Lattice::pre_construct(const Sentence &sent,set<WordEntry> &we,const WordStateFactories &f)
194 Lattice &w = *this;
195 int i,n,ii,nn,k,nnn,iii;
196 int fi,fn = f.size();
198 //cerr << "construct\n";
200 w.st = &sent;
202 WordStates states1;
203 WordStates states2;
204 states1.reserve(10);
205 states2.reserve(10);
207 n = sent.get_syllable_count();
209 for (i = 0;i < n;i ++) {
211 //cerr << *this << endl;
213 states2.clear();
215 // new states
216 for (fi = 0;fi < fn;fi ++)
217 f[fi]->create_new(states2,i,sent);
219 // move old states to new states
220 nn = states1.size();
221 for (ii = 0;ii < nn;ii ++)
222 // state1[ii].get_next() have to delete itself if necessary.
223 states1[ii]->get_next(states2);
225 // get completed words
226 nn = states2.size();
227 for (ii = 0;ii < nn;ii ++)
228 states2[ii]->collect_words(we);
230 states1.swap(states2);
233 //cerr << *this << endl;
235 nn = states1.size();
236 for (ii = 0;ii < nn;ii ++)
237 delete states1[ii];
238 //cerr << *this << endl;
243 The second phase of lattice creation.
246 bool we_pos_cmp(const WordEntry &w1,const WordEntry &w2)
248 return w1.pos < w2.pos;
251 void Lattice::post_construct(set<WordEntry> &we)
253 Lattice &w = *this;
254 unsigned i,n,k;
256 n = st->get_syllable_count();
258 uint max = 0,len = 0;
259 while (max < n) {
261 // find out how far we can go from head (pos -> max, len -> len)
262 set<int> traces;
263 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr;
264 set<WordEntry>::iterator iter;
265 traces.insert(max);
266 while (!traces.empty()) {
267 uint ii, nn = 0;
268 WordEntry fake_entry;
269 fake_entry.pos = *traces.begin();
270 traces.erase(traces.begin());
271 pr = equal_range(we.begin(),we.end(),fake_entry,we_pos_cmp);
272 for (iter = pr.first;iter != pr.second; ++iter) {
273 if (iter->pos+iter->len < n)
274 traces.insert(iter->pos+iter->len);
275 pair<set<WordEntry>::iterator,set<WordEntry>::iterator> pr2;
276 set<WordEntry>::iterator iter2;
277 WordEntry fake_entry2;
278 fake_entry2.pos = iter->pos+iter->len;
279 pr2 = equal_range(we.begin(),we.end(),fake_entry2,we_pos_cmp);
280 if (pr2.first == pr2.second && len < iter->pos+iter->len) {
281 max = iter->pos;
282 len = iter->pos+iter->len;
287 // Yee, we're coming tail!
288 if (len >= n)
289 break;
291 // make sure that one can go from source to destination, without gaps.
292 WordEntry e;
293 e.pos = len;
294 e.len = 1;
295 e.fuzid = 0;
296 // if one starts with a cardinal number, then mark it number_id
297 string s = get_sarch()[(*st)[max].get_cid()];
298 //cerr << "Consider " << s;
299 if (strchr("0123456789",s[1]) != NULL)
300 e.node = get_special_node(NUMBER_ID);
301 else {
302 int iiii,nnnn = s.size();
303 for (iiii = 0;iiii < nnnn;iiii ++)
304 if (viet_ispunct(s[i]))
305 break;
306 if (iiii < nnnn)
307 e.node = get_special_node(PUNCT_ID);
308 else
309 e.node = get_special_node(UNK_ID);
311 max = len+1;
312 len = max;
313 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
314 we.insert(e);
317 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
318 // copy to real _we
319 n = we.size();
320 w.we = boost::shared_ptr<WordEntries>(new WordEntries(n));
321 //w.we->resize(n);
322 copy(we.begin(),we.end(),w.we->begin());
323 for (i = 0;i < n;i ++) {
324 (*w.we)[i].id = i;
326 // build Lattice structure
327 w.construct();
331 Post-process after initialize core member of Lattice
334 void Lattice::construct()
336 int i_we,n_we = we->size();
338 for (i_we = 0;i_we < n_we;i_we ++) {
339 //cerr << "=" << i_we << endl;
340 add((*we)[i_we]);
344 Dump a Lattice
346 ostream& operator << (ostream &os, const Lattice &w)
348 int i,n;
350 if (!!w.we) {
351 n = w.we->size();
352 for (i = 0;i < n;i ++)
353 os << (*w.we)[i] << endl;
356 n = w.wi.size();
357 for (i = 0;i < n;i ++) {
358 int ii,nn = w.wi[i].we.size();
359 if (nn) cerr << ">" << i << " ";
360 for (ii = 0;ii < nn;ii ++)
361 cerr << w.wi[i].we[ii] << " ";
362 if (nn)
363 cerr << endl;
364 nn = w.wi[i].fuzzy_map.size();
365 if (nn) cerr << "<" << i << " ";
366 for (ii = 0;ii < nn;ii ++)
367 cerr << w.wi[i].fuzzy_map[ii] << " ";
368 if (nn)
369 cerr << endl;
373 int i, nn = w.get_word_count();
374 for (i = 0;i < nn;i ++) {
375 int nnn = w.get_len(i);
376 for (int ii = 0;ii < nnn;ii ++) {
377 int nnnn = w.get_fuzzy_count(i,ii);
378 if (w.get_we_exact(i,ii))
379 os << *w.get_we_exact(i,ii) << endl;
380 for (int iii = 0;iii < nnnn;iii ++)
381 os << w.get_we_fuzzy(i,ii,iii) << endl;
385 return os;
388 Lattice::~Lattice()
391 int pos,nr_pos = get_word_count();
392 for (pos = 0;pos < nr_pos;pos ++) {
393 int len,nr_len = get_len(pos);
394 for (len = 0;len < nr_len;len ++)
395 delete (*(*this)[pos])[len];
396 delete (*this)[pos];
401 std::ostream& operator << (std::ostream &os,const WordEntry &we)
403 using namespace boost;
404 os << format("%d %d %x %d") % we.pos % we.len % we.fuzid % we.id << we.node;
405 return os;
410 Construct a Lattice based on another Lattice.
411 Keep only exact matches.
414 void Lattice::based_on(const Lattice &w)
416 vector<WordInfos> &me = wi;
417 we = w.we;
418 st = w.st;
420 me.resize(w.get_word_count());
421 int i,n = we->size();
422 for (i = 0;i < n;i ++)
423 if ((*we)[i].fuzid == 0)
424 add((*we)[i]);
427 void Lattice::add(WordEntry &w)
429 vector<WordInfos> &me = wi;
430 if (me.size() <= w.pos)
431 me.resize(w.pos+1);
433 if (w.fuzid) {
434 for (unsigned int j = 0;j < w.len;j ++)
435 if (w.fuzid & (1 << j)) {
436 if (me.size() <= j+w.pos)
437 me.resize(j+w.pos+1);
438 me[j+w.pos].fuzzy_map.push_back(&w);
442 WordInfos &wis = me[w.pos];
443 wis.we.push_back(&w);
446 WordInfos::WordInfos()
450 unsigned int Lattice::get_len(unsigned int p) const
452 const WordEntryRefs &we = get_we(p);
453 return we.size();
456 void apply_separator(std::set<WordEntry> &wes,int i)
458 set<WordEntry>::iterator iter;
459 //cerr << "Separator applying after " << i << endl;
460 for (iter = wes.begin();iter != wes.end(); ++iter) {
461 //cerr << *iter << endl;
462 if (iter->pos <= i) {
463 if (iter->pos+iter->len-1 >= i+1) { // a word that across the separator
464 //cerr << "Remove " << *iter << endl;
465 wes.erase(iter);
467 } else
468 break; // because it's sorted, we don't need to go farther