last changes
[vspell.git] / libvspell / wfst.cpp
blob75c0a4121ed9451fcc7c55b8aafaa0a5097c4cce
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include "wfst.h"
3 #include <iterator>
4 #include <algorithm>
5 #include <sstream>
6 #include <iostream>
7 #include <fstream>
8 #include <boost/format.hpp>
9 #include <set>
10 #include "posgen.h"
11 #include <signal.h>
12 #include <unistd.h>
14 using namespace std;
// Segmentation2: a list of Segmentations collections; not referenced anywhere
// in the visible part of this file — TODO confirm it is used elsewhere.
18 typedef vector<Segmentations> Segmentation2;
// NOTE(review): this chunk is a gitweb/HTML scrape of wfst.cpp — every line
// carries its original source line number as a prefix, and a number of
// original lines (opening/closing braces among them) were dropped by the
// scrape. The code is therefore an incomplete rendering; it is kept
// byte-identical below and only comments are added.
20 /**
21 Generate new Lattice info based on misspelled position info.
22 Call WFST::get_sections() to split into sections
23 then call WFST::segment_all1() to do real segmentation.
24 \param pos contains possibly misspelled position.
25 \param len the actual length of pos. Don't use pos.size()
// \param final_seg receives the best segmentation found (cleared before use).
28 void WFST::generate_misspelled_words(const vector<uint> &pos,int len,Segmentation &final_seg)
// (the function's opening brace — original line 29 — was lost in the scrape)
30 const Lattice &words = *p_words;
31 Lattice w;
33 w.based_on(words);
35 // 2. Compute the score, jump to 1. if score is too low (pruning 1)
37 // create new (more compact) Lattice structure
// For each possibly-misspelled position, add all of its fuzzy-match
// candidates to the new lattice w.
38 int i,n = words.get_word_count();
39 for (i = 0;i < len;i ++) {
40 const WordEntryRefs &fmap = words.get_fuzzy_map(pos[i]);
41 int ii,nn = fmap.size();
42 for (ii = 0;ii < nn;++ii)
43 w.add(*fmap[ii]);
46 //cerr << w << endl;
48 // 4. Create sections
// Split the lattice into independent sections that can each be segmented
// separately (note: constructed from the original `words`, not from `w`).
49 Sections sects;
50 sects.construct(words);
52 // 5. Call create_base_segmentation
53 //Segmentation base_seg(words.we);
54 //create_base_segmentation(words,sects,base_seg);
57 // 6. Get the best segmentation of each section,
58 // then merge to one big segment.
59 n = sects.size();
61 uint ii,nn;
63 i = ii = 0;
64 nn = words.get_word_count();
66 final_seg.clear();
// Walk the lattice left to right: when a section starts at ii, append that
// section's best segmentation and skip over its length; otherwise there is
// only a single word starting at ii (see comment below), so append it.
67 while (ii < nn)
68 if (i < n && sects[i].start == ii) {
69 Segmentation seg;
70 sects[i].segment_best(words,seg);
71 copy(seg.begin(),
72 seg.end(),
73 back_insert_iterator< Segmentation >(final_seg));
74 ii += sects[i].len;
75 i ++;
76 } else {
77 // only word(i,*,0) exists
78 final_seg.push_back(words.get_we(ii)[0]->id);
79 ii += words.get_we(ii)[0]->len;
// (the closing braces of the while-loop and of the function — original
// lines 80-81 — were lost in the scrape)
83 /**
84 Create the best segmentation for a sentence.
85 \param words store Lattice info
86 \param seps return the best segmentation
// Lower prob is better: a candidate replaces seps when seg.prob < seps.prob.
89 void WFST::segment_best(const Lattice &words,Segmentation &seps)
91 //int i,ii,n,nn;
// Remember the lattice so generate_misspelled_words() can reach it via p_words.
93 p_words = &words;
95 // in test mode, generate all positions where misspelled can appear,
96 // then create a new Lattice for them, re get_sections,
97 // create_base_segmentation and call segment_all1 for each sections.
99 // 1. Generate misspelled positions (pruning 0 - GA)
100 // 2. Compute the score, jump to 1. if score is too low (pruning 1)
101 // 3. Make a new Lattice based on the original Lattice
102 // 4. Call get_sections
103 // 5. Call create_base_segmentation
104 // 6. Call segment_all1 for each sections.
105 // 6.1. Recompute the score after each section processed. (pruning 2)
107 // 1. Permutation problem, computing combinations C(k,n) with modification. C(1,n)+C(2,n)+...+C(k,n)
108 Generator gen;
110 gen.init(*words.st);
111 vector<uint> pos;
112 uint len;
// 100 is a sentinel "worse than any real score" value (assumes real prob
// values stay below 100 — TODO confirm) so the first candidate always wins.
113 seps.prob = 100;
// Enumerate every candidate set of misspelled positions and keep the
// lowest-prob segmentation produced for any of them.
114 while (gen.step(pos,len)) {
115 Segmentation seg;
116 //cerr << "POS :";
117 //for (int i = 0;i < len;i ++) cerr << pos[i];
118 //cerr << endl;
119 generate_misspelled_words(pos,len,seg);
120 if (seg.prob < seps.prob)
121 seps = seg;
123 gen.done();
// (the opening of this doc comment — original line 127 "/**" — was lost in
// the scrape)
128 Create the best segmentation for a sentence. The biggest difference between
129 segment_best and segment_best_no_fuzzy is that segment_best_no_fuzzy doesn't
130 use Generator. It assumes there is no misspelled position at all.
131 \param words store Lattice info
132 \param seps return the best segmentation
135 void WFST::segment_best_no_fuzzy(const Lattice &words,Segmentation &seps)
137 p_words = &words;
// Empty position list + len 0: segment without considering any misspelled
// (fuzzy) candidate positions.
139 vector<uint> pos;
140 generate_misspelled_words(pos,0,seps);
143 // WFST (Weighted Finite State Transducer) implementation
144 // TUNE: think about genetic/greedy. Vietnamese is almost one-syllable words..
145 // we find where is likely a start of a word, then try to construct word
146 // and check if others are still valid words.
148 // the last item is always the incompleted item. We will try to complete
149 // a word from the item. If we reach the end of sentence, we will remove it
150 // from segs
152 // obsolete
// Debug/obsolete routine: builds a lattice from the sentence and dumps every
// fuzzy match (edit distance and node probability) to cerr.  `result` is not
// filled anywhere in the visible code.
153 void WFST::segment_all(const Sentence &sent,vector<Segmentation> &result)
155 Lattice words;
156 words.construct(sent);
157 // segment_all1(sent,words,0,sent.get_syllable_count(),result);
// NOTE(review): the declaration of loop variable `i` (original line 158,
// presumably "int i;") was lost in the scrape.
159 int nn = words.size();
160 for (i = 0;i < nn;i ++) {
161 int nnn = words[i].size();
162 cerr << "From " << i << endl;
163 for (int ii = 0;ii < nnn;ii ++) {
164 int nnnn = words[i][ii].fuzzy_match.size();
165 cerr << "Len " << ii << endl;
166 for (int iii = 0;iii < nnnn;iii ++) {
167 cerr << words[i][ii].fuzzy_match[iii].distance << " ";
168 cerr << words[i][ii].fuzzy_match[iii].node->get_prob() << endl;
// Orders segmentations by descending prob; with std::push_heap/pop_heap this
// keeps the lowest-prob (i.e. best, per segment_best's "lower is better"
// comparison) element at the heap top.  Referenced only by the commented-out
// heap code in Segmentor::step in the visible chunk.
177 * Segmentation comparator
180 class SegmentationComparator
182 public:
183 bool operator() (const Segmentation &seg1,const Segmentation &seg2) {
184 return seg1.prob > seg2.prob;
// Prepare the segmentor to enumerate segmentations of the lattice covering
// syllables [from..to]; actual enumeration happens in step().
193 void Segmentor::init(const Lattice &words,
194 int from,
195 int to)
// (the opening brace — original line 196 — was lost in the scrape)
197 nr_syllables = to+1;
199 segs.clear();
200 segs.reserve(100);
// Seed the work stack with a single empty trace starting at `from`.
202 Trace trace(words.we);
203 trace.next_syllable = from;
204 segs.push_back(trace); // empty seg
206 _words = &words;
// Produce the next complete segmentation via depth-first search over the
// work stack `segs` (filled by init()); returns true with `result` set, or
// false once the stack is exhausted.
209 bool Segmentor::step(Segmentation &result)
211 const Lattice &words = *_words;
212 // const Sentence &sent = *_words->st;
213 while (!segs.empty()) {
214 // get one
215 Trace trace = segs.back();
216 segs.pop_back();
// NOTE(review): local `seg` appears unused in the visible code.
218 Segmentation seg = trace.s;
219 int next_syllable = trace.next_syllable;
221 // segmentation completed. throw it away
222 if (next_syllable >= nr_syllables)
223 continue;
225 //WordEntries::iterator i_entry;
// Extend the partial segmentation with every word that can start at
// next_syllable.
226 const WordEntryRefs &wes = words.get_we(next_syllable);
227 int ii,nn = wes.size();
228 for (ii = 0;ii < nn;ii ++) {
229 WordEntryRef i_entry = wes[ii];
231 // New segmentation for longer incomplete word
232 Trace newtrace(words.we);
233 newtrace = trace;
234 newtrace.s.push_back(i_entry->id);
235 newtrace.s.prob += i_entry->node.node->get_prob();
237 /*-it's better to compute ngram outside this function
238 if (ngram_enabled) {
239 if (newtrace.s.size() >= NGRAM_LENGTH) {
240 VocabIndex *vi = new VocabIndex[NGRAM_LENGTH];
241 vi[0] = newtrace.s.items[len-1].node(sent).node->get_id();
242 vi[1] = Vocab_None;
243 newtrace.s.prob += -ngram.wordProb(newtrace.s.items[len-2].node(sent).node->get_id(),vi);
244 delete[] vi;
// If the new trace reaches the end of the range, it is a complete
// segmentation — hand it to the caller; otherwise push it back onto the
// stack for further extension on a later iteration.
249 newtrace.next_syllable += i_entry->len;
250 if (newtrace.next_syllable == nr_syllables) {
251 //cout << count << " " << newtrace.s << endl;
252 result = newtrace.s;
253 return true;
254 //result.push_back(newtrace.s);
255 //push_heap(result.begin(),result.end(),SegmentationComparator());
256 //count ++;
257 } else {
258 segs.push_back(newtrace);
261 } // end while
262 return false;
265 void Segmentor::done()