1 #include "config.h" // -*- tab-width: 2 -*-
8 #include <boost/format.hpp>
typedef vector<Segmentations> Segmentation2;
/**
 * Generate new Lattice info based on misspelled position info.
 * Call WFST::get_sections() to split into sections,
 * then call WFST::segment_all1() to do the real segmentation.
 * \param pos contains the possibly misspelled positions.
 * \param len specifies the actual length of pos. Don't use pos.size().
 */
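// For illustration (hypothetical values, not from this file): pos = {1, 3}
// with len = 2 means syllables 1 and 3 are treated as possibly misspelled, so
// their fuzzy matches are pulled from the Lattice when building the new, more
// compact Lattice; the remaining syllables keep their original word entries.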
void WFST::generate_misspelled_words(const vector<uint> &pos, int len, Segmentation &final_seg)
	const Lattice &words = *p_words;
	// 2. Compute the score, jump to 1. if score is too low (pruning 1)

	// create new (more compact) Lattice structure
	int i, n = words.get_word_count();
	for (i = 0; i < len; i++) {
		const WordEntryRefs &fmap = words.get_fuzzy_map(pos[i]);
		int ii, nn = fmap.size();
		for (ii = 0; ii < nn; ++ii)
	sects.construct(words);

	// 5. Call create_base_segmentation
	//Segmentation base_seg(words.we);
	//create_base_segmentation(words,sects,base_seg);
	// 6. Get the best segmentation of each section,
	// then merge them into one big segmentation.
	nn = words.get_word_count();

		if (i < n && sects[i].start == ii) {
			sects[i].segment_best(words, seg);
			     back_insert_iterator<Segmentation>(final_seg));

			// only word(i,*,0) exists
			final_seg.push_back(words.get_we(ii)[0]->id);
			ii += words.get_we(ii)[0]->len;
/**
 * Create the best segmentation for a sentence.
 * \param words stores the Lattice info
 * \param seps returns the best segmentation
 */
void WFST::segment_best(const Lattice &words, Segmentation &seps)
	// in test mode, generate all positions where misspellings can appear,
	// then create a new Lattice for them, re-run get_sections,
	// create_base_segmentation and call segment_all1 for each section.

	// 1. Generate misspelled positions (pruning 0 - GA)
	// 2. Compute the score, jump to 1. if score is too low (pruning 1)
	// 3. Make a new Lattice based on the original Lattice
	// 4. Call get_sections
	// 5. Call create_base_segmentation
	// 6. Call segment_all1 for each section.
	// 6.1. Recompute the score after each section is processed. (pruning 2)
	// 1. Permutation problem: compute combinations C(k,n) with modification: C(1,n)+C(2,n)+...+C(k,n)
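	// For illustration (hypothetical numbers, not from this file): with n = 5
	// syllables and at most k = 2 misspelled positions, the generator enumerates
	// C(1,5) + C(2,5) = 5 + 10 = 15 candidate position sets before any pruning.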
	while (gen.step(pos, len)) {

		//for (int i = 0; i < len; i++) cerr << pos[i];

		generate_misspelled_words(pos, len, seg);
		if (seg.prob < seps.prob)
/**
 * Create the best segmentation for a sentence. The biggest difference between
 * segment_best and segment_best_no_fuzzy is that segment_best_no_fuzzy doesn't
 * use a Generator. It assumes there are no misspelled positions at all.
 * \param words stores the Lattice info
 * \param seps returns the best segmentation
 */
void WFST::segment_best_no_fuzzy(const Lattice &words, Segmentation &seps)
	generate_misspelled_words(pos, 0, seps);
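	// Note: with len = 0 no position is treated as misspelled, so the fuzzy-map
	// loop in generate_misspelled_words never runs and only the exact word
	// entries contribute to seps.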
// WFST (Weighted Finite State Transducer) implementation
// TUNE: think about genetic/greedy. Vietnamese words are almost all one syllable:
// we find where a word is likely to start, then try to construct a word
// and check whether the others are still valid words.
// the last item is always the incomplete item. We will try to complete
// a word from that item. If we reach the end of the sentence, we will remove it
void WFST::segment_all(const Sentence &sent, vector<Segmentation> &result)
	words.construct(sent);
	// segment_all1(sent,words,0,sent.get_syllable_count(),result);
	int nn = words.size();
	for (i = 0; i < nn; i++) {
		int nnn = words[i].size();
		cerr << "From " << i << endl;
		for (int ii = 0; ii < nnn; ii++) {
			int nnnn = words[i][ii].fuzzy_match.size();
			cerr << "Len " << ii << endl;
			for (int iii = 0; iii < nnnn; iii++) {
				cerr << words[i][ii].fuzzy_match[iii].distance << " ";
				cerr << words[i][ii].fuzzy_match[iii].node->get_prob() << endl;
/**
 * Segmentation comparator
 */
class SegmentationComparator
	bool operator() (const Segmentation &seg1, const Segmentation &seg2) {
		return seg1.prob > seg2.prob;
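// Usage sketch (illustrative only; it mirrors the commented-out push_heap call
// in Segmentor::step below and assumes prob behaves like a cost, lower = better,
// as segment_best's `<` comparison suggests):
//
//   vector<Segmentation> heap;
//   heap.push_back(cand);   // cand: some candidate Segmentation
//   push_heap(heap.begin(), heap.end(), SegmentationComparator());
//   // with this comparator the lowest-prob segmentation sits at heap.front()
//   pop_heap(heap.begin(), heap.end(), SegmentationComparator());
//   Segmentation best = heap.back();   // best candidate so far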
void Segmentor::init(const Lattice &words,

	Trace trace(words.we);
	trace.next_syllable = from;
	segs.push_back(trace); // empty seg
bool Segmentor::step(Segmentation &result)
	const Lattice &words = *_words;
	// const Sentence &sent = *_words->st;
	while (!segs.empty()) {
		Trace trace = segs.back();
		Segmentation seg = trace.s;
		int next_syllable = trace.next_syllable;
		// segmentation completed. throw it away
		if (next_syllable >= nr_syllables)

		//WordEntries::iterator i_entry;
		const WordEntryRefs &wes = words.get_we(next_syllable);
		int ii, nn = wes.size();
		for (ii = 0; ii < nn; ii++) {
			WordEntryRef i_entry = wes[ii];
			// New segmentation for longer incomplete word
			Trace newtrace(words.we);
			newtrace.s.push_back(i_entry->id);
			newtrace.s.prob += i_entry->node.node->get_prob();
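			// accumulate this word entry's node score; judging from segment_best's
			// `seg.prob < seps.prob` comparison and the negated wordProb term in the
			// comment below, prob appears to act as a cost (lower is better)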
			/*-it's better to compute ngram outside this function
			if (newtrace.s.size() >= NGRAM_LENGTH) {
				VocabIndex *vi = new VocabIndex[NGRAM_LENGTH];
				vi[0] = newtrace.s.items[len-1].node(sent).node->get_id();
				newtrace.s.prob += -ngram.wordProb(newtrace.s.items[len-2].node(sent).node->get_id(),vi);
			newtrace.next_syllable += i_entry->len;
			if (newtrace.next_syllable == nr_syllables) {
				//cout << count << " " << newtrace.s << endl;

				//result.push_back(newtrace.s);
				//push_heap(result.begin(),result.end(),SegmentationComparator());

			segs.push_back(newtrace);
void Segmentor::done()
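// Usage sketch (illustrative only, not from this file; Segmentor::init's full
// parameter list is truncated in this excerpt, so the range arguments below
// are assumed):
//
//   Segmentor segtor;
//   segtor.init(words, 0, words.get_word_count());  // assumed: start/end syllable range
//   Segmentation s(words.we);
//   while (segtor.step(s)) {
//     // each successful step yields one segmentation that reaches the last
//     // syllable; collect it or keep the one with the lowest prob (cost)
//   }
//   segtor.done();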