1 #include "config.h" // -*- tab-width: 2 -*-
9 #include <boost/format.hpp>
12 #include "propername.h"
17 WordState::WordState(const WordState
&ws
):dnode(ws
.dnode
),sent(ws
.sent
),fuzid(ws
.fuzid
),pos(ws
.pos
),len(ws
.len
)
21 void WordState::add_word(set
<WordEntry
> &we
,LeafNNode
*leaf
)
31 //cerr << "Add " << e << endl;
35 void WordState::collect_words(set
<WordEntry
> &we
)
38 LeafNNode
*leaf
= dnode
.node
->get_leaf(sarch
["<mainleaf>"]);
43 void UpperWordState::collect_words(set
<WordEntry
> &we
)
46 LeafNNode
*leaf
= dnode
.node
->get_leaf(sarch
["<caseleaf>"]);
51 void WordState::get_first(WordStates
&states
,uint _pos
)
53 dnode
.node
= warch
.get_root();
61 void ExactWordState::get_next(WordStates
&states
)
65 BranchNNode
*branch
= dnode
.node
->get_branch(sent
[i
].get_cid());
70 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
71 states
.push_back(this);
77 void LowerWordState::get_next(WordStates
&states
)
81 s1
= get_ngram()[sent
[i
].get_cid()];
82 s2
= get_lowercased_syllable(s1
);
84 BranchNNode
*branch
= dnode
.node
->get_branch(get_ngram()[s2
]);
89 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
90 states
.push_back(this);
95 fuzid
|= 1 << (i
-pos
);
98 void FuzzyWordState::get_next(WordStates
&states
)
100 vector
<confusion_set
>& confusion_sets
= get_confusion_sets();
101 int i
,j
,m
,n
= confusion_sets
.size();
103 set
<Syllable
> syllset
,syllset2
;
106 string s1
= get_ngram()[sent
[_i
].get_cid()];
109 _syll
.parse(s1
.c_str());
111 syllset2
.insert(_syll
);
112 while (!syllset2
.empty()) {
113 const Syllable sy
= *syllset2
.begin();
114 syllset2
.erase(syllset2
.begin());
116 if (syllset
.find(sy
) != syllset
.end())
117 continue; // we already matched&applied this syllable
119 //cerr << sy << endl;
123 vector
<Syllable
> sylls
;
125 for (i
= 0;i
< n
;i
++) {
126 m
= confusion_sets
[i
].size();
127 for (j
= 0;j
< m
;j
++)
128 if (confusion_sets
[i
][j
].match(sy
)) {
129 //cerr << "Match " << i << " " << j << endl;
133 for (j
= 0;j
< m
;j
++) {
134 confusion_sets
[i
][j
].apply(sy
,sylls
);
135 //cerr << "Apply " << i << " " << j << endl;
139 copy(sylls
.begin(),sylls
.end(), inserter(syllset2
,syllset2
.begin()));
143 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
144 set
<Syllable
>::iterator iter
;
145 for (iter
= syllset
.begin();iter
!= syllset
.end(); ++ iter
) {
146 //cerr << iter->to_std_str() << endl;
147 string str
= get_lowercased_syllable(iter
->to_std_str());
148 BranchNNode::const_np_range range
= dnode
.node
->get_nodes().equal_range(get_ngram()[str
]);
149 BranchNNode::node_map::const_iterator pnode
;
150 for (pnode
= range
.first
;pnode
!= range
.second
;++pnode
)
151 if (!pnode
->second
->is_leaf()) {
152 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
153 FuzzyWordState
*s
= new FuzzyWordState(*this);
156 s
->dnode
.node
= (BranchNNode
*)pnode
->second
.get();
157 assert(s
->dnode
.node
);
160 s
->fuzid
|= 1 << (_i
-s
->pos
);
162 //cerr << nodes[ii] << endl;
170 Find all possible words. The whole process is divided into two
171 phases. The first one is pre_construct(), which creates base words
172 and store in set<WordEntry>. The second one is post_construct(),
173 which does the rest. Using pre_construct(),post_construct() directly
174 give chances to modify the lattice creation.
176 \param sent specify the input sentence
177 \param w must be cleared before calling this function
179 void Lattice::construct(const boost::shared_ptr
<const Sentence
> &sent
)
182 WordStateFactories factories
;
183 ExactWordStateFactory exact
;
184 LowerWordStateFactory lower
;
185 UpperWordStateFactory upper
;
186 FuzzyWordStateFactory fuzzy
;
187 factories
.push_back(&exact
);
188 factories
.push_back(&lower
);
189 factories
.push_back(&upper
);
190 factories
.push_back(&fuzzy
);
191 pre_construct(sent
,wes
,factories
);
192 mark_proper_name(*sent
,wes
);
197 The first phase of lattice creation. Create all possible words to we.
200 void Lattice::pre_construct(const boost::shared_ptr
<const Sentence
> &st
,set
<WordEntry
> &we
,const WordStateFactories
&f
)
203 int i
,n
,ii
,nn
,k
,nnn
,iii
;
204 int fi
,fn
= f
.size();
206 //cerr << "construct\n";
215 n
= st
->get_syllable_count();
217 for (i
= 0;i
< n
;i
++) {
219 //cerr << *this << endl;
224 for (fi
= 0;fi
< fn
;fi
++)
225 f
[fi
]->create_new(states2
,i
,*st
);
227 // move old states to new states
229 for (ii
= 0;ii
< nn
;ii
++)
230 // state1[ii].get_next() have to delete itself if necessary.
231 states1
[ii
]->get_next(states2
);
233 // get completed words
235 for (ii
= 0;ii
< nn
;ii
++)
236 states2
[ii
]->collect_words(we
);
238 states1
.swap(states2
);
241 //cerr << *this << endl;
244 for (ii
= 0;ii
< nn
;ii
++)
246 //cerr << *this << endl;
251 The second phase of lattice creation.
254 bool we_pos_cmp(const WordEntry
&w1
,const WordEntry
&w2
)
256 return w1
.pos
< w2
.pos
;
259 void Lattice::post_construct(set
<WordEntry
> &we
)
264 n
= st
->get_syllable_count();
266 uint max
= 0,len
= 0;
269 // find out how far we can go from head (pos -> max, len -> len)
271 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr
;
272 set
<WordEntry
>::iterator iter
;
274 while (!traces
.empty()) {
276 WordEntry fake_entry
;
277 fake_entry
.pos
= *traces
.begin();
278 traces
.erase(traces
.begin());
279 pr
= equal_range(we
.begin(),we
.end(),fake_entry
,we_pos_cmp
);
280 for (iter
= pr
.first
;iter
!= pr
.second
; ++iter
) {
281 if (iter
->pos
+iter
->len
< n
)
282 traces
.insert(iter
->pos
+iter
->len
);
283 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr2
;
284 set
<WordEntry
>::iterator iter2
;
285 WordEntry fake_entry2
;
286 fake_entry2
.pos
= iter
->pos
+iter
->len
;
287 pr2
= equal_range(we
.begin(),we
.end(),fake_entry2
,we_pos_cmp
);
288 if (pr2
.first
== pr2
.second
&& len
< iter
->pos
+iter
->len
) {
290 len
= iter
->pos
+iter
->len
;
295 // Yee, we're coming tail!
299 // make sure that one can go from source to destination, without gaps.
304 // if one starts with a cardinal number, then mark it number_id
305 string s
= get_ngram()[(*st
)[max
].get_cid()];
306 //cerr << "Consider " << s;
307 if (strchr("0123456789",s
[1]) != NULL
)
308 e
.node
= get_special_node(NUMBER_ID
);
310 int iiii
,nnnn
= s
.size();
311 for (iiii
= 0;iiii
< nnnn
;iiii
++)
312 if (viet_ispunct((unsigned char)s
[iiii
]))
315 e
.node
= get_special_node(PUNCT_ID
);
317 e
.node
= get_special_node(UNK_ID
);
321 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
325 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
328 w
.we
= boost::shared_ptr
<WordEntries
>(new WordEntries(n
));
330 copy(we
.begin(),we
.end(),w
.we
->begin());
331 for (i
= 0;i
< n
;i
++) {
334 // build Lattice structure
339 Post-process after initialize core member of Lattice
342 void Lattice::construct()
344 int i_we
,n_we
= we
->size();
346 for (i_we
= 0;i_we
< n_we
;i_we
++) {
347 //cerr << "=" << i_we << endl;
354 ostream
& operator << (ostream
&os
, const Lattice
&w
)
360 for (i
= 0;i
< n
;i
++)
361 os
<< (*w
.we
)[i
] << endl
;
366 for (i = 0;i < n;i ++) {
367 int ii,nn = w.wi[i].we.size();
368 if (nn) cerr << ">" << i << " ";
369 for (ii = 0;ii < nn;ii ++)
370 cerr << w.wi[i].we[ii] << " ";
373 nn = w.wi[i].fuzzy_map.size();
374 if (nn) cerr << "<" << i << " ";
375 for (ii = 0;ii < nn;ii ++)
376 cerr << w.wi[i].fuzzy_map[ii] << " ";
382 int i, nn = w.get_word_count();
383 for (i = 0;i < nn;i ++) {
384 int nnn = w.get_len(i);
385 for (int ii = 0;ii < nnn;ii ++) {
386 int nnnn = w.get_fuzzy_count(i,ii);
387 if (w.get_we_exact(i,ii))
388 os << *w.get_we_exact(i,ii) << endl;
389 for (int iii = 0;iii < nnnn;iii ++)
390 os << w.get_we_fuzzy(i,ii,iii) << endl;
397 istream
& operator >> (istream
&is
, Lattice
&w
)
400 w
.we
= boost::shared_ptr
<WordEntries
>(new WordEntries
);
403 while (getline(is
,s
)) {
406 istringstream
iss(s
);
407 w
.we
->push_back(WordEntry());
409 if (w
.we
->back().node
.node
== NULL
) {
410 cerr
<< "Invalid WordEntry: " << s
<< endl
;
415 for (i
= 0;i
< n
;i
++) {
419 boost::shared_ptr
<Sentence
> st(new Sentence(w
));
427 int pos,nr_pos = get_word_count();
428 for (pos = 0;pos < nr_pos;pos ++) {
429 int len,nr_len = get_len(pos);
430 for (len = 0;len < nr_len;len ++)
431 delete (*(*this)[pos])[len];
437 std::ostream
& operator << (std::ostream
&os
,const WordEntry
&we
)
439 os
<< boost::format("%d %d %04x ") % we
.pos
% we
.len
% we
.fuzid
<< we
.node
;
443 std::istream
& operator >> (std::istream
&is
,WordEntry
&we
)
445 is
>> we
.pos
>> we
.len
>> hex
>> we
.fuzid
>> dec
>> we
.node
;
450 Construct a Lattice based on another Lattice.
451 Keep only exact matches.
454 void Lattice::based_on(const Lattice
&w
)
456 vector
<WordInfos
> &me
= wi
;
460 me
.resize(w
.get_word_count());
461 int i
,n
= we
->size();
462 for (i
= 0;i
< n
;i
++)
463 if ((*we
)[i
].fuzid
== 0)
467 void Lattice::add(WordEntry
&w
)
469 vector
<WordInfos
> &me
= wi
;
470 if (me
.size() <= w
.pos
)
474 for (unsigned int j
= 0;j
< w
.len
;j
++)
475 if (w
.fuzid
& (1 << j
)) {
476 if (me
.size() <= j
+w
.pos
)
477 me
.resize(j
+w
.pos
+1);
478 me
[j
+w
.pos
].fuzzy_map
.push_back(&w
);
482 WordInfos
&wis
= me
[w
.pos
];
483 wis
.we
.push_back(&w
);
486 WordInfos::WordInfos()
490 unsigned int Lattice::get_len(unsigned int p
) const
492 const WordEntryRefs
&we
= get_we(p
);
496 void apply_separator(std::set
<WordEntry
> &wes
,int i
)
498 set
<WordEntry
>::iterator iter
;
499 //cerr << "Separator applying after " << i << endl;
500 for (iter
= wes
.begin();iter
!= wes
.end(); ++iter
) {
501 //cerr << *iter << endl;
502 if (iter
->pos
<= i
) {
503 if (iter
->pos
+iter
->len
-1 >= i
+1) { // a word that across the separator
504 //cerr << "Remove " << *iter << endl;
508 break; // because it's sorted, we don't need to go farther