1 #include "config.h" // -*- tab-width: 2 -*-
8 #include <boost/format.hpp>
11 #include "propername.h"
13 #include <libsrilm/SArray.cc>
19 WordState::WordState(const WordState
&ws
):dnode(ws
.dnode
),sent(ws
.sent
),fuzid(ws
.fuzid
),pos(ws
.pos
),len(ws
.len
)
23 void WordState::add_word(set
<WordEntry
> &we
,LeafNode
*leaf
)
30 //cerr << "Add " << e << endl;
34 void WordState::collect_words(set
<WordEntry
> &we
)
36 LeafNode
*leaf
= dnode
.node
->get_leaf(sarch
["<mainleaf>"]);
41 void UpperWordState::collect_words(set
<WordEntry
> &we
)
43 LeafNode
*leaf
= dnode
.node
->get_leaf(sarch
["<caseleaf>"]);
48 void WordState::get_first(WordStates
&states
,uint _pos
)
50 dnode
.node
= warch
.get_root();
57 void ExactWordState::get_next(WordStates
&states
)
60 BranchNode
*branch
= dnode
.node
->get_branch(sent
[i
].get_cid());
65 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
66 states
.push_back(this);
72 void LowerWordState::get_next(WordStates
&states
)
76 s1
= get_sarch()[sent
[i
].get_cid()];
77 s2
= get_lowercased_syllable(s1
);
78 BranchNode
*branch
= dnode
.node
->get_branch(get_sarch()[s2
]);
83 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
84 states
.push_back(this);
89 fuzid
|= 1 << (i
-pos
);
92 void FuzzyWordState::get_next(WordStates
&states
)
94 vector
<confusion_set
>& confusion_sets
= get_confusion_sets();
95 int i
,j
,m
,n
= confusion_sets
.size();
97 set
<Syllable
> syllset
,syllset2
;
100 string s1
= get_sarch()[sent
[_i
].get_cid()];
102 _syll
.parse(s1
.c_str());
104 syllset2
.insert(_syll
);
105 while (!syllset2
.empty()) {
106 const Syllable sy
= *syllset2
.begin();
107 syllset2
.erase(syllset2
.begin());
109 if (syllset
.find(sy
) != syllset
.end())
110 continue; // we already matched&applied this syllable
112 //cerr << sy << endl;
116 vector
<Syllable
> sylls
;
118 for (i
= 0;i
< n
;i
++) {
119 m
= confusion_sets
[i
].size();
120 for (j
= 0;j
< m
;j
++)
121 if (confusion_sets
[i
][j
].match(sy
)) {
122 //cerr << "Match " << i << " " << j << endl;
126 for (j
= 0;j
< m
;j
++) {
127 confusion_sets
[i
][j
].apply(sy
,sylls
);
128 //cerr << "Apply " << i << " " << j << endl;
132 copy(sylls
.begin(),sylls
.end(), inserter(syllset2
,syllset2
.begin()));
136 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
137 set
<Syllable
>::iterator iter
;
138 for (iter
= syllset
.begin();iter
!= syllset
.end(); ++ iter
) {
139 //cerr << iter->to_std_str() << endl;
140 string str
= get_lowercased_syllable(iter
->to_std_str());
141 BranchNode::const_np_range range
= dnode
.node
->get_nodes().equal_range(get_sarch()[str
]);
142 BranchNode::node_map::const_iterator pnode
;
143 for (pnode
= range
.first
;pnode
!= range
.second
;++pnode
)
144 if (!pnode
->second
->is_leaf()) {
145 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
146 FuzzyWordState
*s
= new FuzzyWordState(*this);
149 s
->dnode
.node
= (BranchNode
*)pnode
->second
.get();
152 s
->fuzid
|= 1 << (_i
-s
->pos
);
154 //cerr << nodes[ii] << endl;
162 Find all possible words. The whole process is divided into two
163 phases. The first one is pre_construct(), which creates base words
164 and store in set<WordEntry>. The second one is post_construct(),
165 which does the rest. Using pre_construct(),post_construct() directly
166 give chances to modify the lattice creation.
168 \param sent specify the input sentence
169 \param w must be cleared before calling this function
171 void Lattice::construct(const Sentence
&sent
)
174 WordStateFactories factories
;
175 ExactWordStateFactory exact
;
176 LowerWordStateFactory lower
;
177 UpperWordStateFactory upper
;
178 FuzzyWordStateFactory fuzzy
;
179 factories
.push_back(&exact
);
180 factories
.push_back(&lower
);
181 factories
.push_back(&upper
);
182 factories
.push_back(&fuzzy
);
183 pre_construct(sent
,wes
,factories
);
184 mark_proper_name(sent
,wes
);
189 The first phase of lattice creation. Create all possible words to we.
192 void Lattice::pre_construct(const Sentence
&sent
,set
<WordEntry
> &we
,const WordStateFactories
&f
)
195 int i
,n
,ii
,nn
,k
,nnn
,iii
;
196 int fi
,fn
= f
.size();
198 //cerr << "construct\n";
207 n
= sent
.get_syllable_count();
209 for (i
= 0;i
< n
;i
++) {
211 //cerr << *this << endl;
216 for (fi
= 0;fi
< fn
;fi
++)
217 f
[fi
]->create_new(states2
,i
,sent
);
219 // move old states to new states
221 for (ii
= 0;ii
< nn
;ii
++)
222 // state1[ii].get_next() have to delete itself if necessary.
223 states1
[ii
]->get_next(states2
);
225 // get completed words
227 for (ii
= 0;ii
< nn
;ii
++)
228 states2
[ii
]->collect_words(we
);
230 states1
.swap(states2
);
233 //cerr << *this << endl;
236 for (ii
= 0;ii
< nn
;ii
++)
238 //cerr << *this << endl;
243 The second phase of lattice creation.
246 bool we_pos_cmp(const WordEntry
&w1
,const WordEntry
&w2
)
248 return w1
.pos
< w2
.pos
;
251 void Lattice::post_construct(set
<WordEntry
> &we
)
256 n
= st
->get_syllable_count();
258 uint max
= 0,len
= 0;
261 // find out how far we can go from head (pos -> max, len -> len)
263 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr
;
264 set
<WordEntry
>::iterator iter
;
266 while (!traces
.empty()) {
268 WordEntry fake_entry
;
269 fake_entry
.pos
= *traces
.begin();
270 traces
.erase(traces
.begin());
271 pr
= equal_range(we
.begin(),we
.end(),fake_entry
,we_pos_cmp
);
272 for (iter
= pr
.first
;iter
!= pr
.second
; ++iter
) {
273 if (iter
->pos
+iter
->len
< n
)
274 traces
.insert(iter
->pos
+iter
->len
);
275 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr2
;
276 set
<WordEntry
>::iterator iter2
;
277 WordEntry fake_entry2
;
278 fake_entry2
.pos
= iter
->pos
+iter
->len
;
279 pr2
= equal_range(we
.begin(),we
.end(),fake_entry2
,we_pos_cmp
);
280 if (pr2
.first
== pr2
.second
&& len
< iter
->pos
+iter
->len
) {
282 len
= iter
->pos
+iter
->len
;
287 // Yee, we're coming tail!
291 // make sure that one can go from source to destination, without gaps.
296 // if one starts with a cardinal number, then mark it number_id
297 string s
= get_sarch()[(*st
)[max
].get_cid()];
298 //cerr << "Consider " << s;
299 if (strchr("0123456789",s
[1]) != NULL
)
300 e
.node
= get_special_node(NUMBER_ID
);
302 int iiii
,nnnn
= s
.size();
303 for (iiii
= 0;iiii
< nnnn
;iiii
++)
304 if (viet_ispunct(s
[i
]))
307 e
.node
= get_special_node(PUNCT_ID
);
309 e
.node
= get_special_node(UNK_ID
);
313 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
317 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
320 w
.we
= boost::shared_ptr
<WordEntries
>(new WordEntries(n
));
322 copy(we
.begin(),we
.end(),w
.we
->begin());
323 for (i
= 0;i
< n
;i
++) {
326 // build Lattice structure
331 Post-process after initialize core member of Lattice
334 void Lattice::construct()
336 int i_we
,n_we
= we
->size();
338 for (i_we
= 0;i_we
< n_we
;i_we
++) {
339 //cerr << "=" << i_we << endl;
346 ostream
& operator << (ostream
&os
, const Lattice
&w
)
352 for (i
= 0;i
< n
;i
++)
353 os
<< (*w
.we
)[i
] << endl
;
357 for (i = 0;i < n;i ++) {
358 int ii,nn = w.wi[i].we.size();
359 if (nn) cerr << ">" << i << " ";
360 for (ii = 0;ii < nn;ii ++)
361 cerr << w.wi[i].we[ii] << " ";
364 nn = w.wi[i].fuzzy_map.size();
365 if (nn) cerr << "<" << i << " ";
366 for (ii = 0;ii < nn;ii ++)
367 cerr << w.wi[i].fuzzy_map[ii] << " ";
373 int i, nn = w.get_word_count();
374 for (i = 0;i < nn;i ++) {
375 int nnn = w.get_len(i);
376 for (int ii = 0;ii < nnn;ii ++) {
377 int nnnn = w.get_fuzzy_count(i,ii);
378 if (w.get_we_exact(i,ii))
379 os << *w.get_we_exact(i,ii) << endl;
380 for (int iii = 0;iii < nnnn;iii ++)
381 os << w.get_we_fuzzy(i,ii,iii) << endl;
391 int pos,nr_pos = get_word_count();
392 for (pos = 0;pos < nr_pos;pos ++) {
393 int len,nr_len = get_len(pos);
394 for (len = 0;len < nr_len;len ++)
395 delete (*(*this)[pos])[len];
401 std::ostream
& operator << (std::ostream
&os
,const WordEntry
&we
)
403 using namespace boost
;
404 os
<< format("%d %d %x %d") % we
.pos
% we
.len
% we
.fuzid
% we
.id
<< we
.node
;
410 Construct a Lattice based on another Lattice.
411 Keep only exact matches.
414 void Lattice::based_on(const Lattice
&w
)
416 vector
<WordInfos
> &me
= wi
;
420 me
.resize(w
.get_word_count());
421 int i
,n
= we
->size();
422 for (i
= 0;i
< n
;i
++)
423 if ((*we
)[i
].fuzid
== 0)
427 void Lattice::add(WordEntry
&w
)
429 vector
<WordInfos
> &me
= wi
;
430 if (me
.size() <= w
.pos
)
434 for (unsigned int j
= 0;j
< w
.len
;j
++)
435 if (w
.fuzid
& (1 << j
)) {
436 if (me
.size() <= j
+w
.pos
)
437 me
.resize(j
+w
.pos
+1);
438 me
[j
+w
.pos
].fuzzy_map
.push_back(&w
);
442 WordInfos
&wis
= me
[w
.pos
];
443 wis
.we
.push_back(&w
);
446 WordInfos::WordInfos()
450 unsigned int Lattice::get_len(unsigned int p
) const
452 const WordEntryRefs
&we
= get_we(p
);
456 void apply_separator(std::set
<WordEntry
> &wes
,int i
)
458 set
<WordEntry
>::iterator iter
;
459 //cerr << "Separator applying after " << i << endl;
460 for (iter
= wes
.begin();iter
!= wes
.end(); ++iter
) {
461 //cerr << *iter << endl;
462 if (iter
->pos
<= i
) {
463 if (iter
->pos
+iter
->len
-1 >= i
+1) { // a word that across the separator
464 //cerr << "Remove " << *iter << endl;
468 break; // because it's sorted, we don't need to go farther