1 #include "config.h" // -*- tab-width: 2 -*-
8 #include <boost/format.hpp>
11 #include "propername.h"
16 WordState::WordState(const WordState
&ws
):dnode(ws
.dnode
),sent(ws
.sent
),fuzid(ws
.fuzid
),pos(ws
.pos
),len(ws
.len
)
20 void WordState::add_word(set
<WordEntry
> &we
,LeafNNode
*leaf
)
27 //cerr << "Add " << e << endl;
31 void WordState::collect_words(set
<WordEntry
> &we
)
33 LeafNNode
*leaf
= dnode
.node
->get_leaf(sarch
["<mainleaf>"]);
38 void UpperWordState::collect_words(set
<WordEntry
> &we
)
40 LeafNNode
*leaf
= dnode
.node
->get_leaf(sarch
["<caseleaf>"]);
45 void WordState::get_first(WordStates
&states
,uint _pos
)
47 dnode
.node
= warch
.get_root();
54 void ExactWordState::get_next(WordStates
&states
)
57 BranchNNode
*branch
= dnode
.node
->get_branch(sent
[i
].get_cid());
62 //cerr << "Exact: " << get_sarch()[sent[i].get_cid()] << endl;
63 states
.push_back(this);
69 void LowerWordState::get_next(WordStates
&states
)
73 s1
= get_ngram()[sent
[i
].get_cid()];
74 s2
= get_lowercased_syllable(s1
);
75 BranchNNode
*branch
= dnode
.node
->get_branch(get_ngram()[s2
]);
80 //cerr << "Lower: " << get_lowercased_syllable(get_sarch()[sent[i].get_cid()]) << endl;
81 states
.push_back(this);
86 fuzid
|= 1 << (i
-pos
);
89 void FuzzyWordState::get_next(WordStates
&states
)
91 vector
<confusion_set
>& confusion_sets
= get_confusion_sets();
92 int i
,j
,m
,n
= confusion_sets
.size();
94 set
<Syllable
> syllset
,syllset2
;
97 string s1
= get_ngram()[sent
[_i
].get_cid()];
99 _syll
.parse(s1
.c_str());
101 syllset2
.insert(_syll
);
102 while (!syllset2
.empty()) {
103 const Syllable sy
= *syllset2
.begin();
104 syllset2
.erase(syllset2
.begin());
106 if (syllset
.find(sy
) != syllset
.end())
107 continue; // we already matched&applied this syllable
109 //cerr << sy << endl;
113 vector
<Syllable
> sylls
;
115 for (i
= 0;i
< n
;i
++) {
116 m
= confusion_sets
[i
].size();
117 for (j
= 0;j
< m
;j
++)
118 if (confusion_sets
[i
][j
].match(sy
)) {
119 //cerr << "Match " << i << " " << j << endl;
123 for (j
= 0;j
< m
;j
++) {
124 confusion_sets
[i
][j
].apply(sy
,sylls
);
125 //cerr << "Apply " << i << " " << j << endl;
129 copy(sylls
.begin(),sylls
.end(), inserter(syllset2
,syllset2
.begin()));
133 //copy(syllset.begin(),syllset.end(),ostream_iterator<Syllable>(cerr)); cerr << endl;
134 set
<Syllable
>::iterator iter
;
135 for (iter
= syllset
.begin();iter
!= syllset
.end(); ++ iter
) {
136 //cerr << iter->to_std_str() << endl;
137 string str
= get_lowercased_syllable(iter
->to_std_str());
138 BranchNNode::const_np_range range
= dnode
.node
->get_nodes().equal_range(get_ngram()[str
]);
139 BranchNNode::node_map::const_iterator pnode
;
140 for (pnode
= range
.first
;pnode
!= range
.second
;++pnode
)
141 if (!pnode
->second
->is_leaf()) {
142 //cerr << "Fuzzy: " << iter->to_std_str() << endl;
143 FuzzyWordState
*s
= new FuzzyWordState(*this);
146 s
->dnode
.node
= (BranchNNode
*)pnode
->second
.get();
149 s
->fuzid
|= 1 << (_i
-s
->pos
);
151 //cerr << nodes[ii] << endl;
159 Find all possible words. The whole process is divided into two
160 phases. The first one is pre_construct(), which creates base words
161 and store in set<WordEntry>. The second one is post_construct(),
162 which does the rest. Using pre_construct(),post_construct() directly
163 give chances to modify the lattice creation.
165 \param sent specify the input sentence
166 \param w must be cleared before calling this function
168 void Lattice::construct(const boost::shared_ptr
<const Sentence
> &sent
)
171 WordStateFactories factories
;
172 ExactWordStateFactory exact
;
173 LowerWordStateFactory lower
;
174 UpperWordStateFactory upper
;
175 FuzzyWordStateFactory fuzzy
;
176 factories
.push_back(&exact
);
177 factories
.push_back(&lower
);
178 factories
.push_back(&upper
);
179 factories
.push_back(&fuzzy
);
180 pre_construct(sent
,wes
,factories
);
181 mark_proper_name(*sent
,wes
);
186 The first phase of lattice creation. Create all possible words to we.
189 void Lattice::pre_construct(const boost::shared_ptr
<const Sentence
> &st
,set
<WordEntry
> &we
,const WordStateFactories
&f
)
192 int i
,n
,ii
,nn
,k
,nnn
,iii
;
193 int fi
,fn
= f
.size();
195 //cerr << "construct\n";
204 n
= st
->get_syllable_count();
206 for (i
= 0;i
< n
;i
++) {
208 //cerr << *this << endl;
213 for (fi
= 0;fi
< fn
;fi
++)
214 f
[fi
]->create_new(states2
,i
,*st
);
216 // move old states to new states
218 for (ii
= 0;ii
< nn
;ii
++)
219 // state1[ii].get_next() have to delete itself if necessary.
220 states1
[ii
]->get_next(states2
);
222 // get completed words
224 for (ii
= 0;ii
< nn
;ii
++)
225 states2
[ii
]->collect_words(we
);
227 states1
.swap(states2
);
230 //cerr << *this << endl;
233 for (ii
= 0;ii
< nn
;ii
++)
235 //cerr << *this << endl;
240 The second phase of lattice creation.
243 bool we_pos_cmp(const WordEntry
&w1
,const WordEntry
&w2
)
245 return w1
.pos
< w2
.pos
;
248 void Lattice::post_construct(set
<WordEntry
> &we
)
253 n
= st
->get_syllable_count();
255 uint max
= 0,len
= 0;
258 // find out how far we can go from head (pos -> max, len -> len)
260 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr
;
261 set
<WordEntry
>::iterator iter
;
263 while (!traces
.empty()) {
265 WordEntry fake_entry
;
266 fake_entry
.pos
= *traces
.begin();
267 traces
.erase(traces
.begin());
268 pr
= equal_range(we
.begin(),we
.end(),fake_entry
,we_pos_cmp
);
269 for (iter
= pr
.first
;iter
!= pr
.second
; ++iter
) {
270 if (iter
->pos
+iter
->len
< n
)
271 traces
.insert(iter
->pos
+iter
->len
);
272 pair
<set
<WordEntry
>::iterator
,set
<WordEntry
>::iterator
> pr2
;
273 set
<WordEntry
>::iterator iter2
;
274 WordEntry fake_entry2
;
275 fake_entry2
.pos
= iter
->pos
+iter
->len
;
276 pr2
= equal_range(we
.begin(),we
.end(),fake_entry2
,we_pos_cmp
);
277 if (pr2
.first
== pr2
.second
&& len
< iter
->pos
+iter
->len
) {
279 len
= iter
->pos
+iter
->len
;
284 // Yee, we're coming tail!
288 // make sure that one can go from source to destination, without gaps.
293 // if one starts with a cardinal number, then mark it number_id
294 string s
= get_ngram()[(*st
)[max
].get_cid()];
295 //cerr << "Consider " << s;
296 if (strchr("0123456789",s
[1]) != NULL
)
297 e
.node
= get_special_node(NUMBER_ID
);
299 int iiii
,nnnn
= s
.size();
300 for (iiii
= 0;iiii
< nnnn
;iiii
++)
301 if (viet_ispunct((unsigned char)s
[iiii
]))
304 e
.node
= get_special_node(PUNCT_ID
);
306 e
.node
= get_special_node(UNK_ID
);
310 //cerr << " " << get_sarch()[e.node.node->get_id()] << endl;
314 //copy(we.begin(),we.end(),ostream_iterator<WordEntry>(cerr));
317 w
.we
= boost::shared_ptr
<WordEntries
>(new WordEntries(n
));
319 copy(we
.begin(),we
.end(),w
.we
->begin());
320 for (i
= 0;i
< n
;i
++) {
323 // build Lattice structure
328 Post-process after initialize core member of Lattice
331 void Lattice::construct()
333 int i_we
,n_we
= we
->size();
335 for (i_we
= 0;i_we
< n_we
;i_we
++) {
336 //cerr << "=" << i_we << endl;
343 ostream
& operator << (ostream
&os
, const Lattice
&w
)
349 for (i
= 0;i
< n
;i
++)
350 os
<< (*w
.we
)[i
] << endl
;
355 for (i = 0;i < n;i ++) {
356 int ii,nn = w.wi[i].we.size();
357 if (nn) cerr << ">" << i << " ";
358 for (ii = 0;ii < nn;ii ++)
359 cerr << w.wi[i].we[ii] << " ";
362 nn = w.wi[i].fuzzy_map.size();
363 if (nn) cerr << "<" << i << " ";
364 for (ii = 0;ii < nn;ii ++)
365 cerr << w.wi[i].fuzzy_map[ii] << " ";
371 int i, nn = w.get_word_count();
372 for (i = 0;i < nn;i ++) {
373 int nnn = w.get_len(i);
374 for (int ii = 0;ii < nnn;ii ++) {
375 int nnnn = w.get_fuzzy_count(i,ii);
376 if (w.get_we_exact(i,ii))
377 os << *w.get_we_exact(i,ii) << endl;
378 for (int iii = 0;iii < nnnn;iii ++)
379 os << w.get_we_fuzzy(i,ii,iii) << endl;
386 istream
& operator >> (istream
&is
, Lattice
&w
)
389 w
.we
= boost::shared_ptr
<WordEntries
>(new WordEntries
);
392 while (getline(is
,s
)) {
395 istringstream
iss(s
);
396 w
.we
->push_back(WordEntry());
398 if (w
.we
->back().node
.node
== NULL
) {
399 cerr
<< "Invalid WordEntry!" << endl
;
404 for (i
= 0;i
< n
;i
++) {
408 boost::shared_ptr
<Sentence
> st(new Sentence(w
));
416 int pos,nr_pos = get_word_count();
417 for (pos = 0;pos < nr_pos;pos ++) {
418 int len,nr_len = get_len(pos);
419 for (len = 0;len < nr_len;len ++)
420 delete (*(*this)[pos])[len];
426 std::ostream
& operator << (std::ostream
&os
,const WordEntry
&we
)
428 os
<< boost::format("%d %d %04x ") % we
.pos
% we
.len
% we
.fuzid
<< we
.node
;
432 std::istream
& operator >> (std::istream
&is
,WordEntry
&we
)
434 is
>> we
.pos
>> we
.len
>> hex
>> we
.fuzid
>> dec
>> we
.node
;
439 Construct a Lattice based on another Lattice.
440 Keep only exact matches.
443 void Lattice::based_on(const Lattice
&w
)
445 vector
<WordInfos
> &me
= wi
;
449 me
.resize(w
.get_word_count());
450 int i
,n
= we
->size();
451 for (i
= 0;i
< n
;i
++)
452 if ((*we
)[i
].fuzid
== 0)
456 void Lattice::add(WordEntry
&w
)
458 vector
<WordInfos
> &me
= wi
;
459 if (me
.size() <= w
.pos
)
463 for (unsigned int j
= 0;j
< w
.len
;j
++)
464 if (w
.fuzid
& (1 << j
)) {
465 if (me
.size() <= j
+w
.pos
)
466 me
.resize(j
+w
.pos
+1);
467 me
[j
+w
.pos
].fuzzy_map
.push_back(&w
);
471 WordInfos
&wis
= me
[w
.pos
];
472 wis
.we
.push_back(&w
);
475 WordInfos::WordInfos()
479 unsigned int Lattice::get_len(unsigned int p
) const
481 const WordEntryRefs
&we
= get_we(p
);
485 void apply_separator(std::set
<WordEntry
> &wes
,int i
)
487 set
<WordEntry
>::iterator iter
;
488 //cerr << "Separator applying after " << i << endl;
489 for (iter
= wes
.begin();iter
!= wes
.end(); ++iter
) {
490 //cerr << *iter << endl;
491 if (iter
->pos
<= i
) {
492 if (iter
->pos
+iter
->len
-1 >= i
+1) { // a word that across the separator
493 //cerr << "Remove " << *iter << endl;
497 break; // because it's sorted, we don't need to go farther