1 #include "config.h" // -*- tab-width: 2 -*-
8 #include <boost/format.hpp>
// Stream-insertion operator for Sections.
// Walks the syllables of the sentence referenced by me.st and writes, for
// each one, the get_sarch() entry for its id through a %s format. When
// syllable i is the last one of section ii (start + len - 1 == i), a closing
// bracket followed by that section length is written as well.
// NOTE(review): this listing omits several source lines (braces, the body of
// the first branch, the return statement), so only the visible statements are
// described here.
19 std::ostream
& operator << (std::ostream
&os
,const Sections
&me
)
21 using namespace boost
;
// Sentence the sections refer to.
22 const Sentence
&_sent
= *me
.st
;
// Only n receives an initializer in this declaration; i is set by the loop.
// NOTE(review): ii and nn are read below, so they must be assigned on lines
// not shown in this listing - TODO confirm.
23 unsigned int i
,ii
,nn
,n
= _sent
.get_syllable_count();
// One iteration per syllable of the sentence.
26 for (i
= 0;i
< n
;i
++) {
// Syllable i is the first syllable of section ii (branch body not shown).
27 if (ii
< nn
&& me
[ii
].start
== i
)
// Write the get_sarch() entry selected by the id of syllable i.
29 os
<< format("%s") % get_sarch()[_sent
[i
].id
];
// Syllable i is the last syllable of section ii.
30 if (ii
< nn
&& me
[ii
].start
+me
[ii
].len
-1 == i
) {
// Close the section and append its length.
31 os
<< "]" << me
[ii
].len
;
40 Split Lattice into Sections
// Builds the section list of this object from a Lattice.
// Visible steps: fill a per-position bound vector sized by
// words.get_word_count(), scan it position by position, and append Section
// entries to *this, merging an entry into the previous one when the gap
// between them is below NGRAM_LENGTH-1.
// NOTE(review): many source lines are missing from this listing (braces, the
// declarations of sect and is_section, several branch conditions), so the
// comments below describe only what the visible statements do.
45 void Sections::construct(const Lattice
&words
)
// Alias for this object, used below as the output container.
47 Sections
& sects
= *this;
49 // mark all possible words. All bounds left is really bounds.
50 // because we need at most n-1 boundary syllable for n-gram
51 // if two ambiguous sections is near than n-1 syllables, then merge them.
52 unsigned int i
,ii
,n
,nn
;
// Number of positions reported by the lattice.
56 n
= words
.get_word_count();
// One entry per position.
58 vector
<uint
> bound(n
);
60 for (i
= 0;i
< n
;i
++) {
// Value of get_len for position i.
61 nn
= words
.get_len(i
);
// NOTE(review): nn is unsigned, so nn-2 wraps to a huge value whenever
// get_len returns less than 2 - TODO confirm that cannot happen.
// The loop body is not shown in this listing.
62 for (ii
= 0;ii
< nn
-2;ii
++)
// NOTE(review): when n is 0 the vector is empty and size()-1 wraps around,
// making this an out-of-range write - TODO confirm callers never pass an
// empty lattice, or that a guard exists on lines not shown.
67 bound
[bound
.size()-1] = 1; // it's obvious there is a boundary in the end
69 //copy(bound.begin(),bound.end(),ostream_iterator<int>(cerr," "));
// len is the number of bound entries; pos is assigned by the loop below.
72 int pos
,len
= bound
.size();
// Scan every position of the bound vector.
78 for (pos
= 0;pos
< len
;pos
++) {
79 // ignore "1" boundaries
84 // just write down and figure out what the formulas mean
// The candidate section covers sect.start through pos inclusive.
85 sect
.len
= pos
- sect
.start
+ 1;
// Set when get_len(sect.start) exceeds 2, or is at least 2 and
// get_fuzzy_map(sect.start) is non-empty.
86 is_section
= words
.get_len(sect
.start
) > 2 ||
87 (words
.get_len(sect
.start
) >= 2 &&
88 !words
.get_fuzzy_map(sect
.start
).empty());
91 // now merge two sections (this and the previous) if needed
// Most recently stored section.
93 Section
&prev
= sects
.back();
// Gap between the end of prev and the start of sect is below NGRAM_LENGTH-1.
94 if (sect
.start
- (prev
.start
+ prev
.len
) < NGRAM_LENGTH
-1)
// Extend prev so that it ends at pos instead of storing a new entry.
95 prev
.len
= pos
- prev
.start
+ 1; // merge
97 sects
.push_back(sect
); // not merge
99 sects
.push_back(sect
); // sects is empty -> nothing to merge
// Section running from sect.start up to position n, then stored.
// NOTE(review): the condition guarding these two statements is not shown;
// presumably this handles a trailing section after the scan - TODO confirm.
106 sect
.len
= n
-sect
.start
;
107 sects
.push_back(sect
);
// Stream-insertion operator for Segmentation.
// Writes the node member of every element of seg in order, each wrapped in
// square brackets and followed by a single space.
// NOTE(review): the braces and the return statement are not part of this
// listing.
111 std::ostream
& operator <<(std::ostream
&os
,const Segmentation
&seg
)
// n is the element count of seg; i is assigned by the loop.
113 int i
,n
= seg
.size();
114 for (i
= 0;i
< n
;i
++)
// One bracketed node per element.
115 os
<< "[" << seg
[i
].node
<< "] ";
// Enumerates segmentations through segtor and scores each one with the
// n-gram model; a lower accumulated seg.prob is compared against
// final_seg.prob, which starts at 1000000.
// NOTE(review): this listing is incomplete (arguments of segtor.init, the
// body of the final comparison and the end of the function are not shown),
// so only the visible statements are described.
119 void Section::segment_best(const Lattice
&w
,Segmentation
&final_seg
)
// Working segmentation, constructed from w.we.
121 Segmentation
seg(w
.we
);
// Initial score that any candidate has to beat.
123 final_seg
.prob
= 1000000;
125 segtor
.init(w
, // Lattice
// Context buffer handed to wordProb; holds NGRAM_LENGTH elements, so the
// valid indices are 0 .. NGRAM_LENGTH-1.
// NOTE(review): no matching delete[] is visible in this excerpt - TODO
// confirm the buffer is released on the lines not shown.
129 VocabIndex
*vi
= new VocabIndex
[NGRAM_LENGTH
];
// Each successful step yields the next candidate in seg.
130 while (segtor
.step(seg
)) {
131 // compute ngram. take the best seg.
// NOTE(review): index NGRAM_LENGTH is one past the end of the allocation
// above (out-of-bounds write). The loop below fills 0 .. NGRAM_LENGTH-2, so
// the terminator presumably belongs at NGRAM_LENGTH-1 - TODO confirm and fix.
133 vi
[NGRAM_LENGTH
] = Vocab_None
;
// Score every element that has NGRAM_LENGTH-1 predecessors.
134 for (unsigned int ii
= NGRAM_LENGTH
-1;ii
< seg
.size();ii
++) {
// Copy the ids of the preceding elements, nearest one first.
135 for (unsigned int j
= 0;j
< NGRAM_LENGTH
-1;j
++)
136 vi
[j
] = seg
[ii
-1-j
].node
.node
->get_id();
// Accumulate the negated wordProb value of element ii given that context.
137 seg
.prob
+= -get_ngram().wordProb(seg
[ii
].node
.node
->get_id(),vi
);
// Candidate scores lower than the best so far (branch body not shown).
140 if (seg
.prob
< final_seg
.prob
)
143 //cerr << seg << " " << seg.prob << endl;