1 #include "softcount.h" // -*- tab-width: 2 -*-
3 #include <boost/format.hpp>
7 Based on the article "Discovering Chinese Words from Unsegmented Text".
8 The idea is that we count all possible words in a sentence with a fraction count
9 instead of just count the words of the best segmentation in the sentence (count=1)
17 ostream
& SoftCounter::count_lattice(const Lattice
&w
, ostream
&os
, bool first_count
)
19 vector
<long double> Sleft
,Sright
;
20 vector
<vector
<uint
> > prev
;
21 //boost::shared_ptr<WordEntries> p_we = w.we;
22 //const WordEntries &we = *p_we;
23 int i
,n
= w
.get_word_count(),v
,vv
;
29 Sleft
.resize(w
.we
->size());
30 Sright
.resize(w
.we
->size());
31 prev
.resize(w
.we
->size());
36 for (i
= 0;i
< n
;i
++) {
37 const WordEntryRefs
&wers
= w
.get_we(i
);
38 int ii
,nn
= wers
.size();
40 for (ii
= 0;ii
< nn
;ii
++) {
41 // wers[ii] is the first node (W).
45 vi
[0] = get_id(START_ID
);
46 Sleft
[v
] = first_count
? 1 : -(long double)get_ngram().wordProb((*w
.we
)[v
].node
.node
->get_id(),vi
);
47 //cerr << "Sleft(" << vi[0] << "," << v << ") = " << Sleft[v] << endl;
50 int next
= wers
[ii
]->pos
+wers
[ii
]->len
;
52 const WordEntryRefs
&wers2
= w
.get_we(next
);
53 int iii
,nnn
= wers2
.size();
54 for (iii
= 0;iii
< nnn
;iii
++) {
55 // wers2[iii] is the second node (W').
57 vi
[0] = (*w
.we
)[v
].node
.node
->get_id();
58 add
= first_count
? 1 : Sleft
[v
]*(-(long double)get_ngram().wordProb((*w
.we
)[vv
].node
.node
->get_id(),vi
));
61 //cerr << "Sleft(" << vi[0] << "," << vv << ") = " << Sleft[vv] << endl;
63 // init prev references for Sright phase
64 prev
[vv
].push_back(v
);
67 vi
[0] = (*w
.we
)[v
].node
.node
->get_id();
68 Sright
[v
] = first_count
? 1 : -(long double)get_ngram().wordProb(get_id(STOP_ID
),vi
);
69 //cerr << "Sright(" << vi[0] << "," << v << ") = " << Sright[v] << endl;
71 //cerr << "Sum " << sum << endl;
77 // second pass: Sright
78 Sright
[w
.we
->size()-1] = 1;
79 for (i
= n
-1;i
>= 0;i
--) {
80 const WordEntryRefs
&wers
= w
.get_we(i
);
81 int ii
,nn
= wers
.size();
83 for (ii
= 0;ii
< nn
;ii
++) {
84 // wers[ii] is the first node (W).
87 int iii
,nnn
= prev
[v
].size();
88 for (iii
= 0;iii
< nnn
;iii
++) {
89 // vv is the second node (W').
91 vi
[0] = (*w
.we
)[vv
].node
.node
->get_id();
92 add
= first_count
? 1 : Sright
[v
]*(-(long double)get_ngram().wordProb((*w
.we
)[v
].node
.node
->get_id(),vi
));
95 //cerr << "Sright(" << vi[0] << "," << vv << ") = " << Sright[vv] << endl;
97 // collect fractional counts
98 fc
= Sleft
[vv
]*add
/sum
; // P(v/vv)
99 vi
[0] = (*w
.we
)[vv
].node
.node
->get_id();
100 vi
[1] = (*w
.we
)[v
].node
.node
->get_id();
102 //*stats.insertCount(vi) += fc;
103 os
<< boost::format("%s %s %f\n") % get_ngram()[vi
[0]] % get_ngram()[vi
[1]] << fc
;
104 //cerr << "Gram " << vi[0] << "," << vi[1] << "+=" << fc << endl;
110 // collect fc of ends
111 // we can use Sright with no problems becase there is only one edge
112 // from ends[i] to the end.
115 vi
[1] = get_id(STOP_ID
);
116 for (i
= 0;i
< n
;i
++) {
117 fc
= Sleft
[ends
[i
]]*Sright
[ends
[i
]]/sum
;
118 vi
[0] = (*w
.we
)[ends
[i
]].node
.node
->get_id();
119 //cerr << "Gram " << vi[0] << "," << vi[1] << "+=" << fc << endl;
120 //*stats.insertCount(vi) += fc;
121 os
<< boost::format("%s %s %f\n") % get_ngram()[vi
[0]] % get_ngram()[vi
[1]] << fc
;
124 vi
[0] = get_id(START_ID
);
125 const WordEntryRefs
&wers
= w
.get_we(0);
127 for (i
= 0;i
< n
;i
++) {
128 fc
= Sleft
[wers
[i
]->id
]*Sright
[wers
[i
]->id
]/sum
;
129 vi
[1] = (*w
.we
)[wers
[i
]->id
].node
.node
->get_id();
130 //cerr << "Gram " << vi[0] << "," << vi[1] << "+=" << fc << endl;
131 //*stats.insertCount(vi) += fc;
132 os
<< boost::format("%s %s %f\n") % get_ngram()[vi
[0]] % get_ngram()[vi
[1]] << fc
;
138 ostream
& SoftCounter::count_dag(const DAG
&dag
,ostream
&os
,int id
, bool first_count
)
140 int n
= dag
.node_count();
141 vector
<long double> Sleft(n
),Sright(n
);
142 vector
<set
<uint
> > prev(n
);
146 //cerr << "Nodes: " << n << endl;
148 vector
<bool> mark(n
);
150 Sleft
[dag
.node_begin()] = 1;
151 traces
.push_back(dag
.node_begin());
153 while (!traces
.empty()) {
160 std::vector
<uint
> nexts
;
161 dag
.get_next(v
,nexts
);
164 for (i
= 0;i
< n
;i
++) {
166 add
= Sleft
[v
]*(first_count
? 1 : LogPtoProb(-dag
.edge_value(v
,vv
)));
167 if (add
== 0.0 || add
== -0.0)
168 cerr
<< boost::format("WARNING: %d: Sleft addition for %d is zero (Sleft[%d] = %Lg, prob=%g)") % id
% vv
% v
% Sleft
[v
] % LogPtoProb(-dag
.edge_value(v
,vv
)) << endl
;
171 traces
.push_back(vv
);
173 // init prev references for Sright phase
178 //cerr << "Sleft done" << endl;
181 // second pass: Sright
182 long double sum
= Sleft
[dag
.node_end()];
183 if (sum
== 0.0 || sum
== -0.0) {
184 cerr
<< boost::format("WARNING: %d: Sum is zero") % id
<< endl
;
185 // Can do nothing more because sum is zero
188 Sright
[dag
.node_end()] = 1;
190 traces
.push_back(dag
.node_end()); // the last v above
191 while (!traces
.empty()) {
199 set
<uint
>::iterator iter
;
200 for (iter
= prev
[vv
].begin();iter
!= prev
[vv
].end(); ++iter
) {
204 add
= Sright
[vv
]*(first_count
? 1 : LogPtoProb(-dag
.edge_value(v
,vv
)));
205 if (add
== 0.0 || add
== -0.0)
206 cerr
<< boost::format("WARNING: %d: Sright addition for %d is zero") % id
% v
<< endl
;
209 // collect fractional counts
210 fc
= Sleft
[v
]*add
/sum
; // P(vv/v)
213 if (dag
.fill_vi(v
,vv
,vvv
,vi
,9)) {
215 for (jn
= 0;vi
[jn
] != 0 && jn
< 9; jn
++);
216 if (jn
< 9 && vi
[jn
] == 0) {
217 for (j
= 0;j
< jn
/2;j
++) {
224 //stats.countSentence(vi,/*LogPtoProb(--fc)*/fc);
225 //*stats.insertCount(vi) += fc;
227 for (int i_vi
= 0; vi
[i_vi
] != 0; i_vi
++)
228 cout
<< get_ngram()[vi
[i_vi
]] << " ";
235 //cerr << "Sright done" << endl;
239 void SoftCounter::record2(const DAG
&dag
,FILE *fp
,int id
)
241 int n
= dag
.node_count();
242 vector
<set
<uint
> > prev(n
);
245 vector
<bool> mark(n
);
247 traces
.push_back(dag
.node_begin());
249 fprintf(fp
,"%d %d %d\n",n
,dag
.node_begin(),dag
.node_end());
251 while (!traces
.empty()) {
258 std::vector
<uint
> nexts
;
259 dag
.get_next(v
,nexts
);
262 for (i
= 0;i
< n
;i
++) {
265 VocabIndex edge_vi
[2],edge_v
;
266 dag
.fill_vi(v
,vv
,edge_v
,edge_vi
,2);
267 fprintf(fp
,"L %d %d %s %s\n",v
,vv
,get_ngram()[edge_vi
[0]],get_ngram()[edge_v
]);
269 traces
.push_back(vv
);
271 // init prev references for Sright phase
277 traces
.push_back(dag
.node_end()); // the last v above
278 while (!traces
.empty()) {
286 set
<uint
>::iterator iter
;
287 for (iter
= prev
[vv
].begin();iter
!= prev
[vv
].end(); ++iter
) {
291 VocabIndex edge_vi
[2],edge_v
;
292 dag
.fill_vi(v
,vv
,edge_v
,edge_vi
,2);
293 fprintf(fp
,"R %d %d %s %s\n",v
,vv
,get_ngram()[edge_vi
[0]],get_ngram()[edge_v
]);
296 fprintf(fp
,"E 0 0 none none\n");
299 int SoftCounter::replay2(FILE *fp_in
,FILE *fp_out
, int id
,bool first_count
)
301 int n
, node_begin
, node_end
;
303 if (fscanf(fp_in
,"%d %d %d\n",&n
,&node_begin
,&node_end
) != 3) {
304 fprintf(stderr
,"Error: %d: Could not read dag count\n",id
);
308 vector
<long double> Sleft(n
),Sright(n
);
311 char type
[2]; // should not be longer than one
312 char str1
[100],str2
[100];
316 //cerr << "Nodes: " << n << endl;
318 Sleft
[node_begin
] = 1;
320 while (fscanf(fp_in
,"%s %d %d %s %s\n",type
,&v
,&vv
,str1
,str2
) == 5) {
323 //fprintf(stderr,"Got %s %d %d %s %s\n",type,v,vv,str1,str2);
324 VocabIndex edge_vi
[2],edge_v
;
325 edge_v
= get_ngram()[str2
];
326 edge_vi
[0] = get_ngram()[str1
];
328 if (type
[0] == 'L') {
329 add
= Sleft
[v
]*(first_count
? 1 : (long double)LogPtoProb(get_ngram().wordProb(edge_v
,edge_vi
)));
330 if (add
== 0.0 || add
== -0.0) {
331 fprintf(stderr
,"WARNING: %d: Sleft addition for %d is zero (Sleft[%d] = %Lg, prob=%g %s %s)\n",id
,vv
,v
,Sleft
[v
],LogPtoProb(get_ngram().wordProb(edge_v
,edge_vi
)),str1
,str2
);
339 sum
= Sleft
[node_end
];
340 if (sum
== 0.0 || sum
== -0.0) {
341 fprintf(stderr
,"WARNING: %d: Sum is zero\n",id
);
343 Sright
[node_end
] = 1;
345 if (sum
== 0.0 || sum
== -0.0)
347 add
= Sright
[vv
]*(first_count
? 1 : (long double)LogPtoProb(get_ngram().wordProb(edge_v
,edge_vi
)));
348 if (add
== 0.0 || add
== -0.0) {
349 fprintf(stderr
,"WARNING: %d: Sright addition for %d is zero (%s %s)\n",id
,v
,str1
,str2
);
354 // collect fractional counts
355 long double fc
= Sleft
[v
]*add
/sum
; // P(vv/v)
356 fprintf(fp_out
,"%s %s %Lg\n",str1
,str2
,fc
);
363 A work-around because NgramFractionalStats is still buggy.
367 ostream
& SoftCounter::count_dag_fixed(const DAG
&dag
,ostream
&os
,bool first_count
)
369 vector
<long double> Sleft
,Sright
;
370 vector
<set
<uint
> > prev
;
374 n
= dag
.node_count();
378 //cerr << "Nodes: " << n << endl;
382 traces
.push_back(dag
.node_begin());
384 while (itrace
< traces
.size()) {
385 v
= traces
[itrace
++];
386 std::vector
<uint
> nexts
;
387 dag
.get_next(v
,nexts
);
390 for (i
= 0;i
< n
;i
++) {
391 vector
<uint
>::iterator iter
= find(traces
.begin(),traces
.end(),nexts
[i
]);
392 if (iter
!= traces
.end()) {
394 if (iter
- traces
.begin() <= itrace
-1)
397 traces
.push_back(nexts
[i
]);
402 uint ntrace
= traces
.size();
403 Sleft
[dag
.node_begin()] = 1; // log(1) = 0
404 for (itrace
= 0;itrace
< ntrace
;itrace
++) {
406 //cout << " " << v << ":" << Sleft[v];
407 std::vector
<uint
> nexts
;
408 dag
.get_next(v
,nexts
);
411 for (i
= 0;i
< n
;i
++) {
413 add
= Sleft
[v
]*dag
.edge_value(v
,vv
);
414 //cout << "-" << dag.edge_value(v,vv) << "-" << Sleft[vv] << ">" << vv;
417 // init prev references for Sright phase
422 //cerr << "Sleft done" << endl;
425 // second pass: Sright
426 long double sum
= Sleft
[dag
.node_end()];
429 //cout << "Sum " << sum << endl;
432 traces
.push_back(dag
.node_end());
434 while (itrace
< traces
.size()) {
435 vv
= traces
[itrace
++];
437 set
<uint
>::iterator iter
;
438 for (iter
= prev
[vv
].begin();iter
!= prev
[vv
].end(); ++iter
) {
439 vector
<uint
>::iterator iiter
= find(traces
.begin(),traces
.end(),*iter
);
440 if (iiter
!= traces
.end()) {
442 if (iiter
- traces
.begin() <= itrace
-1)
445 traces
.push_back(*iter
);
449 ntrace
= traces
.size();
450 Sright
[dag
.node_end()] = 1; // log(1) = 0
451 for (itrace
= 0;itrace
< ntrace
;itrace
++) {
453 //cout << " " << vv << ":" << Sright[vv];
454 set
<uint
>::iterator iter
;
455 for (iter
= prev
[vv
].begin();iter
!= prev
[vv
].end(); ++iter
) {
457 add
= Sright
[vv
]*dag
.edge_value(v
,vv
);
458 //cout << "-" << dag.edge_value(v,vv)<< "-" << Sright[v] << ">" << v;
461 // collect fractional counts
462 fc
= 100-(unsigned int)((Sleft
[v
]*add
)*100.0/sum
); // P(vv/v)
463 //cout << Sleft[v] << "+" << dag.edge_value(v,vv) << Sright[vv]<< "=("<< (Sleft[v]+add)<< ")"<<((Sleft[v]+add)/sum) <<"_" << fc << endl;
464 cerr
<< v
<< " " << vv
<< " " << fc
<< endl
;
468 if (dag
.fill_vi(v
,vv
,vvv
,vi
,9)) {
470 for (jn
= 0;vi
[jn
] != 0 && jn
< 9; jn
++);
471 if (jn
< 9 && vi
[jn
] == 0) {
472 for (j
= 0;j
< jn
/2;j
++) {
479 //stats.countSentence(vi,(unsigned int)(fc*10.0));
480 //*stats.insertCount(vi) += fc;
488 //long double sum2 = Sright[dag.node_begin()];
489 //cout << sum2 << " " << (traces[ntrace-1] == dag.node_begin()) << " " << (sum2 == sum ? "Ok" : "Failed") << endl;
490 //cerr << "Sright done" << endl;