1 // -*- coding: viscii -*-
9 #include <srilm/NgramStats.h>
// Forward declaration of iterate(); its definition appears further down in
// this file. `level` is appended to the output file names written by the
// definition; the use of `os` is not visible in this listing.
11 void iterate(ostream
&os
,int level
);
// File-level list of sentences: filled by the corpus-reading loop below and
// indexed by iterate().
16 vector
<Sentence
> sentences
;
// Driver statements: set up the dictionary, read the corpus into `sentences`,
// then run a fixed number of iterations. The enclosing function header and
// several statements are not visible in this listing.
20 Dictionary::initialize();
22 cerr
<< "Loading... ";
// Call load("wordlist.wl") on the dictionary root.
23 Dictionary::get_root()->load("wordlist.wl");
24 cerr
<< "done" << endl
;
// Give the WFST the dictionary root as its word list.
26 wfst
.set_wordlist(Dictionary::get_root());
// Open the corpus file "corpus2".
27 ifstream
ifs("corpus2");
// Error message for a corpus that cannot be opened; the condition guarding it
// is not visible in this listing.
29 cerr
<< "Can not open corpus\n";
// Read the corpus one line at a time; each line is wrapped in a Sentence and
// appended to the global `sentences` vector.
34 while (getline(ifs
,s
)) {
36 sentences
.push_back(Sentence(s
));
// Reference to the sentence just appended; how it is used is not visible here.
37 Sentence
&st
= sentences
.back();
// Run 50 numbered iterations, each with its own output file.
43 for (int i
= 0;i
< 50;i
++) {
// Output file named by the contents of `oss` (built on lines not visible here).
46 ofstream
ofs(oss
.str().c_str());
47 cerr
<< "Iteration " << i
<< "... ";
// NOTE(review): presumably iterate() is called between these two messages; the
// call is not visible in this listing - confirm.
49 cerr
<< "done" << endl
;
// Forward declaration of print_all_words(). Within this listing it is only
// referenced by a commented-out call inside iterate(); its definition is not
// visible here.
55 void print_all_words(const Words
&words
);
// One pass over every sentence in the global `sentences` vector.
//
// For each sentence the visible code: collects candidate words through the
// WFST, asks it for a best segmentation, feeds the ids of the segmentation
// states to an NgramStats counter, and increments counters on dictionary
// nodes (inc_b for fuzzy-match nodes, inc_a for segmentation states).
// After the loop it recalculates the dictionary, estimates
// Dictionary::ngram from the collected counts, and saves both to files whose
// names end in `level`.
//
// Parameters:
//   os    - output stream; no use of it is visible in this listing.
//   level - number appended to the "wordlist.wl." and "ngram." file names.
//
// NOTE: several statements (declarations of `words`, `seg`, `oss`, `oss1`,
// the assignments to `n` and `nn`, braces) are not visible in this listing.
56 void iterate(ostream
&os
,int level
)
58 int ist
,nr_sentences
= sentences
.size();
// N-gram counter over the dictionary's vocabulary; the second argument (2) is
// presumably the n-gram order per SRILM's NgramStats constructor - confirm.
59 NgramStats
stats(Dictionary::sarch
.get_dict(),2);
60 for (ist
= 0;ist
< nr_sentences
;ist
++) {
61 Sentence
&st
= sentences
[ist
];
// Collect the candidate words for this sentence into `words`.
65 wfst
.get_all_words(st
,words
);
66 //print_all_words(words);
// Choose a segmentation of the sentence from the candidates; result in `seg`.
67 wfst
.segment_best(st
,words
,seg
);
72 int i
,ii
,iii
,n
,nn
,nnn
;
// Buffer of n+1 vocabulary indices (`n` is assigned on a line not shown).
// NOTE(review): the extra slot is presumably for SRILM's Vocab_None
// terminator, and no matching delete[] is visible in this listing - confirm
// both against the full source.
74 VocabIndex
*vi
= new VocabIndex
[n
+1];
// Copy the id of each segmentation item's state into the buffer.
76 for (i
= 0;i
< n
;i
++)
77 vi
[i
] = seg
.items
[i
].state
->get_id();
// Add this sentence's index sequence to the n-gram counts.
78 stats
.countSentence(vi
);
// For every candidate word at every position, call inc_b() on the node of
// each of its fuzzy matches. The assignment of `nn` is not visible here.
81 for (i
= 0;i
< n
;i
++) {
83 for (ii
= 0;ii
< nn
;ii
++) {
84 nnn
= words
[i
][ii
].fuzzy_match
.size();
85 for (iii
= 0;iii
< nnn
;iii
++) {
86 words
[i
][ii
].fuzzy_match
[iii
].node
->inc_b();
// Call inc_a() on the state of every item in the chosen segmentation.
92 for (i
= 0;i
< n
;i
++)
93 seg
.items
[i
].state
->inc_a();
97 cerr
<< "Calculating... ";
// Assign 0 through the value returned by get_b() on the root's child for
// unk_id, then recalculate the dictionary from the root.
98 Dictionary::get_root()->get_next(Dictionary::unk_id
)->get_b() = 0;
99 Dictionary::get_root()->recalculate();
// Estimate the n-gram model from the counts gathered above.
100 Dictionary::ngram
.estimate(stats
);
101 //wfst.enable_ngram(true);
103 cerr
<< "Saving... ";
// Save the dictionary to "wordlist.wl.<level>".
105 oss
<< "wordlist.wl." << level
;
106 Dictionary::get_root()->save(oss
.str().c_str());
// Write the n-gram model to "ngram.<level>", opened with mode "wt".
109 oss1
<< "ngram." << level
;
110 File
f(oss1
.str().c_str(),"wt");
111 Dictionary::ngram
.write(f
);