1 // -*- tab-width: 2 -*-
11 #include "propername.h"
12 #include <boost/format.hpp>
17 //NgramFractionalStats stats(sarch.get_dict(),2);
19 int main(int argc
,char **argv
)
22 fprintf(stderr
,"Need at least 2 argument.\n");
26 char *oldres
= argv
[1];
27 char *newres
= argv
[2];
34 cerr
<< "Loading... ";
35 //str = (boost::format("wordlist.%s") % oldres).str().c_str();
38 str
= (boost::format("ngram.%s") % oldres
).str().c_str();
43 cerr
<< "Ngram loading error..." << endl
;
44 cerr
<< "done" << endl
;
46 get_sarch().set_blocked(true);
48 //wfst.set_wordlist(get_root());
51 int i
,ii
,iii
,n
,nn
,nnn
,z
;
52 int count
= 0,ccount
= 0;
53 NgramStats
stats(get_sarch().get_dict(),3);
54 //NgramStats syllable_stats(get_sarch().get_dict(),2);
55 while (getline(cin
,s
)) {
57 //cerr << ">" << ccount << endl;
61 sentences_split(s
,ss
);
62 for (z
= 0;z
< ss
.size();z
++) {
64 if (count
% 1000 == 0)
65 cerr
<< count
<< endl
;
69 if (!st
.get_syllable_count())
74 WordStateFactories factories
;
75 ExactWordStateFactory exact
;
76 LowerWordStateFactory lower
;
77 //FuzzyWordStateFactory fuzzy;
78 factories
.push_back(&exact
);
79 factories
.push_back(&lower
);
80 //factories.push_back(&fuzzy);
81 words
.pre_construct(st
,wes
,factories
);
82 mark_proper_name(st
,wes
);
83 words
.post_construct(wes
);
84 //cerr << words << endl;
85 Segmentation
seg(words
.we
);
87 /* // pfs don't distinguish
89 wfst.segment_best_no_fuzzy(words,seg);
91 wfst.segment_best(words,seg);
95 wfst
.search(dag
,path
);
96 seg
.resize(path
.size()-2);
97 copy(path
.begin()+1,path
.end()-1,seg
.begin());
99 //seg.pretty_print(cout,st) << endl;
104 vi
= new VocabIndex
[n
+1];
106 for (i
= 0;i
< n
;i
++) {
107 if (path
[i
] == dag
.node_begin())
108 vi
[i
] = get_id(START_ID
);
109 else if (path
[i
] == dag
.node_end())
110 vi
[i
] = get_id(STOP_ID
);
112 vi
[i
] = ((WordEntry
*)dag
.node_info(path
[i
]))->node
.node
->get_id();
114 if (!sarch.in_dict(vi[i])) {
115 cerr << ">>" << ccount << " " << count << " " << vi[i] << endl;
116 vi[i] = get_id(UNK_ID);
120 //cerr << "<" << sarch[vi[i]] << "> ";
124 stats
.countSentence(vi
);
125 //cerr << "done" << endl;
130 const WordEntries &we = *words.we;
132 for (i = 0;i < n;i ++) {
133 we[i].node.node->inc_b();
137 for (i = 0;i < n;i ++)
138 seg[i].node.node->inc_a();
144 cerr
<< "Calculating... ";
145 //get_root()->get_next(unk_id)->get_b() = 0;
146 //get_root()->recalculate();
147 get_ngram().estimate(stats
);
148 //wfst.enable_ngram(true);
150 cerr
<< "Saving... ";
151 //str = (boost::format("wordlist.wl.%s") % newres).str().c_str();
152 //get_root()->save(str);
154 str
= (boost::format("ngram.%s") % newres
).str().c_str();
156 get_ngram().write(ff
);
159 for (int i = 0;i < 50;i ++) {
162 ofstream ofs(oss.str().c_str());
163 cerr << "Iteration " << i << "... ";
165 cerr << "done" << endl;