Fix the strange bug mentioned in the last revision. The reason is changing from multi...
[vspell.git] / tests / wfst-train.cpp
blob1165c6190f73f851aedeabf9571ba99544bb60c5
1 // -*- tab-width: 2 -*-
2 #include "pfs.h"
3 #include "distance.h"
4 #include <string>
5 #include <fstream>
6 #include <cmath>
7 #include <cstdio>
8 #include <sstream>
9 #include <iostream>
10 #include "sentence.h"
11 #include <boost/format.hpp>
13 using namespace std;
16 //NgramFractionalStats stats(sarch.get_dict(),2);
// Training driver. Reads text lines from standard input, splits them into
// sentences, and accumulates two sets of n-gram counts: one over syllables
// (syllable_stats, constructed with 2) and one over the nodes of the path
// returned by PFS::search (stats, constructed with 3). After the input is
// exhausted it estimates both models and writes them out.
//
// argv[1]: suffix of the existing "ngram.<suffix>" file that is loaded first.
// argv[2]: suffix of the "ngram.<suffix>" and "syngram.<suffix>" output files.
// Returns 0 on every visible path, including the usage error.
//
// NOTE(review): lines that held only a brace or a comment delimiter are not
// visible in this copy of the file, so block and comment boundaries below
// have to be confirmed against the repository before changing any code.
18 int main(int argc,char **argv)
// Two positional arguments are required.
// NOTE(review): the usage error returns 0, so a calling script cannot detect
// the failure from the exit status.
20 if (argc < 3) {
21 fprintf(stderr,"Need at least 2 argument.\n");
22 return 0;
25 char *oldres = argv[1];
26 char *newres = argv[2];
// nofuz is not referenced again in the visible code; nofuz2 appears only in
// the disabled segment_best block further down.
27 bool nofuz = true;
28 bool nofuz2 = true;
29 const char *str;
31 dic_init();
33 cerr << "Loading... ";
34 //str = (boost::format("wordlist.%s") % oldres).str().c_str();
// The word list is always loaded from the fixed name "wordlist"; the
// suffixed variant above is disabled.
35 str = "wordlist";
36 warch.load(str);
// NOTE(review): c_str() is taken from the temporary std::string returned by
// boost::format::str(). That temporary is destroyed at the end of this
// statement, so str is dangling when File reads it on the next line. Keeping
// the formatted name in a named std::string would avoid this. The same
// pattern is used for both output file names near the end of main.
37 str = (boost::format("ngram.%s") % oldres).str().c_str();
38 File f(str,"rt",0);
// A failed open is reported but is not fatal: the run continues without
// reading the previous n-gram file.
39 if (!f.error())
40 get_ngram().read(f);
41 else
42 cerr << "Ngram loading error..." << endl;
43 cerr << "done" << endl;
45 get_sarch().set_blocked(true);
47 //wfst.set_wordlist(get_root());
49 string s;
// Only i, n and z are used in the visible code; ii, iii, nn and nnn are not.
50 int i,ii,iii,n,nn,nnn,z;
51 int count = 0;
52 NgramStats stats(get_sarch().get_dict(),3);
53 NgramStats syllable_stats(get_sarch().get_dict(),2);
// Main loop: one iteration per input line. count is incremented before the
// empty-line check, so blank lines are included in the progress figure that
// is printed to stderr every 200 lines.
54 while (getline(cin,s)) {
55 count ++;
56 if (count % 200 == 0)
57 cerr << count << endl;
58 if (s.empty())
59 continue;
60 vector<string> ss;
61 sentences_split(s,ss);
// z is a signed int compared against the unsigned ss.size().
62 for (z = 0;z < ss.size();z ++) {
63 Sentence st(ss[z]);
64 st.standardize();
65 st.tokenize();
// Sentences that yield no syllables are skipped.
66 if (!st.get_syllable_count())
67 continue;
68 //cerr << st << endl;
69 Lattice words;
70 set<WordEntry> wes;
// Only the exact and lower-case word state factories are registered; the
// fuzzy factory is disabled.
71 WordStateFactories factories;
72 ExactWordStateFactory exact;
73 LowerWordStateFactory lower;
74 //FuzzyWordStateFactory fuzzy;
75 factories.push_back(&exact);
76 factories.push_back(&lower);
77 //factories.push_back(&fuzzy);
// NOTE(review): `sent` is not declared anywhere in the visible code (the
// local Sentence above is named `st`) - confirm where it comes from.
78 words.pre_construct(sent,wes,factories);
79 mark_proper_name(sent,wes);
80 words.post_construct(wes);
81 //cerr << words << endl;
82 Segmentation seg(words.we);
83 PFS wfst;
// Disabled alternative that selected between the fuzzy and non-fuzzy
// segmenters. NOTE(review): the end of this block comment is not visible in
// this copy - presumably it closes just before `Path path;`; confirm.
84 /* // pfs don't distinguish
85 if (nofuz2)
86 wfst.segment_best_no_fuzzy(words,seg);
87 else
88 wfst.segment_best(words,seg);
// Search the word DAG for a path, then copy that path into seg without its
// first and last elements.
// NOTE(review): path.size()-2 assumes the path always has at least two
// elements - confirm that search() guarantees this.
90 Path path;
91 WordDAG dag(&words);
92 wfst.search(dag,path);
93 seg.resize(path.size()-2);
94 copy(path.begin()+1,path.end()-1,seg.begin());
96 //seg.pretty_print(cout,st) << endl;
// Syllable counts: build an array holding get_cid() of every syllable,
// terminated by Vocab_None, hand it to countSentence, then free it.
98 VocabIndex *vi;
99 n = st.get_syllable_count();
100 vi = new VocabIndex[n+1];
101 vi[n] = Vocab_None;
102 for (i = 0;i < n;i ++)
103 vi[i] = st[i].get_cid();
104 syllable_stats.countSentence(vi);
105 delete[] vi;
// Word counts: only paths with more than three nodes are counted. The DAG's
// begin and end nodes are mapped to the ids of START_ID and STOP_ID; every
// other node contributes the id reached through its WordEntry.
106 n = path.size();
107 if (n > 3) {
108 vi = new VocabIndex[n+1];
109 vi[n] = Vocab_None;
110 for (i = 0;i < n;i ++) {
111 if (path[i] == dag.node_begin())
112 vi[i] = get_id(START_ID);
113 else if (path[i] == dag.node_end())
114 vi[i] = get_id(STOP_ID);
115 else
116 vi[i] = ((WordEntry*)dag.node_info(path[i]))->node.node->get_id();
117 //cerr << "<" << sarch[vi[i]] << "> ";
119 //cerr << endl;
120 //cerr << n << endl;
121 stats.countSentence(vi);
122 //cerr << "done" << endl;
123 delete[] vi;
// Per-entry counters: inc_b() on every word entry of the lattice and inc_a()
// on every element of seg.
// NOTE(review): seg was filled from path above, which does not obviously fit
// the seg[i].node.node access here; this section may sit inside a comment
// whose delimiters are not visible in this copy - confirm.
127 const WordEntries &we = *words.we;
128 n = we.size();
129 for (i = 0;i < n;i ++) {
130 we[i].node.node->inc_b();
133 n = seg.size();
134 for (i = 0;i < n;i ++)
135 seg[i].node.node->inc_a();
// Estimate the word n-gram model and the syllable n-gram model from the
// accumulated counts.
140 cerr << "Calculating... ";
141 //get_root()->get_next(unk_id)->get_b() = 0;
142 //get_root()->recalculate();
143 get_ngram().estimate(stats);
144 get_syngram().estimate(syllable_stats);
145 //wfst.enable_ngram(true);
// Write both models using the argv[2] suffix.
// NOTE(review): both file names below are read through a pointer into a
// destroyed temporary string, as described at the n-gram load above.
147 cerr << "Saving... ";
148 //str = (boost::format("wordlist.wl.%s") % newres).str().c_str();
149 //get_root()->save(str);
151 str = (boost::format("ngram.%s") % newres).str().c_str();
152 File ff(str,"wt");
153 get_ngram().write(ff);
154 str = (boost::format("syngram.%s") % newres).str().c_str();
155 File fff(str,"wt");
156 get_syngram().write(fff);
157 cerr << endl;
// Fifty passes, each writing to its own "log.<i>" file.
// NOTE(review): iterate() is not declared anywhere in the visible code; this
// loop may be inside a comment whose delimiters are not visible - confirm.
159 for (int i = 0;i < 50;i ++) {
160 ostringstream oss;
161 oss << "log." << i;
162 ofstream ofs(oss.str().c_str());
163 cerr << "Iteration " << i << "... ";
164 iterate(ofs,i);
165 cerr << "done" << endl;
168 return 0;