Change Words to Lattice
[vspell.git] / tests / wfst-train.cpp
blob6097c0152aba0b8bbfefd246548efae9b73d3573
1 // -*- tab-width: 2 -*-
2 #include "pfs.h"
3 #include "distance.h"
4 #include <string>
5 #include <fstream>
6 #include <cmath>
7 #include <cstdio>
8 #include <sstream>
9 #include <iostream>
10 #include "sentence.h"
11 #include <boost/format.hpp>
13 using namespace std;
16 //NgramFractionalStats stats(sarch.get_dict(),2);
18 int main(int argc,char **argv)
20 if (argc < 3) {
21 fprintf(stderr,"Need at least 2 argument.\n");
22 return 0;
25 char *oldres = argv[1];
26 char *newres = argv[2];
27 bool nofuz = true;
28 bool nofuz2 = true;
29 const char *str;
31 dic_init(nofuz ?
32 new WordNode(sarch["<root>"]) :
33 new FuzzyWordNode(sarch["<root>"]));
35 cerr << "Loading... ";
36 //str = (boost::format("wordlist.wl.%s") % oldres).str().c_str();
37 str = "wordlist.wl";
38 get_root()->load(str);
39 str = (boost::format("ngram.%s") % oldres).str().c_str();
40 File f(str,"rt",0);
41 if (!f.error())
42 ngram.read(f);
43 else
44 cerr << "Ngram loading error..." << endl;
45 cerr << "done" << endl;
47 sarch.set_blocked(true);
49 //wfst.set_wordlist(get_root());
51 string s;
52 int i,ii,iii,n,nn,nnn,z;
53 int count = 0;
54 NgramStats stats(sarch.get_dict(),2);
55 while (getline(cin,s)) {
56 count ++;
57 if (count % 200 == 0)
58 cerr << count << endl;
59 if (s.empty())
60 continue;
61 vector<string> ss;
62 sentences_split(s,ss);
63 for (z = 0;z < ss.size();z ++) {
64 Sentence st(ss[z]);
65 st.standardize();
66 st.tokenize();
67 if (!st.get_syllable_count())
68 continue;
69 Lattice words;
70 words.construct(st);
71 //cerr << words << endl;
72 Segmentation seg(words.we);
73 PFS wfst;
74 if (nofuz2)
75 wfst.segment_best_no_fuzzy(words,seg);
76 else
77 wfst.segment_best(words,seg);
79 //seg.pretty_print(cout,st) << endl;
81 n = seg.size();
82 VocabIndex *vi = new VocabIndex[n+3];
83 vi[0] = start_id;
84 vi[n+1] = stop_id;
85 vi[n+2] = Vocab_None;
86 for (i = 0;i < n;i ++) {
87 vi[i+1] = seg[i].node.node->get_id();
88 //cerr << "<" << sarch[vi[i]] << "> ";
90 //cerr << endl;
91 stats.countSentence(vi);
92 delete[] vi;
95 const WordEntries &we = *words.we;
96 n = we.size();
97 for (i = 0;i < n;i ++) {
98 we[i].node.node->inc_b();
101 n = seg.size();
102 for (i = 0;i < n;i ++)
103 seg[i].node.node->inc_a();
108 cerr << "Calculating... ";
109 //get_root()->get_next(unk_id)->get_b() = 0;
110 //get_root()->recalculate();
111 ngram.estimate(stats);
112 //wfst.enable_ngram(true);
114 cerr << "Saving... ";
115 //str = (boost::format("wordlist.wl.%s") % newres).str().c_str();
116 //get_root()->save(str);
118 str = (boost::format("ngram.%s") % newres).str().c_str();
119 File ff(str,"wt");
120 ngram.write(ff);
121 cerr << endl;
123 for (int i = 0;i < 50;i ++) {
124 ostringstream oss;
125 oss << "log." << i;
126 ofstream ofs(oss.str().c_str());
127 cerr << "Iteration " << i << "... ";
128 iterate(ofs,i);
129 cerr << "done" << endl;
132 return 0;