Fix the strange bug mentioned in the last revision. The reason is changing from multi...
[vspell.git] / tests / sc-train.cpp
blobda158952862a138d2f0b324bb89583394bfb54ac
1 // -*- tab-width: 2 -*-
#include "distance.h"
#include <string>
#include <vector>
#include <fstream>
#include <cmath>
#include <cstdio>
#include <sstream>
#include <iostream>
#include "sentence.h"
#include "softcount.h"
#include <boost/format.hpp>
13 using namespace std;
15 void estimate(Ngram &ngram,NgramFractionalStats &stats);
17 int main(int argc,char **argv)
19 if (argc < 3) {
20 fprintf(stderr,"Need at least 2 argument.\n");
21 return 0;
24 char *oldres = argv[1];
25 char *newres = argv[2];
26 bool nofuz = true;
27 bool nofuz2 = true;
28 const char *str;
30 dic_init();
32 cerr << "Loading... ";
33 str = "wordlist.wl";
34 warch.load(str);
35 str = (boost::format("ngram.%s") % oldres).str().c_str();
36 File f(str,"rt",0);
37 if (!f.error())
38 get_ngram().read(f);
39 else
40 cerr << "Ngram loading error..." << endl;
41 cerr << "done" << endl;
43 get_sarch().set_blocked(true);
45 string s;
46 int i,ii,iii,n,nn,nnn,z;
47 int count = 0;
48 NgramFractionalStats stats(get_sarch().get_dict(),2);
49 while (getline(cin,s)) {
50 count ++;
51 if (count % 200 == 0)
52 cerr << count << endl;
53 if (s.empty())
54 continue;
55 vector<string> ss;
56 sentences_split(s,ss);
57 for (z = 0;z < ss.size();z ++) {
58 Sentence st(ss[z]);
59 st.standardize();
60 st.tokenize();
61 if (!st.get_syllable_count())
62 continue;
63 Lattice words;
64 words.construct(st);
65 Segmentation seg(words.we);
66 SoftCounter sc;
67 sc.count(words,stats);
71 cerr << "Calculating... ";
72 //estimate(ngram,stats);
73 get_ngram().estimate(stats,NULL);
74 //wfst.enable_ngram(true);
76 cerr << "Saving... ";
77 str = (boost::format("get_ngram().%s") % newres).str().c_str();
78 File ff(str,"wt");
79 get_ngram().write(ff);
80 cerr << endl;
82 for (int i = 0;i < 50;i ++) {
83 ostringstream oss;
84 oss << "log." << i;
85 ofstream ofs(oss.str().c_str());
86 cerr << "Iteration " << i << "... ";
87 iterate(ofs,i);
88 cerr << "done" << endl;
91 return 0;
94 void estimate(Ngram &ngram,NgramFractionalStats &stats)
97 * If no discount method was specified we do the default, standard
98 * thing. Good Turing discounting with the specified min and max counts
99 * for all orders.
101 unsigned order = get_ngram().setorder(0);
102 Discount *discounts[order];
103 unsigned i;
104 Boolean error = false;
106 for (i = 1; !error & i <= order; i++) {
107 discounts[i-1] = new GoodTuring(GT_defaultMinCount, GT_defaultMaxCount);
109 * Transfer the LMStats's debug level to the newly
110 * created discount objects
112 discounts[i-1]->debugme(stats.debuglevel());
114 if (!discounts[i-1]->estimate(stats, i)) {
115 std::cerr << "failed to estimate GT discount for order " << i + 1
116 << std::endl;
117 error = true;
121 if (!error) {
122 error = !get_ngram().estimate((NgramCounts<FloatCount>&)stats, discounts);
125 for (i = 1; i <= order; i++) {
126 delete discounts[i-1];