softcount: tolerate zero ngrams
[vspell.git] / utils / sc-train.cpp
bloba14f68f423eb6aa91be4e5f95bc15ad58c316615
1 // -*- tab-width: 2 -*-
2 #include "distance.h"
3 #include <string>
4 #include <fstream>
5 #include <cmath>
6 #include <cstdio>
7 #include <sstream>
8 #include <iostream>
9 #include "sentence.h"
10 #include "softcount.h"
11 #include "propername.h"
12 #include <boost/format.hpp>
14 using namespace std;
16 int main(int argc,char **argv)
18 bool nofuz = true;
19 bool nofuz2 = true;
20 bool trigram = false;
21 bool first_count = false;
22 bool record_sc = false;
23 bool replay_sc = false;
24 const char *str;
25 const char *wordlist = NULL;
27 int argi = 1;
28 argc --;
30 if (argc >= 2 && !strcmp(argv[argi], "--wordlist")) {
31 wordlist = argv[argi+1];
32 argi += 2;
33 argc -= 2;
35 if (argc >= 1 && !strcmp(argv[argi],"--record")) {
36 record_sc = true;
37 argc --;
38 argi ++;
41 if (argc >= 1 && !strcmp(argv[argi],"--replay")) {
42 replay_sc = true;
43 argc --;
44 argi ++;
47 if (argc >= 1) {
48 str = argv[argi];
49 if (!get_ngram().read(str)) {
50 first_count = true;
51 cerr << "Ngram loading error..." << endl;
54 else
55 first_count = true;
56 if (first_count)
57 cerr << "First count " << endl;
58 dic_init();
59 if (!warch.load(first_count ? wordlist : NULL))
60 cerr << "Error loading " << wordlist << endl;
62 get_ngram().set_blocked(true);
64 int count = 0;
65 if (replay_sc) {
66 while (!feof(stdin)) {
67 if (++count % 200 == 0)
68 fprintf(stderr,"%d\n",count);
69 if (SoftCounter::replay2(stdin,stdout,count-1,first_count) == -2)
70 break;
72 return 0;
74 while (!cin.eof()) {
75 if (++count % 200 == 0)
76 cerr << count << endl;
77 Lattice lat;
78 cin >> lat;
79 WordDAG dagw(&lat);
80 DAG *dag = &dagw;
81 WordDAG2 *dagw2;
82 if (trigram && !record_sc) {
83 dagw2 = new WordDAG2(&dagw);
84 dag = dagw2;
86 //sc.count(words,stats);
87 if (record_sc) {
88 SoftCounter::record2(*dag,stdout,count-1);
90 else
91 SoftCounter::count_dag(*dag,cout,count-1,first_count);
92 if (trigram && !record_sc)
93 delete (WordDAG2*)dag;
94 get_ngram().clear_oov();
96 return 0;