[ws10smt.git] / extools / filter_grammar.cc
blob 290a77f54efd03e91ee29b3d0bc4a797bd9280a4
/*
 * Build a suffix tree representation of a test set for grammar filtering.
 * Usage: ./filter_grammar <test set> < unfiltered.grammar > filtered.grammar
 */
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <utility>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <tr1/unordered_map>

#include "filelib.h"
#include "sentence_pair.h"
#include "suffix_tree.h"
#include "extract.h"
#include "fdict.h"
#include "tdict.h"

#include <boost/functional/hash.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
using namespace std;
using namespace std::tr1;

static const size_t MAX_LINE_LENGTH = 64000000;
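
// Maps a phrase (a sequence of WordIDs) to the rule statistics parsed for it
// from a grammar line.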
typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;

namespace {
  inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
  inline bool IsBracket(char c) { return c == '[' || c == ']'; }
  inline void SkipWhitespace(const char* buf, int* ptr) {
    while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
  }
}
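
// Read whitespace-separated tokens from buf[sstart, end) and append their
// WordIDs to *p.  Bracketed tokens ("[...]", i.e. nonterminal markers) and the
// "|||" divider are encoded as -1.  Returns the buffer position reached.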
int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
  static const WordID kDIV = TD::Convert("|||");

  int ptr = sstart;
  while (ptr < end) {
    while (ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
    int start = ptr;
    while (ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
    if (ptr == start) { cerr << "Warning! empty token.\n"; return ptr; }
    // Check whether the token is a nonterminal marker (anything bracketed
    // with [...]) or the ||| divider before integerizing it to a WordID.
    const WordID w = TD::Convert(string(buf, start, ptr - start));

    if ((IsBracket(buf[start]) and IsBracket(buf[ptr - 1])) or (w == kDIV)) {
      p->push_back(-1);
    } else {
      if (w == kDIV) return ptr;
      p->push_back(w);
    }
  }
  return ptr;
}
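
// Parse one line of the unfiltered grammar.  The part before the tab is the
// key ("[X] ||| word word word"), which is integerized into *cur_key; the
// part after the tab alternates phrases and their rule statistics, separated
// by "|||", and is accumulated into *counts.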
void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
  static const WordID kDIV = TD::Convert("|||");
  counts->clear();
  int ptr = 0;
  while (buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
  if (buf[ptr] != '\t') {
    cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
    exit(1);
  }
  cur_key->clear();
  // key is: "[X] ||| word word word"
  int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
  cur_key->push_back(kDIV);
  ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
  ++ptr;  // skip the tab
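  // Scan the value part: segments are delimited by "|||" and alternate
  // between a phrase (state 0) and its statistics (state 1).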
  int start = ptr;
  int end = ptr;
  int state = 0; // 0=reading label, 1=reading count
  vector<WordID> name;
  while (buf[ptr] != 0) {
    while (buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
    if (buf[ptr] == '|') {
      ++ptr;
      if (buf[ptr] == '|') {
        ++ptr;
        if (buf[ptr] == '|') {
          ++ptr;
          end = ptr - 3;
          while (end > start && IsWhitespace(buf[end - 1])) { --end; }
          if (start == end) {
            cerr << "Got empty token!\n LINE=" << buf << endl;
            exit(1);
          }
          switch (state) {
            case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
            case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
            default: cerr << "Can't happen\n"; abort();
          }
          SkipWhitespace(buf, &ptr);
          start = ptr;
        }
      }
    }
  }
  // Handle the trailing segment after the last "|||".
  end = ptr;
  while (end > start && IsWhitespace(buf[end - 1])) { --end; }
  if (end > start) {
    switch (state) {
      case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
      case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
      default: cerr << "Can't happen\n"; abort();
    }
  }
}
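
// Build a suffix tree over the source sides of the test set, then stream the
// unfiltered grammar from stdin and write to stdout only the rules whose
// source-side phrases are found in the tree.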
int main(int argc, char* argv[]) {
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " testset.txt < unfiltered.grammar\n";
    return 1;
  }

  assert(FileExists(argv[1]));
  ReadFile rfts(argv[1]);
  istream& testSet = *rfts.stream();
  ofstream filter_grammar_;
  bool DEBUG = false;

  AnnotatedParallelSentence sent;
  char* buf = new char[MAX_LINE_LENGTH];
  cerr << "Build suffix tree from test set in " << argv[1] << endl;
  // root of the suffix tree
  Node<int> root;
  int line = 0;

  // Process the test set and build the suffix tree.
  while (!testSet.eof()) {
    ++line;
    testSet.getline(buf, MAX_LINE_LENGTH);
    if (buf[0] == 0) continue;

    // Hack: append a fake target side and alignment so the test sentence can
    // be parsed with the AnnotatedParallelSentence machinery.
    strcat(buf, " ||| fake ||| 0-0");
    sent.ParseInputLine(buf);

    if (DEBUG) cerr << line << "||| " << buf << " -- " << sent.f_len << endl;

    // Add each successive suffix of the source sentence to the tree.
    for (int i = 0; i < sent.f_len; i++)
      root.InsertPath(sent.f, i, sent.f_len - 1);
    if (DEBUG) cerr << endl;
  }

  cerr << "Filtering grammar..." << endl;
  // process the unfiltered, unscored grammar
  ID2RuleStatistics cur_counts;
  vector<WordID> cur_key;
  line = 0;
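
  // A rule passes the filter if, walking its source side through the suffix
  // tree, every maximal run of terminals between nonterminal markers (-1) is
  // found in some test sentence; the walk restarts at the root after each -1.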
  while (cin) {
    ++line;
    cin.getline(buf, MAX_LINE_LENGTH);
    if (buf[0] == 0) continue;
    ParseLine(buf, &cur_key, &cur_counts);
    const Node<int>* curnode = &root;
    for (int i = 0; i < static_cast<int>(cur_key.size()) - 1; i++) {
      if (DEBUG) cerr << line << " " << cur_key[i] << " ::: ";
      if (cur_key[i] == -1) {
        // Nonterminal marker (or divider): restart matching from the root.
        curnode = &root;
      } else if (curnode) {
        // Extend the match from the current node rather than the root, so
        // multi-word terminal spans must occur contiguously in the test set.
        curnode = curnode->Extend(cur_key[i]);
        if (!curnode) break;
      }
    }

    // Keep the rule only if the walk never fell out of the tree.
    if (curnode)
      cout << buf << endl;
  }

  return 0;
}