2 * Build suffix tree representation of a data set for grammar filtering
3 * ./filter_grammar <test set> < unfiltered.grammar > filter.grammar
13 #include <tr1/unordered_map>
16 #include "sentence_pair.h"
17 #include "suffix_tree.h"
22 #include <boost/functional/hash.hpp>
23 #include <boost/program_options.hpp>
24 #include <boost/program_options/variables_map.hpp>
28 using namespace std::tr1
;
30 static const size_t MAX_LINE_LENGTH
= 64000000;
32 typedef unordered_map
<vector
<WordID
>, RuleStatistics
, boost::hash
<vector
<WordID
> > > ID2RuleStatistics
;
36 inline bool IsWhitespace(char c
) { return c
== ' ' || c
== '\t'; }
37 inline bool IsBracket(char c
){return c
== '[' || c
== ']';}
38 inline void SkipWhitespace(const char* buf
, int* ptr
) {
39 while (buf
[*ptr
] && IsWhitespace(buf
[*ptr
])) { ++(*ptr
); }
45 int ReadPhraseUntilDividerOrEnd(const char* buf
, const int sstart
, const int end
, vector
<WordID
>* p
) {
46 static const WordID kDIV
= TD::Convert("|||");
50 while(ptr
< end
&& IsWhitespace(buf
[ptr
])) { ++ptr
; }
52 while(ptr
< end
&& !IsWhitespace(buf
[ptr
])) { ++ptr
; }
53 if (ptr
== start
) {cerr
<< "Warning! empty token.\n"; return ptr
; }
54 //look in the buffer and see if its a nonterminal marker before integerizing it to wordID-anything with [...] or |||
56 const WordID w
= TD::Convert(string(buf
, start
, ptr
- start
));
58 if((IsBracket(buf
[start
]) and IsBracket(buf
[ptr
-1])) or( w
== kDIV
))
61 if (w
== kDIV
) return ptr
;
70 void ParseLine(const char* buf
, vector
<WordID
>* cur_key
, ID2RuleStatistics
* counts
) {
71 static const WordID kDIV
= TD::Convert("|||");
74 while(buf
[ptr
] != 0 && buf
[ptr
] != '\t') { ++ptr
; }
75 if (buf
[ptr
] != '\t') {
76 cerr
<< "Missing tab separator between key and value!\n INPUT=" << buf
<< endl
;
80 // key is: "[X] ||| word word word"
81 int tmpp
= ReadPhraseUntilDividerOrEnd(buf
, 0, ptr
, cur_key
);
82 cur_key
->push_back(kDIV
);
83 ReadPhraseUntilDividerOrEnd(buf
, tmpp
, ptr
, cur_key
);
87 int state
= 0; // 0=reading label, 1=reading count
89 while(buf
[ptr
] != 0) {
90 while(buf
[ptr
] != 0 && buf
[ptr
] != '|') { ++ptr
; }
91 if (buf
[ptr
] == '|') {
93 if (buf
[ptr
] == '|') {
95 if (buf
[ptr
] == '|') {
98 while (end
> start
&& IsWhitespace(buf
[end
-1])) { --end
; }
100 cerr
<< "Got empty token!\n LINE=" << buf
<< endl
;
104 case 0: ++state
; name
.clear(); ReadPhraseUntilDividerOrEnd(buf
, start
, end
, &name
); break;
105 case 1: --state
; (*counts
)[name
].ParseRuleStatistics(buf
, start
, end
); break;
106 default: cerr
<< "Can't happen\n"; abort();
108 SkipWhitespace(buf
, &ptr
);
115 while (end
> start
&& IsWhitespace(buf
[end
-1])) { --end
; }
118 case 0: ++state
; name
.clear(); ReadPhraseUntilDividerOrEnd(buf
, start
, end
, &name
); break;
119 case 1: --state
; (*counts
)[name
].ParseRuleStatistics(buf
, start
, end
); break;
120 default: cerr
<< "Can't happen\n"; abort();
131 int main(int argc
, char* argv
[]){
133 cerr
<< "Usage: " << argv
[0] << " testset.txt < unfiltered.grammar\n";
137 assert(FileExists(argv
[1]));
138 ReadFile
rfts(argv
[1]);
139 istream
& testSet
= *rfts
.stream();
140 ofstream filter_grammar_
;
144 AnnotatedParallelSentence sent
;
145 char* buf
= new char[MAX_LINE_LENGTH
];
146 cerr
<< "Build suffix tree from test set in " << argv
[1] << endl
;
147 //root of the suffix tree
151 /* process the data set to build suffix tree
153 while(!testSet
.eof()) {
155 testSet
.getline(buf
, MAX_LINE_LENGTH
);
156 if (buf
[0] == 0) continue;
158 //hack to read in the test set using the alignedparallelsentence methods
159 strcat(buf
," ||| fake ||| 0-0");
160 sent
.ParseInputLine(buf
);
162 if (DEBUG
)cerr
<< line
<< "||| " << buf
<< " -- " << sent
.f_len
<< endl
;
164 //add each successive suffix to the tree
165 for(int i
=0;i
<sent
.f_len
;i
++)
166 root
.InsertPath(sent
.f
, i
, sent
.f_len
- 1);
171 cerr
<< "Filtering grammar..." << endl
;
172 //process the unfiltered, unscored grammar
174 ID2RuleStatistics cur_counts
;
175 vector
<WordID
> cur_key
;
181 cin
.getline(buf
, MAX_LINE_LENGTH
);
182 if (buf
[0] == 0) continue;
183 ParseLine(buf
, &cur_key
, &cur_counts
);
184 const Node
<int>* curnode
= &root
;
185 for(int i
=0;i
<cur_key
.size() - 1; i
++)
187 if (DEBUG
) cerr
<< line
<< " " << cur_key
[i
] << " ::: ";
191 } else if (curnode
) {
192 curnode
= root
.Extend(cur_key
[i
]);