[ws10smt.git] / extools / filter_grammar.cc
blob 290a77f54efd03e91ee29b3d0bc4a797bd9280a4
/*
 * Build a suffix tree representation of a test set for grammar filtering.
 * Usage: ./filter_grammar <test set> < unfiltered.grammar > filtered.grammar
 */
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <utility>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <tr1/unordered_map>

#include "filelib.h"
#include "sentence_pair.h"
#include "suffix_tree.h"
#include "extract.h"
#include "fdict.h"
#include "tdict.h"

#include <boost/functional/hash.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
using namespace std;
using namespace std::tr1;

static const size_t MAX_LINE_LENGTH = 64000000;
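
// Maps a phrase (a sequence of WordIDs) to the rule statistics parsed for it
// from a grammar line.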
typedef unordered_map<vector<WordID>, RuleStatistics, boost::hash<vector<WordID> > > ID2RuleStatistics;

namespace {
  inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
  inline bool IsBracket(char c) { return c == '[' || c == ']'; }
  inline void SkipWhitespace(const char* buf, int* ptr) {
    while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
  }
}
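
// Read whitespace-separated tokens from buf[sstart, end) and append their
// WordIDs to *p.  Bracketed tokens ("[...]", i.e. nonterminal markers) and the
// "|||" divider are encoded as -1.  Returns the buffer position reached.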
int ReadPhraseUntilDividerOrEnd(const char* buf, const int sstart, const int end, vector<WordID>* p) {
  static const WordID kDIV = TD::Convert("|||");

  int ptr = sstart;
  while (ptr < end) {
    while (ptr < end && IsWhitespace(buf[ptr])) { ++ptr; }
    int start = ptr;
    while (ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
    if (ptr == start) { cerr << "Warning! empty token.\n"; return ptr; }
    // Check whether the token is a nonterminal marker (anything bracketed
    // with [...]) or the ||| divider before integerizing it to a WordID.
    const WordID w = TD::Convert(string(buf, start, ptr - start));

    if ((IsBracket(buf[start]) and IsBracket(buf[ptr - 1])) or (w == kDIV)) {
      p->push_back(-1);
    } else {
      if (w == kDIV) return ptr;
      p->push_back(w);
    }
  }
  return ptr;
}
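
// Parse one line of the unfiltered grammar.  The part before the tab is the
// key ("[X] ||| word word word"), which is integerized into *cur_key; the
// part after the tab alternates phrases and their rule statistics, separated
// by "|||", and is accumulated into *counts.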
void ParseLine(const char* buf, vector<WordID>* cur_key, ID2RuleStatistics* counts) {
  static const WordID kDIV = TD::Convert("|||");
  counts->clear();
  int ptr = 0;
  while (buf[ptr] != 0 && buf[ptr] != '\t') { ++ptr; }
  if (buf[ptr] != '\t') {
    cerr << "Missing tab separator between key and value!\n INPUT=" << buf << endl;
    exit(1);
  }
  cur_key->clear();
  // key is: "[X] ||| word word word"
  int tmpp = ReadPhraseUntilDividerOrEnd(buf, 0, ptr, cur_key);
  cur_key->push_back(kDIV);
  ReadPhraseUntilDividerOrEnd(buf, tmpp, ptr, cur_key);
  ++ptr;  // skip the tab
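  // Scan the value part: segments are delimited by "|||" and alternate
  // between a phrase (state 0) and its statistics (state 1).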
  int start = ptr;
  int end = ptr;
  int state = 0; // 0=reading label, 1=reading count
  vector<WordID> name;
  while (buf[ptr] != 0) {
    while (buf[ptr] != 0 && buf[ptr] != '|') { ++ptr; }
    if (buf[ptr] == '|') {
      ++ptr;
      if (buf[ptr] == '|') {
        ++ptr;
        if (buf[ptr] == '|') {
          ++ptr;
          end = ptr - 3;
          while (end > start && IsWhitespace(buf[end - 1])) { --end; }
          if (start == end) {
            cerr << "Got empty token!\n LINE=" << buf << endl;
            exit(1);
          }
          switch (state) {
            case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
            case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
            default: cerr << "Can't happen\n"; abort();
          }
          SkipWhitespace(buf, &ptr);
          start = ptr;
        }
      }
    }
  }
  // Handle the trailing segment after the last "|||".
  end = ptr;
  while (end > start && IsWhitespace(buf[end - 1])) { --end; }
  if (end > start) {
    switch (state) {
      case 0: ++state; name.clear(); ReadPhraseUntilDividerOrEnd(buf, start, end, &name); break;
      case 1: --state; (*counts)[name].ParseRuleStatistics(buf, start, end); break;
      default: cerr << "Can't happen\n"; abort();
    }
  }
}
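
// Build a suffix tree over the source sides of the test set, then stream the
// unfiltered grammar from stdin and write to stdout only the rules whose
// source-side phrases are found in the tree.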
int main(int argc, char* argv[]) {
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " testset.txt < unfiltered.grammar\n";
    return 1;
  }

  assert(FileExists(argv[1]));
  ReadFile rfts(argv[1]);
  istream& testSet = *rfts.stream();
  ofstream filter_grammar_;
  bool DEBUG = false;

  AnnotatedParallelSentence sent;
  char* buf = new char[MAX_LINE_LENGTH];
  cerr << "Build suffix tree from test set in " << argv[1] << endl;
  // root of the suffix tree
  Node<int> root;
  int line = 0;

  // Process the test set and build the suffix tree.
  while (!testSet.eof()) {
    ++line;
    testSet.getline(buf, MAX_LINE_LENGTH);
    if (buf[0] == 0) continue;

    // Hack: append a fake target side and alignment so the test sentence can
    // be parsed with the AnnotatedParallelSentence machinery.
    strcat(buf, " ||| fake ||| 0-0");
    sent.ParseInputLine(buf);

    if (DEBUG) cerr << line << "||| " << buf << " -- " << sent.f_len << endl;

    // Add each successive suffix of the source sentence to the tree.
    for (int i = 0; i < sent.f_len; i++)
      root.InsertPath(sent.f, i, sent.f_len - 1);
    if (DEBUG) cerr << endl;
  }

  cerr << "Filtering grammar..." << endl;
  // process the unfiltered, unscored grammar
  ID2RuleStatistics cur_counts;
  vector<WordID> cur_key;
  line = 0;
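
  // A rule passes the filter if, walking its source side through the suffix
  // tree, every maximal run of terminals between nonterminal markers (-1) is
  // found in some test sentence; the walk restarts at the root after each -1.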
  while (cin) {
    ++line;
    cin.getline(buf, MAX_LINE_LENGTH);
    if (buf[0] == 0) continue;
    ParseLine(buf, &cur_key, &cur_counts);
    const Node<int>* curnode = &root;
    for (int i = 0; i < static_cast<int>(cur_key.size()) - 1; i++) {
      if (DEBUG) cerr << line << " " << cur_key[i] << " ::: ";
      if (cur_key[i] == -1) {
        // Nonterminal marker (or divider): restart matching from the root.
        curnode = &root;
      } else if (curnode) {
        // Extend the match from the current node rather than the root, so
        // multi-word terminal spans must occur contiguously in the test set.
        curnode = curnode->Extend(cur_key[i]);
        if (!curnode) break;
      }
    }

    // Keep the rule only if the walk never fell out of the tree.
    if (curnode)
      cout << buf << endl;
  }

  return 0;
}