extools/extract.cc

   1 #include "extract.h"
   2
   3 #include <queue>
   4 #include <vector>
   5 #include <utility>
   6 #include <tr1/unordered_map>
   7 #include <set>
   8
   9 #include <boost/functional/hash.hpp>
  10
  11 #include "sentence_pair.h"
  12 #include "tdict.h"
  13 #include "wordid.h"
  14 #include "array2d.h"
  15
  16 using namespace std;
  17 using namespace tr1;
  18
  19 namespace {
  20   inline bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
  21
  22   inline void SkipWhitespace(const char* buf, int* ptr) {
  23     while (buf[*ptr] && IsWhitespace(buf[*ptr])) { ++(*ptr); }
  24   }
  25 }
  26
  27 Extract::RuleObserver::~RuleObserver() {
  28   cerr << "Rules extracted: " << count << endl;
  29 }
  30
  31 void Extract::ExtractBasePhrases(const int max_base_phrase_size,
  32                         const AnnotatedParallelSentence& sentence,
  33                         vector<ParallelSpan>* phrases) {
  34   phrases->clear();
  35
  36   vector<pair<int,int> > f_spans(sentence.f_len, pair<int,int>(sentence.e_len, 0));
  37   vector<pair<int,int> > e_spans(sentence.e_len, pair<int,int>(sentence.f_len, 0));
  38   // for each alignment point in e, precompute the minimal consistent phrases in f
  39   // for each alignment point in f, precompute the minimal consistent phrases in e
  40   for (int i = 0; i < sentence.f_len; ++i) {
  41     for (int j = 0; j < sentence.e_len; ++j) {
  42       if (sentence.aligned(i,j)) {
  43         if (j < f_spans[i].first) f_spans[i].first = j;
  44         f_spans[i].second = j+1;
  45         if (i < e_spans[j].first) e_spans[j].first = i;
  46         e_spans[j].second = i+1;
  47       }
  48     }
  49   }
  50
  51   for (int i1 = 0; i1 < sentence.f_len; ++i1) {
  52     if (sentence.f_aligned[i1] == 0) continue;
  53     int j1 = sentence.e_len;
  54     int j2 = 0;
  55     const int i_limit = min(sentence.f_len, i1 + max_base_phrase_size);
  56     for (int i2 = i1 + 1; i2 <= i_limit; ++i2) {
  57       if (sentence.f_aligned[i2-1] == 0) continue;
  58       // cerr << "F has aligned span " << i1 << " to " << i2 << endl;
  59       j1 = min(j1, f_spans[i2-1].first);
  60       j2 = max(j2, f_spans[i2-1].second);
  61       if (j1 >= j2) continue;
  62       if (j2 - j1 > max_base_phrase_size) continue;
  63       int condition = 0;
  64       for (int j = j1; j < j2; ++j) {
  65         if (e_spans[j].first < i1) { condition = 1; break; }
  66         if (e_spans[j].second > i2) { condition = 2; break; }
  67       }
  68       if (condition == 1) break;
  69       if (condition == 2) continue;
  70       // category types added later!
  71       phrases->push_back(ParallelSpan(i1, i2, j1, j2));
  72       // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl;
  73     }
  74   }
  75 }
  76
  77 void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence& sentence,
  78                                  const int max_base_phrase_size,
  79                                  vector<ParallelSpan>* phrases) {
  80   const int num_phrases = phrases->size();
  81   map<int, map<int, map<int, map<int, bool> > > > marker;
  82   for (int i = 0; i < num_phrases; ++i) {
  83     const ParallelSpan& cur = (*phrases)[i];
  84     marker[cur.i1][cur.i2][cur.j1][cur.j2] = true;
  85   }
  86   for (int i = 0; i < num_phrases; ++i) {
  87     const ParallelSpan& cur = (*phrases)[i];
  88     const int i1_max = cur.i1;
  89     const int i2_min = cur.i2;
  90     const int j1_max = cur.j1;
  91     const int j2_min = cur.j2;
  92     int i1_min = i1_max;
  93     while (i1_min > 0 && sentence.f_aligned[i1_min-1] == 0) { --i1_min; }
  94     int j1_min = j1_max;
  95     while (j1_min > 0 && sentence.e_aligned[j1_min-1] == 0) { --j1_min; }
  96     int i2_max = i2_min;
  97     while (i2_max < sentence.f_len && sentence.f_aligned[i2_max] == 0) { ++i2_max; }
  98     int j2_max = j2_min;
  99     while (j2_max < sentence.e_len && sentence.e_aligned[j2_max] == 0) { ++j2_max; }
 100     for (int i1 = i1_min; i1 <= i1_max; ++i1) {
 101       const int ilim = min(i2_max, i1 + max_base_phrase_size);
 102       for (int i2 = max(i1+1,i2_min); i2 <= ilim; ++i2) {
 103         for (int j1 = j1_min; j1 <= j1_max; ++j1) {
 104           const int jlim = min(j2_max, j1 + max_base_phrase_size);
 105           for (int j2 = max(j1+1, j2_min); j2 <= jlim; ++j2) {
 106             bool& seen = marker[i1][i2][j1][j2];
 107             if (!seen)
 108               phrases->push_back(ParallelSpan(i1,i2,j1,j2));
 109             seen = true;
 110           }
 111         }
 112       }
 113     }
 114   }
 115 }
 116
 117 // this uses the TARGET span (i,j) to annotate phrases, will copy
 118 // phrases if there is more than one annotation.
 119 // TODO: support source annotation
 120 void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat,
 121                                       const Array2D<vector<WordID> >& types,
 122                                       vector<ParallelSpan>* phrases) {
 123   const int num_unannotated_phrases = phrases->size();
 124   // have to use num_unannotated_phrases since we may grow the vector
 125   for (int i = 0; i < num_unannotated_phrases; ++i) {
 126     ParallelSpan& phrase = (*phrases)[i];
 127     const vector<WordID>* pcats = &types(phrase.j1, phrase.j2);
 128     if (pcats->empty() && default_cat != 0) {
 129       static vector<WordID> s_default(1, default_cat);
 130       pcats = &s_default;
 131     }
 132     if (pcats->empty()) {
 133       cerr << "ERROR span " << phrase.i1 << "," << phrase.i2 << "-"
 134            << phrase.j1 << "," << phrase.j2 << " has no type. "
 135               "Did you forget --default_category?\n";
 136     }
 137     const vector<WordID>& cats = *pcats;
 138     phrase.cat = cats[0];
 139     for (int ci = 1; ci < cats.size(); ++ci) {
 140       ParallelSpan new_phrase = phrase;
 141       new_phrase.cat = cats[ci];
 142       phrases->push_back(new_phrase);
 143     }
 144   }
 145 }
 146
 147 // a partially complete (f-side) of a rule
 148 struct RuleItem {
 149   vector<ParallelSpan> f;
 150   int i,j,syms,vars;
 151   explicit RuleItem(int pi) : i(pi), j(pi), syms(), vars() {}
 152   void Extend(const WordID& fword) {
 153     f.push_back(ParallelSpan(fword));
 154     ++j;
 155     ++syms;
 156   }
 157   void Extend(const ParallelSpan& subphrase) {
 158     f.push_back(subphrase);
 159     j += subphrase.i2 - subphrase.i1;
 160     ++vars;
 161     ++syms;
 162   }
 163   bool RuleFEndsInVariable() const {
 164     if (f.size() > 0) {
 165       return f.back().IsVariable();
 166     } else { return false; }
 167   }
 168 };
 169
 170 void Extract::ExtractConsistentRules(const AnnotatedParallelSentence& sentence,
 171                           const vector<ParallelSpan>& phrases,
 172                           const int max_vars,
 173                           const int max_syms,
 174                           const bool permit_adjacent_nonterminals,
 175                           const bool require_aligned_terminal,
 176                           RuleObserver* observer) {
 177   queue<RuleItem> q;  // agenda for BFS
 178   int max_len = -1;
 179   unordered_map<pair<short, short>, vector<ParallelSpan>, boost::hash<pair<short, short> > > fspans;
 180   vector<vector<ParallelSpan> > spans_by_start(sentence.f_len);
 181   set<int> starts;
 182   for (int i = 0; i < phrases.size(); ++i) {
 183     fspans[make_pair(phrases[i].i1,phrases[i].i2)].push_back(phrases[i]);
 184     max_len = max(max_len, phrases[i].i2 - phrases[i].i1);
 185     // have we already added a rule item starting at phrases[i].i1?
 186     if (starts.insert(phrases[i].i1).second)
 187       q.push(RuleItem(phrases[i].i1));
 188     spans_by_start[phrases[i].i1].push_back(phrases[i]);
 189   }
 190   starts.clear();
 191   vector<pair<int,int> > next_e(sentence.e_len);
 192   vector<WordID> cur_rhs_f, cur_rhs_e;
 193   vector<pair<short, short> > cur_terminal_align;
 194   vector<int> cur_es, cur_fs;
 195   while(!q.empty()) {
 196     const RuleItem& rule = q.front();
 197
 198     // extend the partial rule
 199     if (rule.j < sentence.f_len && (rule.j - rule.i) < max_len && rule.syms < max_syms) {
 200       RuleItem ew = rule;
 201
 202       // extend with a word
 203       ew.Extend(sentence.f[ew.j]);
 204       q.push(ew);
 205
 206       // with variables
 207       if (rule.vars < max_vars &&
 208           !spans_by_start[rule.j].empty() &&
 209           ((!rule.RuleFEndsInVariable()) || permit_adjacent_nonterminals)) {
 210         const vector<ParallelSpan>& sub_phrases = spans_by_start[rule.j];
 211         for (int it = 0; it < sub_phrases.size(); ++it) {
 212           if (sub_phrases[it].i2 - sub_phrases[it].i1 + rule.j - rule.i <= max_len) {
 213             RuleItem ev = rule;
 214             ev.Extend(sub_phrases[it]);
 215             q.push(ev);
 216             assert(ev.j <= sentence.f_len);
 217           }
 218         }
 219       }
 220     }
 221     // determine if rule is consistent
 222     if (rule.syms > 0 &&
 223         fspans.count(make_pair(rule.i,rule.j)) &&
 224         (!rule.RuleFEndsInVariable() || rule.syms > 1)) {
 225       const vector<ParallelSpan>& orig_spans = fspans[make_pair(rule.i,rule.j)];
 226       for (int s = 0; s < orig_spans.size(); ++s) {
 227         const ParallelSpan& orig_span = orig_spans[s];
 228         const WordID lhs = orig_span.cat;
 229         for (int j = orig_span.j1; j < orig_span.j2; ++j) next_e[j].first = -1;
 230         int nt_index_e = 0;
 231         for (int i = 0; i < rule.f.size(); ++i) {
 232           const ParallelSpan& cur = rule.f[i];
 233           if (cur.IsVariable())
 234             next_e[cur.j1] = pair<int,int>(cur.j2, ++nt_index_e);
 235         }
 236         cur_rhs_f.clear();
 237         cur_rhs_e.clear();
 238         cur_terminal_align.clear();
 239         cur_fs.clear();
 240         cur_es.clear();
 241
 242         const int elen = orig_span.j2 - orig_span.j1;
 243         vector<int> isvar(elen, 0);
 244         int fbias = rule.i;
 245         bool bad_rule = false;
 246         bool has_aligned_terminal = false;
 247         for (int i = 0; i < rule.f.size(); ++i) {
 248           const ParallelSpan& cur = rule.f[i];
 249           cur_rhs_f.push_back(cur.cat);
 250           if (cur.cat > 0) {   // terminal
 251             if (sentence.f_aligned[fbias + i]) has_aligned_terminal = true;
 252             cur_fs.push_back(fbias + i);
 253           } else {             // non-terminal
 254             int subj1 = cur.j1 - orig_span.j1;
 255             int subj2 = cur.j2 - orig_span.j1;
 256             if (subj1 < 0 || subj2 > elen) { bad_rule = true; break; }
 257             for (int j = subj1; j < subj2 && !bad_rule; ++j) {
 258               int& isvarj = isvar[j];
 259               isvarj = true;
 260             }
 261             if (bad_rule) break;
 262             cur_fs.push_back(-1);
 263             fbias += cur.i2 - cur.i1 - 1;
 264           }
 265         }
 266         if (require_aligned_terminal && !has_aligned_terminal) bad_rule = true;
 267         if (!bad_rule) {
 268           for (int j = orig_span.j1; j < orig_span.j2; ++j) {
 269             if (next_e[j].first < 0) {
 270               cur_rhs_e.push_back(sentence.e[j]);
 271               cur_es.push_back(j);
 272             } else {
 273               cur_rhs_e.push_back(1 - next_e[j].second);  // next_e[j].second is NT gap index
 274               cur_es.push_back(-1);
 275               j = next_e[j].first - 1;
 276             }
 277           }
 278           for (short i = 0; i < cur_fs.size(); ++i)
 279             if (cur_fs[i] >= 0)
 280               for (short j = 0; j < cur_es.size(); ++j)
 281                 if (cur_es[j] >= 0 && sentence.aligned(cur_fs[i],cur_es[j]))
 282                   cur_terminal_align.push_back(make_pair(i,j));
 283           observer->CountRule(lhs, cur_rhs_f, cur_rhs_e, cur_terminal_align);
 284         }
 285       }
 286     }
 287     q.pop();
 288   }
 289 }
 290
 291 void RuleStatistics::ParseRuleStatistics(const char* buf, int start, int end) {
 292   int ptr = start;
 293   counts.clear();
 294   aligns.clear();
 295   while (ptr < end) {
 296     SkipWhitespace(buf, &ptr);
 297     int vstart = ptr;
 298     while(ptr < end && buf[ptr] != '=') ++ptr;
 299     assert(buf[ptr] == '=');
 300     assert(ptr > vstart);
 301     if (buf[vstart] == 'A' && buf[vstart+1] == '=') {
 302       ++ptr;
 303       while (ptr < end && !IsWhitespace(buf[ptr])) {
 304         while(ptr < end && buf[ptr] == ',') { ++ptr; }
 305         assert(ptr < end);
 306         vstart = ptr;
 307         while(ptr < end && buf[ptr] != ',' && !IsWhitespace(buf[ptr])) { ++ptr; }
 308         if (ptr > vstart) {
 309           short a, b;
 310           AnnotatedParallelSentence::ReadAlignmentPoint(buf, vstart, ptr, false, &a, &b);
 311           aligns.push_back(make_pair(a,b));
 312         }
 313       }
 314     } else {
 315       int name = FD::Convert(string(buf,vstart,ptr-vstart));
 316       ++ptr;
 317       vstart = ptr;
 318       while(ptr < end && !IsWhitespace(buf[ptr])) { ++ptr; }
 319       assert(ptr > vstart);
 320       counts.set_value(name, strtod(buf + vstart, NULL));
 321     }
 322   }
 323 }
 324
 325 ostream& operator<<(ostream& os, const RuleStatistics& s) {
 326   bool needspace = false;
 327   for (SparseVector<float>::const_iterator it = s.counts.begin(); it != s.counts.end(); ++it) {
 328     if (needspace) os << ' '; else needspace = true;
 329     os << FD::Convert(it->first) << '=' << it->second;
 330   }
 331   if (s.aligns.size() > 0) {
 332     os << " A=";
 333     needspace = false;
 334     for (int i = 0; i < s.aligns.size(); ++i) {
 335       if (needspace) os << ','; else needspace = true;
 336       os << s.aligns[i].first << '-' << s.aligns[i].second;
 337     }
 338   }
 339   return os;
 340 }
 341