6 #include <tr1/unordered_map>
9 #include <boost/functional/hash.hpp>
11 #include "sentence_pair.h"
20 inline bool IsWhitespace(char c
) { return c
== ' ' || c
== '\t'; }
22 inline void SkipWhitespace(const char* buf
, int* ptr
) {
23 while (buf
[*ptr
] && IsWhitespace(buf
[*ptr
])) { ++(*ptr
); }
27 Extract::RuleObserver::~RuleObserver() {
28 cerr
<< "Rules extracted: " << count
<< endl
;
31 void Extract::ExtractBasePhrases(const int max_base_phrase_size
,
32 const AnnotatedParallelSentence
& sentence
,
33 vector
<ParallelSpan
>* phrases
) {
36 vector
<pair
<int,int> > f_spans(sentence
.f_len
, pair
<int,int>(sentence
.e_len
, 0));
37 vector
<pair
<int,int> > e_spans(sentence
.e_len
, pair
<int,int>(sentence
.f_len
, 0));
38 // for each alignment point in e, precompute the minimal consistent phrases in f
39 // for each alignment point in f, precompute the minimal consistent phrases in e
40 for (int i
= 0; i
< sentence
.f_len
; ++i
) {
41 for (int j
= 0; j
< sentence
.e_len
; ++j
) {
42 if (sentence
.aligned(i
,j
)) {
43 if (j
< f_spans
[i
].first
) f_spans
[i
].first
= j
;
44 f_spans
[i
].second
= j
+1;
45 if (i
< e_spans
[j
].first
) e_spans
[j
].first
= i
;
46 e_spans
[j
].second
= i
+1;
51 for (int i1
= 0; i1
< sentence
.f_len
; ++i1
) {
52 if (sentence
.f_aligned
[i1
] == 0) continue;
53 int j1
= sentence
.e_len
;
55 const int i_limit
= min(sentence
.f_len
, i1
+ max_base_phrase_size
);
56 for (int i2
= i1
+ 1; i2
<= i_limit
; ++i2
) {
57 if (sentence
.f_aligned
[i2
-1] == 0) continue;
58 // cerr << "F has aligned span " << i1 << " to " << i2 << endl;
59 j1
= min(j1
, f_spans
[i2
-1].first
);
60 j2
= max(j2
, f_spans
[i2
-1].second
);
61 if (j1
>= j2
) continue;
62 if (j2
- j1
> max_base_phrase_size
) continue;
64 for (int j
= j1
; j
< j2
; ++j
) {
65 if (e_spans
[j
].first
< i1
) { condition
= 1; break; }
66 if (e_spans
[j
].second
> i2
) { condition
= 2; break; }
68 if (condition
== 1) break;
69 if (condition
== 2) continue;
70 // category types added later!
71 phrases
->push_back(ParallelSpan(i1
, i2
, j1
, j2
));
72 // cerr << i1 << " " << i2 << " : " << j1 << " " << j2 << endl;
77 void Extract::LoosenPhraseBounds(const AnnotatedParallelSentence
& sentence
,
78 const int max_base_phrase_size
,
79 vector
<ParallelSpan
>* phrases
) {
80 const int num_phrases
= phrases
->size();
81 map
<int, map
<int, map
<int, map
<int, bool> > > > marker
;
82 for (int i
= 0; i
< num_phrases
; ++i
) {
83 const ParallelSpan
& cur
= (*phrases
)[i
];
84 marker
[cur
.i1
][cur
.i2
][cur
.j1
][cur
.j2
] = true;
86 for (int i
= 0; i
< num_phrases
; ++i
) {
87 const ParallelSpan
& cur
= (*phrases
)[i
];
88 const int i1_max
= cur
.i1
;
89 const int i2_min
= cur
.i2
;
90 const int j1_max
= cur
.j1
;
91 const int j2_min
= cur
.j2
;
93 while (i1_min
> 0 && sentence
.f_aligned
[i1_min
-1] == 0) { --i1_min
; }
95 while (j1_min
> 0 && sentence
.e_aligned
[j1_min
-1] == 0) { --j1_min
; }
97 while (i2_max
< sentence
.f_len
&& sentence
.f_aligned
[i2_max
] == 0) { ++i2_max
; }
99 while (j2_max
< sentence
.e_len
&& sentence
.e_aligned
[j2_max
] == 0) { ++j2_max
; }
100 for (int i1
= i1_min
; i1
<= i1_max
; ++i1
) {
101 const int ilim
= min(i2_max
, i1
+ max_base_phrase_size
);
102 for (int i2
= max(i1
+1,i2_min
); i2
<= ilim
; ++i2
) {
103 for (int j1
= j1_min
; j1
<= j1_max
; ++j1
) {
104 const int jlim
= min(j2_max
, j1
+ max_base_phrase_size
);
105 for (int j2
= max(j1
+1, j2_min
); j2
<= jlim
; ++j2
) {
106 bool& seen
= marker
[i1
][i2
][j1
][j2
];
108 phrases
->push_back(ParallelSpan(i1
,i2
,j1
,j2
));
117 // this uses the TARGET span (i,j) to annotate phrases, will copy
118 // phrases if there is more than one annotation.
119 // TODO: support source annotation
120 void Extract::AnnotatePhrasesWithCategoryTypes(const WordID default_cat
,
121 const Array2D
<vector
<WordID
> >& types
,
122 vector
<ParallelSpan
>* phrases
) {
123 const int num_unannotated_phrases
= phrases
->size();
124 // have to use num_unannotated_phrases since we may grow the vector
125 for (int i
= 0; i
< num_unannotated_phrases
; ++i
) {
126 ParallelSpan
& phrase
= (*phrases
)[i
];
127 const vector
<WordID
>* pcats
= &types(phrase
.j1
, phrase
.j2
);
128 if (pcats
->empty() && default_cat
!= 0) {
129 static vector
<WordID
> s_default(1, default_cat
);
132 if (pcats
->empty()) {
133 cerr
<< "ERROR span " << phrase
.i1
<< "," << phrase
.i2
<< "-"
134 << phrase
.j1
<< "," << phrase
.j2
<< " has no type. "
135 "Did you forget --default_category?\n";
137 const vector
<WordID
>& cats
= *pcats
;
138 phrase
.cat
= cats
[0];
139 for (int ci
= 1; ci
< cats
.size(); ++ci
) {
140 ParallelSpan new_phrase
= phrase
;
141 new_phrase
.cat
= cats
[ci
];
142 phrases
->push_back(new_phrase
);
147 // a partially complete (f-side) of a rule
149 vector
<ParallelSpan
> f
;
151 explicit RuleItem(int pi
) : i(pi
), j(pi
), syms(), vars() {}
152 void Extend(const WordID
& fword
) {
153 f
.push_back(ParallelSpan(fword
));
157 void Extend(const ParallelSpan
& subphrase
) {
158 f
.push_back(subphrase
);
159 j
+= subphrase
.i2
- subphrase
.i1
;
163 bool RuleFEndsInVariable() const {
165 return f
.back().IsVariable();
166 } else { return false; }
170 void Extract::ExtractConsistentRules(const AnnotatedParallelSentence
& sentence
,
171 const vector
<ParallelSpan
>& phrases
,
174 const bool permit_adjacent_nonterminals
,
175 const bool require_aligned_terminal
,
176 RuleObserver
* observer
) {
177 queue
<RuleItem
> q
; // agenda for BFS
179 unordered_map
<pair
<short, short>, vector
<ParallelSpan
>, boost::hash
<pair
<short, short> > > fspans
;
180 vector
<vector
<ParallelSpan
> > spans_by_start(sentence
.f_len
);
182 for (int i
= 0; i
< phrases
.size(); ++i
) {
183 fspans
[make_pair(phrases
[i
].i1
,phrases
[i
].i2
)].push_back(phrases
[i
]);
184 max_len
= max(max_len
, phrases
[i
].i2
- phrases
[i
].i1
);
185 // have we already added a rule item starting at phrases[i].i1?
186 if (starts
.insert(phrases
[i
].i1
).second
)
187 q
.push(RuleItem(phrases
[i
].i1
));
188 spans_by_start
[phrases
[i
].i1
].push_back(phrases
[i
]);
191 vector
<pair
<int,int> > next_e(sentence
.e_len
);
192 vector
<WordID
> cur_rhs_f
, cur_rhs_e
;
193 vector
<pair
<short, short> > cur_terminal_align
;
194 vector
<int> cur_es
, cur_fs
;
196 const RuleItem
& rule
= q
.front();
198 // extend the partial rule
199 if (rule
.j
< sentence
.f_len
&& (rule
.j
- rule
.i
) < max_len
&& rule
.syms
< max_syms
) {
202 // extend with a word
203 ew
.Extend(sentence
.f
[ew
.j
]);
207 if (rule
.vars
< max_vars
&&
208 !spans_by_start
[rule
.j
].empty() &&
209 ((!rule
.RuleFEndsInVariable()) || permit_adjacent_nonterminals
)) {
210 const vector
<ParallelSpan
>& sub_phrases
= spans_by_start
[rule
.j
];
211 for (int it
= 0; it
< sub_phrases
.size(); ++it
) {
212 if (sub_phrases
[it
].i2
- sub_phrases
[it
].i1
+ rule
.j
- rule
.i
<= max_len
) {
214 ev
.Extend(sub_phrases
[it
]);
216 assert(ev
.j
<= sentence
.f_len
);
221 // determine if rule is consistent
223 fspans
.count(make_pair(rule
.i
,rule
.j
)) &&
224 (!rule
.RuleFEndsInVariable() || rule
.syms
> 1)) {
225 const vector
<ParallelSpan
>& orig_spans
= fspans
[make_pair(rule
.i
,rule
.j
)];
226 for (int s
= 0; s
< orig_spans
.size(); ++s
) {
227 const ParallelSpan
& orig_span
= orig_spans
[s
];
228 const WordID lhs
= orig_span
.cat
;
229 for (int j
= orig_span
.j1
; j
< orig_span
.j2
; ++j
) next_e
[j
].first
= -1;
231 for (int i
= 0; i
< rule
.f
.size(); ++i
) {
232 const ParallelSpan
& cur
= rule
.f
[i
];
233 if (cur
.IsVariable())
234 next_e
[cur
.j1
] = pair
<int,int>(cur
.j2
, ++nt_index_e
);
238 cur_terminal_align
.clear();
242 const int elen
= orig_span
.j2
- orig_span
.j1
;
243 vector
<int> isvar(elen
, 0);
245 bool bad_rule
= false;
246 bool has_aligned_terminal
= false;
247 for (int i
= 0; i
< rule
.f
.size(); ++i
) {
248 const ParallelSpan
& cur
= rule
.f
[i
];
249 cur_rhs_f
.push_back(cur
.cat
);
250 if (cur
.cat
> 0) { // terminal
251 if (sentence
.f_aligned
[fbias
+ i
]) has_aligned_terminal
= true;
252 cur_fs
.push_back(fbias
+ i
);
253 } else { // non-terminal
254 int subj1
= cur
.j1
- orig_span
.j1
;
255 int subj2
= cur
.j2
- orig_span
.j1
;
256 if (subj1
< 0 || subj2
> elen
) { bad_rule
= true; break; }
257 for (int j
= subj1
; j
< subj2
&& !bad_rule
; ++j
) {
258 int& isvarj
= isvar
[j
];
262 cur_fs
.push_back(-1);
263 fbias
+= cur
.i2
- cur
.i1
- 1;
266 if (require_aligned_terminal
&& !has_aligned_terminal
) bad_rule
= true;
268 for (int j
= orig_span
.j1
; j
< orig_span
.j2
; ++j
) {
269 if (next_e
[j
].first
< 0) {
270 cur_rhs_e
.push_back(sentence
.e
[j
]);
273 cur_rhs_e
.push_back(1 - next_e
[j
].second
); // next_e[j].second is NT gap index
274 cur_es
.push_back(-1);
275 j
= next_e
[j
].first
- 1;
278 for (short i
= 0; i
< cur_fs
.size(); ++i
)
280 for (short j
= 0; j
< cur_es
.size(); ++j
)
281 if (cur_es
[j
] >= 0 && sentence
.aligned(cur_fs
[i
],cur_es
[j
]))
282 cur_terminal_align
.push_back(make_pair(i
,j
));
283 observer
->CountRule(lhs
, cur_rhs_f
, cur_rhs_e
, cur_terminal_align
);
291 void RuleStatistics::ParseRuleStatistics(const char* buf
, int start
, int end
) {
296 SkipWhitespace(buf
, &ptr
);
298 while(ptr
< end
&& buf
[ptr
] != '=') ++ptr
;
299 assert(buf
[ptr
] == '=');
300 assert(ptr
> vstart
);
301 if (buf
[vstart
] == 'A' && buf
[vstart
+1] == '=') {
303 while (ptr
< end
&& !IsWhitespace(buf
[ptr
])) {
304 while(ptr
< end
&& buf
[ptr
] == ',') { ++ptr
; }
307 while(ptr
< end
&& buf
[ptr
] != ',' && !IsWhitespace(buf
[ptr
])) { ++ptr
; }
310 AnnotatedParallelSentence::ReadAlignmentPoint(buf
, vstart
, ptr
, false, &a
, &b
);
311 aligns
.push_back(make_pair(a
,b
));
315 int name
= FD::Convert(string(buf
,vstart
,ptr
-vstart
));
318 while(ptr
< end
&& !IsWhitespace(buf
[ptr
])) { ++ptr
; }
319 assert(ptr
> vstart
);
320 counts
.set_value(name
, strtod(buf
+ vstart
, NULL
));
325 ostream
& operator<<(ostream
& os
, const RuleStatistics
& s
) {
326 bool needspace
= false;
327 for (SparseVector
<float>::const_iterator it
= s
.counts
.begin(); it
!= s
.counts
.end(); ++it
) {
328 if (needspace
) os
<< ' '; else needspace
= true;
329 os
<< FD::Convert(it
->first
) << '=' << it
->second
;
331 if (s
.aligns
.size() > 0) {
334 for (int i
= 0; i
< s
.aligns
.size(); ++i
) {
335 if (needspace
) os
<< ','; else needspace
= true;
336 os
<< s
.aligns
[i
].first
<< '-' << s
.aligns
[i
].second
;