2 * Build lexical translation table from alignment file to use for lexical translation probabilities when scoring a grammar
4 * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn
13 #include <tr1/unordered_map>
15 #include "sentence_pair.h"
20 #include <boost/functional/hash.hpp>
21 #include <boost/program_options.hpp>
22 #include <boost/program_options/variables_map.hpp>
25 using namespace std::tr1
;
// Size, in chars, of the buffer main() allocates for reading one input line;
// it is also the limit passed to cin.getline().
static const size_t MAX_LINE_LENGTH = 64 * 1000 * 1000;
29 int main(int argc
, char* argv
[]){
33 map
<WordID
, map
<WordID
, int> > word_translation
;
34 map
<WordID
, int> total_foreign
;
35 map
<WordID
, int> total_english
;
37 AnnotatedParallelSentence sent
;
38 char* buf
= new char[MAX_LINE_LENGTH
];
41 cin
.getline(buf
, MAX_LINE_LENGTH
);
42 if (buf
[0] == 0) continue;
44 sent
.ParseInputLine(buf
);
46 map
<WordID
, int> foreign_aligned
;
47 map
<WordID
, int> english_aligned
;
49 //iterate over the alignment to compute aligned words
51 for(int i
=0;i
<sent
.aligned
.width();i
++)
53 for (int j
=0;j
<sent
.aligned
.height();j
++)
55 if (DEBUG
) cout
<< sent
.aligned(i
,j
) << " ";
56 if( sent
.aligned(i
,j
))
58 if (DEBUG
) cout
<< TD::Convert(sent
.f
[i
]) << " aligned to " << TD::Convert(sent
.e
[j
]);
60 ++foreign_aligned
[sent
.f
[i
]];
61 ++english_aligned
[sent
.e
[j
]];
64 ++word_translation
[sent
.f
[i
]][sent
.e
[j
]];
65 ++total_foreign
[sent
.f
[i
]];
66 ++total_english
[sent
.e
[j
]];
69 if (DEBUG
) cout
<< endl
;
71 if (DEBUG
) cout
<< endl
;
73 static const WordID NULL_
= TD::Convert("NULL");
74 //handle unaligned words - align them to null
75 map
<WordID
, int>& nullcounts
= word_translation
[NULL_
];
76 for (int j
=0; j
< sent
.e_len
; j
++)
78 if (english_aligned
.count(sent
.e
[j
])) continue;
79 ++nullcounts
[sent
.e
[j
]];
80 ++total_foreign
[NULL_
];
81 ++total_english
[sent
.e
[j
]];
84 for (int i
=0; i
< sent
.f_len
; i
++)
86 if (foreign_aligned
.count(sent
.f
[i
])) continue;
87 ++word_translation
[sent
.f
[i
]][NULL_
];
88 ++total_english
[NULL_
];
89 ++total_foreign
[sent
.f
[i
]];
94 for(map
< WordID
, map
<WordID
,int> >::iterator it
= word_translation
.begin(); it
!= word_translation
.end(); ++it
)
96 const map
<WordID
, int>& trans
= it
->second
;
97 for (map
<WordID
,int>::const_iterator iit
= trans
.begin(); iit
!= trans
.end(); ++iit
) {
98 cout
<< TD::Convert(it
->first
) << "," << TD::Convert(iit
->first
) << "=" << iit
->second
<< "/" << total_foreign
[it
->first
] << endl
;