test
[ws10smt.git] / extools / build_lexical_translation.cc
blobf609f56a38b5c36803d695bd547063049fde048f
1 /*
2 * Build a lexical translation table from an alignment file, for use as lexical translation probabilities when scoring a grammar
4 * Ported largely from the train-factored-phrase-model.perl script by Philipp Koehn
5 */
6 #include <iostream>
7 #include <string>
8 #include <map>
9 #include <vector>
10 #include <utility>
11 #include <cstdlib>
12 #include <fstream>
13 #include <tr1/unordered_map>
15 #include "sentence_pair.h"
16 #include "extract.h"
17 #include "fdict.h"
18 #include "tdict.h"
20 #include <boost/functional/hash.hpp>
21 #include <boost/program_options.hpp>
22 #include <boost/program_options/variables_map.hpp>
24 using namespace std;
25 using namespace std::tr1;
// Size, in bytes, of the buffer main() allocates to hold one input line read from stdin.
27 static const size_t MAX_LINE_LENGTH = 64000000;
29 int main(int argc, char* argv[]){
31 bool DEBUG = false;
33 map <WordID, map<WordID, int> > word_translation;
34 map <WordID, int> total_foreign;
35 map <WordID, int> total_english;
37 AnnotatedParallelSentence sent;
38 char* buf = new char[MAX_LINE_LENGTH];
39 while(cin)
41 cin.getline(buf, MAX_LINE_LENGTH);
42 if (buf[0] == 0) continue;
44 sent.ParseInputLine(buf);
46 map <WordID, int> foreign_aligned;
47 map <WordID, int> english_aligned;
49 //iterate over the alignment to compute aligned words
51 for(int i =0;i<sent.aligned.width();i++)
53 for (int j=0;j<sent.aligned.height();j++)
55 if (DEBUG) cout << sent.aligned(i,j) << " ";
56 if( sent.aligned(i,j))
58 if (DEBUG) cout << TD::Convert(sent.f[i]) << " aligned to " << TD::Convert(sent.e[j]);
59 //local counts
60 ++foreign_aligned[sent.f[i]];
61 ++english_aligned[sent.e[j]];
63 //global counts
64 ++word_translation[sent.f[i]][sent.e[j]];
65 ++total_foreign[sent.f[i]];
66 ++total_english[sent.e[j]];
69 if (DEBUG) cout << endl;
71 if (DEBUG) cout << endl;
73 static const WordID NULL_ = TD::Convert("NULL");
74 //handle unaligned words - align them to null
75 map<WordID, int>& nullcounts = word_translation[NULL_];
76 for (int j =0; j < sent.e_len; j++)
78 if (english_aligned.count(sent.e[j])) continue;
79 ++nullcounts[sent.e[j]];
80 ++total_foreign[NULL_];
81 ++total_english[sent.e[j]];
84 for (int i =0; i < sent.f_len; i++)
86 if (foreign_aligned.count(sent.f[i])) continue;
87 ++word_translation[sent.f[i]][NULL_];
88 ++total_english[NULL_];
89 ++total_foreign[sent.f[i]];
94 for(map < WordID, map<WordID,int> >::iterator it = word_translation.begin(); it != word_translation.end(); ++it)
96 const map<WordID, int>& trans = it->second;
97 for (map<WordID,int>::const_iterator iit = trans.begin(); iit != trans.end(); ++iit) {
98 cout << TD::Convert(it->first) << "," << TD::Convert(iit->first) << "=" << iit->second << "/" << total_foreign[it->first] << endl;
103 return 0;