Renamed *Node* classes to *NNode* ones
[vspell.git] / tests / all-words-test.cpp
blobfc0a46011180d8a54ec17b4fef3702b6f69a6c2a
1 #include <fstream>
2 #include <iostream>
3 #include <algorithm>
4 #include <iterator>
5 #include <set>
6 #include "vspell.h"
7 #include "syllable.h"
8 #include "propername.h"
10 using namespace std;
12 void apply_separators(const Sentence &st,set<WordEntry> &wes,vector<unsigned> &seps)
14 sort(seps.begin(),seps.end());
15 //copy(seps1.begin(),seps1.end(),inserter(seps,seps.begin()));
16 int sep = 0,offset=0;
17 int i,n = st.get_syllable_count();
19 for (i = 0;i < n-1 && sep < seps.size();i ++) {
20 int p = offset+st[i].start+strlen(get_sarch()[st[i].get_id()]);
21 if (p <= seps[sep] && seps[sep] <= offset+st[i+1].start) {
22 apply_separator(wes,i);
23 sep ++;
28 void lattice_to_dot(ostream &os,Lattice &w2,bool spare,bool has_seps,bool edge_value)
30 uint i,n;
31 const Sentence &st = *w2.st;
32 WordEntries &wes = *w2.we;
33 n = wes.size();
34 os << "digraph wordlattice {" << endl;
35 os << "\trankdir=LR;" << endl;
36 os << "\tstyle=invis;" << endl;
37 os << "\thead;" << endl;
38 os << "\ttail;" << endl;
39 //set<strid> nodes;
40 int old_pos = -1;
41 int cc;
42 int anchor[st.get_syllable_count()];
43 for (i = 0;i < n;i ++) {
44 //if (nodes.find(wes[i].node.node->get_id()) == nodes.end()) {
45 //nodes.insert(wes[i].node.node->get_id());
46 if (wes[i].pos != old_pos) {
47 if (wes[i].pos) {
48 os << "\t}" << endl;
50 os << "\tsubgraph cluster_" << wes[i].pos << " {" << endl;
51 old_pos = wes[i].pos;
52 cc = 0;
55 if (spare && cc++ == w2.get_we(wes[i].pos).size()/2)
56 //os << "\tanchor_" << wes[i].pos << " [shape=\"point\"];" << endl;
57 anchor[wes[i].pos] = i;
59 os << "\tn" << i << " [label=\"";
60 std::vector<strid> syll;
61 if (wes[i].node.node) {
62 wes[i].node.node->get_syllables(syll);
63 for (std::vector<strid>::size_type ii = 0;ii < syll.size();ii ++) {
64 if (i)
65 os << " ";
66 Syllable sy;
67 if (sy.parse(get_sarch()[syll[ii]]))
68 os << sy.to_str();
69 else
70 os << get_sarch()[syll[ii]];
72 } else
73 os << "UNK";
74 os << "\"];" << endl;
76 //}
78 os << "\t}" << endl; // end of the last cluster
80 if (spare)
81 for (i = 0;i < st.get_syllable_count()-1;i ++) {
82 //os << "anchor_" << i << " -> anchor_" << (i+1) << " [style=invis, weight=10000];" << endl;
83 os << "n" << anchor[i] << " -> n" << anchor[i+1] << " [style=invis, weight=10000];" << endl;
86 VocabIndex vi[2];
87 vi[1] = Vocab_None;
88 float val;
89 int ii,nn;
90 for (i = 0;i < n;i ++) {
91 WordEntry &we = wes[i];
93 if (we.pos == 0) {
94 if (edge_value) {
95 vi[0] = get_id(START_ID);
96 val = -get_ngram().wordProb(we.node.node->get_id(),vi);
97 os << "\thead -> n" << we.id << " [ label=\"" << val << "\"];" << endl;
98 } else
99 os << "\thead -> n" << we.id << ";" << endl;
101 if (we.pos+we.len >= w2.get_word_count()) {
102 if (edge_value) {
103 vi[0] = we.node.node->get_id();
104 val = -get_ngram().wordProb(get_id(STOP_ID),vi);
105 os << "\tn" << we.id << " -> tail [ label=\"" << val << "\"];" << endl;
106 } else
107 os << "\tn" << we.id << " -> tail;" << endl;
108 } else {
109 if (spare)
110 os << "\tn" << we.id << " -> n" << anchor[(we.pos+we.len)] << ";" << endl;
111 else {
112 const WordEntryRefs &wers = w2.get_we(we.pos+we.len);
113 nn = wers.size();
114 for (ii = 0;ii < nn; ii ++) {
115 if (edge_value) {
116 vi[0] = we.node.node->get_id();
117 val = -get_ngram().wordProb(wers[ii]->node.node->get_id(),vi);
118 os << "\tn" << we.id << " -> n" << wers[ii]->id << " [label=\"" << val << "\"];" << endl;
119 } else
120 os << "\tn" << we.id << " -> n" << wers[ii]->id << ";" << endl;
126 os << "}" << endl;
129 void total_combinations(ostream &os,Lattice &w)
131 WordEntries &wes = *w.we;
132 unsigned long long nn = wes.size();
133 vector<unsigned long long> val(nn);
135 vector<vector<uint> > prev;
136 int i,n = w.get_word_count(),v,vv;
137 float sum = 0;
139 prev.resize(nn);
141 for (i = 0;i < n;i ++) {
142 const WordEntryRefs &wers = w.get_we(i);
143 int ii,nn = wers.size();
144 for (ii = 0;ii < nn;ii ++) {
145 // wers[ii] is the first node (W).
146 v = wers[ii]->id;
147 if (i == 0)
148 val[v] = 1;
149 int next = wers[ii]->pos+wers[ii]->len;
150 if (next < n) {
151 const WordEntryRefs &wers2 = w.get_we(next);
152 int iii,nnn = wers2.size();
153 for (iii = 0;iii < nnn;iii ++) {
154 //wers2[iii] is the second node (W).
155 vv = wers2[iii]->id;
156 prev[vv].push_back(v);
162 unsigned long long final_val = 0;
163 for (i = 0;i < n;i ++) {
164 const WordEntryRefs &wers = w.get_we(i);
165 int ii,nn = wers.size();
166 for (ii = 0;ii < nn;ii ++) {
167 // wers[ii] is the first node (W).
168 v = wers[ii]->id;
169 int iii,nnn = prev[v].size();
170 for (iii = 0;iii < nnn;iii ++) {
171 os << v << "(" << val[v] << ") <- " << prev[v][iii] << "(" << val[prev[v][iii]] << ")" << endl;
172 val[v] += val[prev[v][iii]];
174 if (wers[ii]->pos+wers[ii]->len == w.get_word_count()) {
175 final_val += val[v];
176 os << "Final: " << final_val << endl;
180 cout << final_val << endl;
183 int main(int argc,char **argv)
185 //WFST wfst;
186 bool fuzzy = true;
187 bool dot = false;
188 bool spare = false;
189 bool has_seps = false;
190 bool edge_value = false;
191 bool total_comb = false;
193 int i,n;
194 vector<unsigned> seps;
196 for (i = 1;i < argc;i ++) {
197 if (!strcmp(argv[i],"nofuzzy")) fuzzy = false;
198 if (!strcmp(argv[i],"dot")) dot = true;
199 if (!strcmp(argv[i],"spare")) spare = true;
200 if (!strcmp(argv[i],"seps")) has_seps = true;
201 if (!strcmp(argv[i],"edgeval")) edge_value = true;
202 if (!strcmp(argv[i],"total_comb")) total_comb = true;
205 dic_init();
207 cerr << "Loading... ";
208 warch.load("wordlist");
209 File f("ngram","rt",0);
210 if (!f.error())
211 get_ngram().read(f);
212 cerr << "done" << endl;
214 get_sarch().set_blocked(true);
216 //wfst.set_wordlist(get_root());
218 string s;
219 while (getline(cin,s)) {
220 if (s.empty()) continue;
222 if (has_seps) {
223 string::size_type p;
224 while ((p = s.find('|')) != string::npos) {
225 seps.push_back(p);
226 s.erase(p,1);
230 Sentence st(s);
231 st.standardize();
232 st.tokenize();
233 Lattice words,w2;
234 set<WordEntry> wes;
235 WordStateFactories factories;
236 ExactWordStateFactory exact;
237 LowerWordStateFactory lower;
238 FuzzyWordStateFactory ffuzzy;
239 factories.push_back(&exact);
240 factories.push_back(&lower);
241 if (fuzzy)
242 factories.push_back(&ffuzzy);
243 w2.pre_construct(st,wes,factories);
244 mark_proper_name(st,wes);
245 if (has_seps)
246 apply_separators(st,wes,seps);
247 w2.post_construct(wes);
248 //w2.based_on(words);
249 if (total_comb)
250 total_combinations(cout,w2);
251 else {
252 if (!dot)
253 cout << w2;
254 else
255 lattice_to_dot(cout,w2,spare,has_seps,edge_value);
257 get_sarch().clear_rest();
260 return 0;