Reimplemented LogPtoProb (which is just an exp10 call)
[vspell.git] / utils / vspell-check.cpp
blob425896653d837866160945c29c5d148a29552d7e
1 #include <stdlib.h> // -*- tab-width:2 coding: viscii mode: c++ -*-
2 #include <stdio.h>
3 #include <string.h>
4 #include <sstream>
5 #include <fstream>
6 #include "config.h"
7 #include <spell.h>
8 #include <vspell.h>
9 #include <syllable.h>
10 #include <cgen.h>
11 #include <boost/format.hpp>
12 #include <set>
14 using namespace std;
16 class MyText : public Text
18 public:
19 MyText(VSpell* vs):Text(vs) {}
21 bool word_check();
22 bool syllable_check();
24 protected:
25 virtual bool ui_syllable_check();
26 virtual bool ui_word_check();
27 string word_to_utf8(unsigned seg_id);
30 class MyTextFactory : public TextFactory
32 public:
33 Text* create(VSpell *vs) const {
34 return new MyText(vs);
38 static MyTextFactory myfactory;
39 static VSpell vspell(myfactory);
// One "{a,b,...}" group found in a corpus line.
struct Item
{
  int pos;                              // offset of '{' in the corpus line
  int len;                              // length of the group, braces included
  std::vector<std::string> candidates;  // the comma-separated alternatives
};

typedef std::vector<Item> Items;
// One parameter combination to evaluate.
//
// Written to a stream as the five fields joined by '_' (this form is used
// to build report file names); read from a stream as five
// whitespace-separated values in the same order.
struct Pattern
{
  int trigram;
  int normalization;
  int strich_checking;  // passed to set_strict_word_checking()
  float penalty;
  float penalty2;

  // Start from all-zero settings so that a field a parser does not fill
  // (the rules file only supplies four of the five) is never read while
  // indeterminate.
  Pattern()
    : trigram(0),
      normalization(0),
      strich_checking(0),
      penalty(0),
      penalty2(0)
  {
  }

  friend std::ostream& operator << (std::ostream &os,const Pattern &p) {
    os << p.trigram
       << "_" << p.normalization
       << "_" << p.strich_checking
       << "_" << p.penalty
       << "_" << p.penalty2;
    return os;
  }

  friend std::istream& operator >> (std::istream &is,Pattern &pat) {
    is >> pat.trigram
       >> pat.normalization
       >> pat.strich_checking
       >> pat.penalty
       >> pat.penalty2;
    return is;
  }
};
67 struct Test
69 string sentence;
70 Items items;
71 vector<uint> pos;
72 vector<uint> len;
73 set<uint> error;
74 int corrects,positives,candidates;
75 bool syllable_checked,word_checked;
78 static Test* mytest;
79 static ostream *os;
80 vector<Test> tests;
81 static string prefix("pattern");
83 void check_pattern(Pattern &pat)
85 vspell.set_penalty(pat.penalty);
86 vspell.set_penalty2(pat.penalty2);
87 vspell.set_normalization(pat.normalization);
88 vspell.set_trigram(pat.trigram);
89 vspell.set_strict_word_checking(pat.strich_checking);
90 uint i_corpus,n_corpus = tests.size();
91 cerr << "Pattern " << pat;
92 ostringstream oss;
93 oss << prefix << "." << pat;
94 os = new ofstream(oss.str().c_str());
95 (*os) << "#Pattern " << pat << endl;
96 for (i_corpus = 0;i_corpus < n_corpus;i_corpus ++) {
97 mytest = &tests[i_corpus];
98 mytest->corrects = mytest->positives = mytest->candidates = 0;
99 mytest->syllable_checked = mytest->word_checked = false;
100 //(*os) << "#Sentence: " << mytest->sentence << endl;
101 vspell.check(tests[i_corpus].sentence.c_str());
102 (*os) << boost::format("%d %d %d %d %d %d") %
103 mytest->syllable_checked %
104 mytest->word_checked %
105 mytest->corrects %
106 mytest->error.size() %
107 mytest->positives %
108 mytest->candidates
109 << endl;
111 delete os;
114 void save_corpus(const char *filename)
116 ofstream corpus(filename);
118 if (!corpus.is_open()) {
119 cerr << "could not open file" << endl;
120 return;
123 uint i,n = tests.size();
124 for (i = 0;i < n;i ++) {
125 if (tests[i].error.empty())
126 corpus << endl;
127 corpus << tests[i].sentence << endl;
130 void load_corpus(const char *filename)
132 ifstream corpus(filename);
134 if (!corpus.is_open()) {
135 cerr << "could not open file" << endl;
136 return;
139 string s;
140 while (getline(corpus,s)) {
141 if (s.empty() ||s[0] == '%')
142 continue;
144 vector<Item> items;
145 string::size_type p = 0;
146 while ((p = s.find('{',p)) != string::npos) {
147 string::size_type p2 = s.find('}',p);
148 if (p2 == string::npos)
149 continue;
150 Item item;
151 item.pos = p;
152 item.len = p2-p+1;
153 string s2 = s.substr(item.pos+1,item.len-2);
154 while (!s2.empty()) {
155 p = s2.find(',');
156 if (p == string::npos)
157 p = s2.size();
158 item.candidates.push_back(s2.substr(0,p));
159 s2.erase(0,p);
160 while (!s2.empty() && s2[0] == ',')
161 s2.erase(0,1);
163 items.push_back(item);
164 p = p2;
167 CGen cg;
168 vector<uint> limits,pos;
169 int i,n = items.size();
170 limits.resize(n);
171 for (i = 0;i < n;i ++)
172 limits[i] = items[i].candidates.size();
174 cg.init(limits);
175 while (cg.step(pos)) {
176 Test test;
177 test.pos.resize(n);
178 test.len.resize(n);
179 p = 0;
180 for (i = 0;i < n;i ++) {
181 test.sentence += s.substr(p,items[i].pos-p);
182 p = items[i].pos+items[i].len;
183 test.pos[i] = test.sentence.size();
184 test.sentence += items[i].candidates[pos[i]];
185 test.len[i] = items[i].candidates[pos[i]].size();
186 if (pos[i]) test.error.insert(i);
188 test.sentence += s.substr(p);
189 test.items = items;
190 tests.push_back(test);
192 cg.done();
194 cerr << "Tests: " << tests.size() << endl;
197 int main(int argc,char **argv)
199 string s;
202 vector<Pattern> patterns;
204 ifstream rules("vspell-check.rules");
205 if (rules.is_open()) {
206 while (getline(rules,s)) {
207 if (s.empty() || s[0] == '#')
208 continue;
209 Pattern pat;
210 if (sscanf(s.c_str(),"%d %d %d %f",
211 &pat.trigram,
212 &pat.normalization,
213 &pat.strich_checking,
214 &pat.penalty) == 4)
215 patterns.push_back(pat);
216 else
217 cerr << "Error pattern " << s << endl;
220 cerr << "Patterns: " << patterns.size() << endl;
223 vspell.init();
225 uint i_pat,n_pat = patterns.size();
226 for (i_pat = 0;i_pat < n_pat;i_pat ++) {
227 Pattern &pat = patterns[i_pat];
228 check_pattern(pat);
231 Pattern pat;
232 string line;
233 while (getline(cin,line)) {
234 if (line[0] == '#')
235 continue;
236 istringstream is(line);
237 is >> s;
238 if (s == "load") {
239 string filename;
240 is >> filename;
241 load_corpus(filename.c_str());
242 } else if (s == "empty") {
243 tests.clear();
244 } else if (s == "run" ) {
245 is >> pat;
246 check_pattern(pat);
247 } else if (s == "save") {
248 string filename;
249 is >> filename;
250 save_corpus(filename.c_str());
251 } else if (s == "prefix") {
252 is >> prefix;
253 } else
254 cerr << "unknown command:" << s << " ";
255 cerr << "done" << endl;
// Scores the syllable-level suggestions for the current test (mytest).
//
// For every entry of `suggestions` the byte span of the reported syllable
// is converted to UTF-8 offsets with utf8_pos() and compared with the
// recorded items: a report matches item ii when ii is in mytest->error and
// both the offset and the length are equal.
//  - no matching item: mytest->positives is incremented and the span plus
//    the pos-len of every item are written to *os as '#' lines;
//  - matching item: a candidate list is built and mytest->corrects is
//    incremented.
// Finally mytest->syllable_checked is set and true is returned.
//
// NOTE(review): brace-only and other very short lines are absent from this
// listing, so the nesting described above is inferred from the visible
// statements (in particular that `continue` closes the no-match branch) --
// confirm against the original file.
259 bool MyText::ui_syllable_check()
261 unsigned i,n = suggestions.size();
262 unsigned ii,nn;
263 for (i = 0;i < n;i ++) {
// Byte span of the reported syllable: start offset and length of its text.
264 int from,len;
265 from = st[suggestions[i].id].start;
266 len = strlen(get_ngram()[st[suggestions[i].id].id]);
// The same span expressed through utf8_pos(), to compare with mytest->pos/len.
267 int utf8_from,utf8_len;
268 utf8_from = utf8_pos(from);
269 utf8_len = utf8_pos(from+len)-utf8_from;
// Look for an item marked as an error whose span is exactly this one;
// ii == nn afterwards means none was found.
270 nn = mytest->items.size();
271 for (ii = 0;ii < nn;ii ++)
272 if (mytest->error.find(ii) != mytest->error.end() &&
273 utf8_from == mytest->pos[ii] &&
274 utf8_len == mytest->len[ii])
275 break;
276 if (ii == nn) {
277 mytest->positives ++;
279 (*os) << "#Syllable " << utf8_from << "-" << utf8_len
280 << "(" << from << "-" << len << ")" << endl;
281 for (ii = 0;ii < nn;ii ++)
282 (*os) << "# " << mytest->pos[ii] << "-" << mytest->len[ii] << endl;
284 continue;
// The candidate list is filled here but not read again in the visible lines.
288 vector<string> candidates;
289 Candidates c;
290 candidates_reset();
291 get_syllable_candidates(get_ngram()[st[suggestions[i].id].id],c);
292 c.get_list(candidates);
294 mytest->corrects ++;
296 mytest->syllable_checked = true;
297 return true;
// Scores the word-level suggestions for the current test (mytest).
//
// For every entry of `suggestions` the span from the first to the last
// syllable of the reported word is converted to UTF-8 offsets with
// utf8_pos().  A report matches item ii when ii is in mytest->error and the
// item's span lies inside the reported span.
//  - no matching item: mytest->positives is incremented and the span is
//    written to *os as a "#Word" line;
//  - matching item: the expected text is built by taking the reported part
//    of the sentence and replacing the item's text with its first
//    candidate.  If it equals word_to_utf8() of the suggestion,
//    mytest->corrects is incremented; otherwise mytest->candidates is
//    incremented and a "#Word2" line is written.
// Finally mytest->word_checked is set and true is returned.
//
// NOTE(review): brace-only and other very short lines are absent from this
// listing, so the nesting described above is inferred from the visible
// statements -- confirm against the original file.
300 bool MyText::ui_word_check()
302 unsigned i,n = suggestions.size();
303 int pos,pos2,count;
304 unsigned ii,nn;
306 for (i = 0;i < n;i ++) {
307 // query
// pos and pos2 index the first and the last syllable of the reported word.
308 count = seg[suggestions[i].id].node->get_syllable_count();
309 pos = (*seg.we)[seg[suggestions[i].id].id].pos;
310 pos2 = pos+count-1;
// Byte span from the start of the first syllable to the end of the last one.
311 int from,len;
312 from = st[pos].start;
313 len = st[pos2].start+strlen(get_ngram()[st[pos2].id])-from;
314 int utf8_from,utf8_len;
315 utf8_from = utf8_pos(from);
316 utf8_len = utf8_pos(from+len)-utf8_from;
// Containment test (not equality): the reported span must cover the item.
// ii == nn afterwards means no item marked as an error is covered.
317 nn = mytest->items.size();
318 for (ii = 0;ii < nn;ii ++)
319 if (mytest->error.find(ii) != mytest->error.end() &&
320 utf8_from <= mytest->pos[ii] &&
321 utf8_from + utf8_len >= mytest->pos[ii] + mytest->len[ii])
322 break;
323 if (ii == nn) {
324 mytest->positives ++;
326 (*os) << "#Word " << utf8_from << "-" << utf8_len
327 << "(" << from << "-" << len << ")"
328 << endl;
330 continue;
// Expected text: the reported part of the sentence with the item's text
// replaced by candidates[0].
333 string s = mytest->sentence.substr(utf8_from,utf8_len);
334 s.replace(mytest->pos[ii]-utf8_from,mytest->len[ii],mytest->items[ii].candidates[0]);
336 if (s == word_to_utf8(suggestions[i].id).c_str())
337 mytest->corrects ++;
338 else {
339 mytest->candidates ++;
340 (*os) << "# " << mytest->sentence << endl;
// NOTE(review): the format string has seven placeholders but only six
// operands are visible, and the line before `<< endl` ends in '%' -- an
// operand line appears to be missing from this listing; confirm against
// the original file.
341 (*os) << boost::format("#Word2 %d-%d(%d-%d) %s-%s (%s)" ) %
342 utf8_from %
343 utf8_len %
344 from %
345 len %
346 mytest->items[ii].candidates[0] %
347 word_to_utf8(suggestions[i].id) %
349 << endl;
352 mytest->word_checked = true;
353 return true;
356 bool MyText::word_check()
358 bool ret = Text::word_check();
359 return ret;
362 bool MyText::syllable_check()
364 bool ret = Text::syllable_check();
365 return ret;
368 string MyText::word_to_utf8(unsigned seg_id)
370 vector<strid> sylls;
371 string s;
372 seg[seg_id].node->get_syllables(sylls);
373 int i,n = sylls.size();
374 for (i = 0;i < n;i ++) {
375 if (i)
376 s += " ";
377 Syllable syll;
378 syll.parse(get_ngram()[sylls[i]]);
379 s += viet_to_utf8(syll.to_str().c_str());
381 return s;