Rewrite train for easier maintenance
[vspell.git] / utils / vspell-check.cpp
blob29131171930a5d21470398470a6eb05f8947668c
1 #include <stdlib.h> // -*- tab-width:2 coding: viscii mode: c++ -*-
2 #include <stdio.h>
3 #include <string.h>
4 #include <sstream>
5 #include <fstream>
6 #include "config.h"
7 #include <spell.h>
8 #include <vspell.h>
9 #include <syllable.h>
10 #include <cgen.h>
11 #include <boost/format.hpp>
12 #include <set>
14 using namespace std;
16 class MyText : public Text
18 public:
19 MyText(VSpell* vs):Text(vs) {}
21 bool word_check();
22 bool syllable_check();
24 protected:
25 virtual bool ui_syllable_check();
26 virtual bool ui_word_check();
27 string word_to_utf8(unsigned seg_id);
30 class MyTextFactory : public TextFactory
32 public:
33 Text* create(VSpell *vs) const {
34 return new MyText(vs);
// File-local factory and the spell checker built on top of it. The factory's
// create() returns MyText objects.
// NOTE(review): presumably VSpell calls the factory for every text it checks,
// which is how the ui_* hooks of MyText get invoked - confirm in vspell.h.
38 static MyTextFactory myfactory;
39 static VSpell vspell(myfactory);
/// One `{a,b,...}` group found in a corpus line by load_corpus().
struct Item
{
  /// Index of the opening '{' in the corpus line.
  int pos;
  /// Length of the group in the corpus line, both braces included.
  int len;
  /// The comma-separated alternatives, in the order they were written.
  vector<string> candidates;
};

/// All groups of one corpus line, in order of appearance.
typedef vector<Item> Items;
/// One parameter set applied to the spell checker by check_pattern().
struct Pattern
{
  int trigram,strich_checking;
  float penalty,penalty2;

  /// Start from all-zero settings so that a Pattern whose extraction
  /// failed never carries indeterminate values into check_pattern().
  Pattern() : trigram(0), strich_checking(0), penalty(0), penalty2(0) {}

  /// Writes the four fields joined by '_'. check_pattern() uses this form
  /// both in its report and as the suffix of the report file name.
  friend ostream& operator << (ostream &os,const Pattern &p) {
    os << p.trigram
       << "_" << p.strich_checking
       << "_" << p.penalty
       << "_" << p.penalty2;
    return os;
  }

  /// Reads the four fields as whitespace-separated values, in the order
  /// trigram, strich_checking, penalty, penalty2. Note that this is NOT
  /// the inverse of operator<<, which uses '_' as separator.
  friend istream& operator >> (istream &is,Pattern &pat) {
    is >> pat.trigram >> pat.strich_checking >> pat.penalty >> pat.penalty2;
    return is;
  }
};
66 struct Test
68 string sentence;
69 Items items;
70 vector<uint> pos;
71 vector<uint> len;
72 set<uint> error;
73 int corrects,positives,candidates;
74 bool syllable_checked,word_checked;
75 ostream *os;
// Test currently being checked. check_pattern() points this at each element
// of `tests` in turn; the MyText ui_* hooks read and update it.
78 static Test* mytest;
// Every test sentence generated so far: appended to by load_corpus(),
// emptied by the "empty" command in main().
79 vector<Test> tests;
// First part of the report file name written by check_pattern(); replaced by
// the "prefix" command in main().
80 static string prefix("pattern");
82 void check_pattern(Pattern &pat)
84 ostream *os;
85 vspell.set_penalty(pat.penalty);
86 vspell.set_penalty2(pat.penalty2);
87 vspell.set_trigram(pat.trigram);
88 vspell.set_strict_word_checking(pat.strich_checking);
89 uint i_corpus,n_corpus = tests.size();
90 cerr << "Pattern " << pat;
91 ostringstream oss;
92 oss << prefix << "." << pat;
93 os = new ofstream(oss.str().c_str());
94 (*os) << "#Pattern " << pat << endl;
95 (*os) << "#Output test_errors syllable_check word_check corrects positive_errors candidate errors" << endl;
96 (*os) << endl;
98 for (i_corpus = 0;i_corpus < n_corpus;i_corpus ++) {
99 mytest = &tests[i_corpus];
100 ostringstream *oss;
101 mytest->os = oss = new ostringstream();
102 mytest->corrects = mytest->positives = mytest->candidates = 0;
103 mytest->syllable_checked = mytest->word_checked = false;
104 (*os) << "#Sentence: " << mytest->sentence << endl;
105 vspell.check(mytest->sentence.c_str());
106 (*os) << boost::format("%d %d %d %d %d %d") %
107 mytest->error.size() %
108 mytest->syllable_checked %
109 mytest->word_checked %
110 mytest->corrects %
111 mytest->positives %
112 mytest->candidates
113 << endl;
114 mytest->os = NULL;
115 (*os) << oss->str() << endl;
116 delete oss;
118 delete os;
121 void save_corpus(const char *filename)
123 ofstream corpus(filename);
125 if (!corpus.is_open()) {
126 cerr << "could not open file" << endl;
127 return;
130 uint i,n = tests.size();
131 for (i = 0;i < n;i ++) {
132 if (tests[i].error.empty())
133 corpus << endl;
134 corpus << tests[i].sentence << endl;
137 void load_corpus(const char *filename)
139 ifstream corpus(filename);
141 if (!corpus.is_open()) {
142 cerr << "could not open file" << endl;
143 return;
146 string s;
147 while (getline(corpus,s)) {
148 if (s.empty() ||s[0] == '%')
149 continue;
151 vector<Item> items;
152 string::size_type p = 0;
153 while ((p = s.find('{',p)) != string::npos) {
154 string::size_type p2 = s.find('}',p);
155 if (p2 == string::npos)
156 continue;
157 Item item;
158 item.pos = p;
159 item.len = p2-p+1;
160 string s2 = s.substr(item.pos+1,item.len-2);
161 while (!s2.empty()) {
162 p = s2.find(',');
163 if (p == string::npos)
164 p = s2.size();
165 item.candidates.push_back(s2.substr(0,p));
166 s2.erase(0,p);
167 while (!s2.empty() && s2[0] == ',')
168 s2.erase(0,1);
170 items.push_back(item);
171 p = p2;
174 CGen cg;
175 vector<uint> limits,pos;
176 int i,n = items.size();
177 limits.resize(n);
178 for (i = 0;i < n;i ++)
179 limits[i] = items[i].candidates.size();
181 cg.init(limits);
182 while (cg.step(pos)) {
183 Test test;
184 test.pos.resize(n);
185 test.len.resize(n);
186 p = 0;
187 for (i = 0;i < n;i ++) {
188 test.sentence += s.substr(p,items[i].pos-p);
189 p = items[i].pos+items[i].len;
190 test.pos[i] = test.sentence.size();
191 test.sentence += items[i].candidates[pos[i]];
192 test.len[i] = items[i].candidates[pos[i]].size();
193 if (pos[i]) test.error.insert(i);
195 test.sentence += s.substr(p);
196 test.items = items;
197 tests.push_back(test);
199 cg.done();
201 cerr << "Tests: " << tests.size() << endl;
204 int main(int argc,char **argv)
206 string s;
209 vector<Pattern> patterns;
211 ifstream rules("vspell-check.rules");
212 if (rules.is_open()) {
213 while (getline(rules,s)) {
214 if (s.empty() || s[0] == '#')
215 continue;
216 Pattern pat;
217 if (sscanf(s.c_str(),"%d %d %d %f",
218 &pat.trigram,
219 &pat.normalization,
220 &pat.strich_checking,
221 &pat.penalty) == 4)
222 patterns.push_back(pat);
223 else
224 cerr << "Error pattern " << s << endl;
227 cerr << "Patterns: " << patterns.size() << endl;
230 vspell.init();
232 uint i_pat,n_pat = patterns.size();
233 for (i_pat = 0;i_pat < n_pat;i_pat ++) {
234 Pattern &pat = patterns[i_pat];
235 check_pattern(pat);
238 Pattern pat;
239 string line;
240 while (getline(cin,line)) {
241 if (line[0] == '#')
242 continue;
243 istringstream is(line);
244 is >> s;
245 if (s == "load") {
246 string filename;
247 is >> filename;
248 load_corpus(filename.c_str());
249 } else if (s == "empty") {
250 tests.clear();
251 } else if (s == "run" ) {
252 is >> pat;
253 check_pattern(pat);
254 } else if (s == "save") {
255 string filename;
256 is >> filename;
257 save_corpus(filename.c_str());
258 } else if (s == "prefix") {
259 is >> prefix;
260 } else
261 cerr << "unknown command:" << s << " ";
262 cerr << "done" << endl;
266 bool MyText::ui_syllable_check()
268 unsigned i,n = suggestions.size();
269 unsigned ii,nn;
270 for (i = 0;i < n;i ++) {
271 int from,len;
272 from = (*st)[suggestions[i].id].start;
273 len = strlen(get_ngram()[(*st)[suggestions[i].id].id]);
274 int utf8_from,utf8_len;
275 utf8_from = utf8_pos(from);
276 utf8_len = utf8_pos(from+len)-utf8_from;
277 nn = mytest->items.size();
278 for (ii = 0;ii < nn;ii ++)
279 if (mytest->error.find(ii) != mytest->error.end() &&
280 utf8_from == mytest->pos[ii] &&
281 utf8_len == mytest->len[ii])
282 break;
283 if (ii == nn) {
284 mytest->positives ++;
285 (*mytest->os) << "##SP " << utf8_from << "-" << utf8_len
286 << "(" << from << "-" << len << ")" << endl;
287 for (ii = 0;ii < nn;ii ++)
288 (*mytest->os) << "## " << mytest->pos[ii] << "-" << mytest->len[ii] << endl;
289 continue;
293 vector<string> candidates;
294 Candidates c;
295 candidates_reset();
296 get_syllable_candidates(get_ngram()[st[suggestions[i].id].id],c);
297 c.get_list(candidates);
299 mytest->corrects ++;
301 mytest->syllable_checked = true;
302 return true;
// Scores the word-level suggestions against the current test (`mytest`).
//
// First logs the current segmentation as one "##W:" line. Then, for every
// suggestion, computes the span it covers and looks for a seeded error lying
// completely inside that span:
//   - none found: counted in `positives` and logged as "##WP";
//   - found and the suggested word equals the expected text: counted in
//     `corrects`;
//   - found but the text differs: counted in `candidates` and logged as "##WC".
// Marks the test as word-checked and always returns true.
305 bool MyText::ui_word_check()
307 unsigned i,n = suggestions.size();
308 int pos,pos2,count;
309 unsigned ii,nn;
// Log the segmentation: segments are separated by " ", the syllables inside a
// segment are joined with "_".
311 nn = seg.size();
312 ostringstream oss;
313 oss << "##W:";
314 for (ii = 0;ii < nn;ii ++) {
315 std::vector<strid> syll;
// NOTE(review): this goes through `.node.node->` while the code below uses
// `.node->` on the same kind of element - confirm both are intended.
316 seg[ii].node.node->get_syllables(syll);
// This `i` is local to the inner loop and hides the function-level `i`.
317 for (std::vector<strid>::size_type i = 0;i < syll.size();i ++) {
318 oss << (i > 0 ? "_" : " ");
319 oss << sarch[syll[i]];
322 oss << endl;
323 (*mytest->os) << viet_to_utf8(oss.str().c_str());
325 for (i = 0;i < n;i ++) {
326 // query
// count: number of syllables in the suggested segment; pos/pos2: indexes of
// its first and last syllable in *st.
327 count = seg[suggestions[i].id].node->get_syllable_count();
328 pos = (*seg.we)[seg[suggestions[i].id].id].pos;
329 pos2 = pos+count-1;
// from/len: span from the start of the first syllable to the end of the last.
330 int from,len;
331 from = (*st)[pos].start;
332 len = (*st)[pos2].start+strlen(get_ngram()[(*st)[pos2].id])-from;
// The same span converted with utf8_pos(), for comparison with the test's
// pos/len values.
333 int utf8_from,utf8_len;
334 utf8_from = utf8_pos(from);
335 utf8_len = utf8_pos(from+len)-utf8_from;
// Find the first seeded error lying completely inside the suggested span.
336 nn = mytest->items.size();
337 for (ii = 0;ii < nn;ii ++)
338 if (mytest->error.find(ii) != mytest->error.end() &&
339 utf8_from <= mytest->pos[ii] &&
340 utf8_from + utf8_len >= mytest->pos[ii] + mytest->len[ii])
341 break;
// No seeded error inside the span: the checker flagged a word nobody broke.
342 if (ii == nn) {
343 mytest->positives ++;
344 (*mytest->os) << "##WP " << utf8_from << "-" << utf8_len
345 << "(" << from << "-" << len << ")"
346 << endl;
347 continue;
// Expected text: the flagged part of the sentence with the seeded error
// replaced by the first candidate of its item (load_corpus() records every
// other candidate as an error).
350 string s = mytest->sentence.substr(utf8_from,utf8_len);
351 s.replace(mytest->pos[ii]-utf8_from,mytest->len[ii],mytest->items[ii].candidates[0]);
353 if (s == word_to_utf8(suggestions[i].id).c_str())
354 mytest->corrects ++;
355 else {
356 mytest->candidates ++;
357 // (*os) << "# " << mytest->sentence << endl;
// NOTE(review): the format string has seven placeholders but only six
// arguments are visible; the operand after the trailing `%` on line 364 is
// missing from this view - restore it from the repository before building.
358 (*mytest->os) << boost::format("##WC %d-%d(%d-%d) %s-%s (%s)" ) %
359 utf8_from %
360 utf8_len %
361 from %
362 len %
363 mytest->items[ii].candidates[0] %
364 word_to_utf8(suggestions[i].id) %
366 << endl;
369 mytest->word_checked = true;
370 return true;
373 bool MyText::word_check()
375 bool ret = Text::word_check();
376 return ret;
379 bool MyText::syllable_check()
381 bool ret = Text::syllable_check();
382 return ret;
385 string MyText::word_to_utf8(unsigned seg_id)
387 vector<strid> sylls;
388 string s;
389 seg[seg_id].node->get_syllables(sylls);
390 int i,n = sylls.size();
391 for (i = 0;i < n;i ++) {
392 if (i)
393 s += " ";
394 Syllable syll;
395 syll.parse(get_ngram()[sylls[i]]);
396 s += viet_to_utf8(syll.to_str().c_str());
398 return s;