terrible bug in PenaltyDAG and Penalty2DAG.
[vspell.git] / tests / vspell-check.cpp
blob27f13e351dd095a45af5a888ecfb4a629e4e3e91
1 #include <stdlib.h> // -*- tab-width:2 coding: viscii mode: c++ -*-
2 #include <stdio.h>
3 #include <string.h>
4 #include <sstream>
5 #include <fstream>
6 #include "config.h"
7 #include <spell.h>
8 #include <vspell.h>
9 #include <syllable.h>
10 #include <cgen.h>
11 #include <boost/format.hpp>
12 #include <set>
14 using namespace std;
16 class MyText : public Text
18 public:
19 MyText(VSpell* vs):Text(vs) {}
21 bool word_check();
22 bool syllable_check();
24 protected:
25 virtual bool ui_syllable_check();
26 virtual bool ui_word_check();
27 string word_to_utf8(unsigned seg_id);
30 class MyTextFactory : public TextFactory
32 public:
33 Text* create(VSpell *vs) const {
34 return new MyText(vs);
38 static MyTextFactory myfactory;
39 static VSpell vspell(myfactory);
/**
 * One "{a,b,...}" alternative group found in a corpus line.
 */
struct Item
{
  int pos;  // offset of the opening '{' within the corpus line
  int len;  // length of the group, both braces included
  std::vector<std::string> candidates;  // comma-separated alternatives, in file order
};

// All alternative groups of one corpus line, in order of appearance.
typedef std::vector<Item> Items;
/**
 * One parameter combination handed to the spell checker by check_pattern().
 *
 * Streams out as the five fields joined by '_' (used to build the report
 * file name) and streams in as five whitespace-separated values.
 */
struct Pattern
{
  int trigram, normalization, strich_checking;
  float penalty, penalty2;

  // Zero every field so a Pattern that was only partially filled in (or not
  // filled in at all because parsing failed) never carries indeterminate
  // values into the checker.
  Pattern()
    : trigram(0), normalization(0), strich_checking(0),
      penalty(0), penalty2(0) {}

  friend std::ostream& operator << (std::ostream &os, const Pattern &p) {
    os << p.trigram
       << "_" << p.normalization
       << "_" << p.strich_checking
       << "_" << p.penalty
       << "_" << p.penalty2;
    return os;
  }

  friend std::istream& operator >> (std::istream &is, Pattern &pat) {
    is >> pat.trigram >> pat.normalization >> pat.strich_checking
       >> pat.penalty >> pat.penalty2;
    return is;
  }
};
67 struct Test
69 string sentence;
70 Items items;
71 vector<uint> pos;
72 vector<uint> len;
73 set<uint> error;
74 int corrects,positives;
75 bool syllable_checked,word_checked;
78 static Test* mytest;
79 static ostream *os;
80 vector<Test> tests;
82 void check_pattern(Pattern &pat)
84 vspell.set_penalty(pat.penalty);
85 vspell.set_penalty2(pat.penalty2);
86 vspell.set_normalization(pat.normalization);
87 vspell.set_trigram(pat.trigram);
88 vspell.set_strict_word_checking(pat.strich_checking);
89 uint i_corpus,n_corpus = tests.size();
90 cerr << "Pattern " << pat;
91 ostringstream oss;
92 oss << "pattern." << pat;
93 os = new ofstream(oss.str().c_str());
94 (*os) << "#Pattern " << pat << endl;
95 for (i_corpus = 0;i_corpus < n_corpus;i_corpus ++) {
96 mytest = &tests[i_corpus];
97 mytest->corrects = mytest->positives = 0;
98 mytest->syllable_checked = mytest->word_checked = false;
99 //(*os) << "#Sentence: " << mytest->sentence << endl;
100 vspell.check(tests[i_corpus].sentence.c_str());
101 (*os) << boost::format("%d %d %d %d %d") %
102 mytest->syllable_checked %
103 mytest->word_checked %
104 mytest->corrects %
105 mytest->error.size() %
106 mytest->positives
107 << endl;
109 delete os;
110 cerr << "done" << endl;
113 void save_corpus(const char *filename)
115 ofstream corpus(filename);
117 if (!corpus.is_open()) {
118 cerr << "could not open file" << endl;
119 return;
122 uint i,n = tests.size();
123 for (i = 0;i < n;i ++) {
124 if (tests[i].error.empty())
125 corpus << endl;
126 corpus << tests[i].sentence << endl;
129 void load_corpus(const char *filename)
131 ifstream corpus(filename);
133 if (!corpus.is_open()) {
134 cerr << "could not open file" << endl;
135 return;
138 string s;
139 while (getline(corpus,s)) {
140 if (s.empty() ||s[0] == '%')
141 continue;
143 vector<Item> items;
144 string::size_type p = 0;
145 while ((p = s.find('{',p)) != string::npos) {
146 string::size_type p2 = s.find('}',p);
147 if (p2 == string::npos)
148 continue;
149 Item item;
150 item.pos = p;
151 item.len = p2-p+1;
152 string s2 = s.substr(item.pos+1,item.len-2);
153 while (!s2.empty()) {
154 p = s2.find(',');
155 if (p == string::npos)
156 p = s2.size();
157 item.candidates.push_back(s2.substr(0,p));
158 s2.erase(0,p);
159 while (!s2.empty() && s2[0] == ',')
160 s2.erase(0,1);
162 items.push_back(item);
163 p = p2;
166 CGen cg;
167 vector<uint> limits,pos;
168 int i,n = items.size();
169 limits.resize(n);
170 for (i = 0;i < n;i ++)
171 limits[i] = items[i].candidates.size();
173 cg.init(limits);
174 while (cg.step(pos)) {
175 Test test;
176 test.pos.resize(n);
177 test.len.resize(n);
178 p = 0;
179 for (i = 0;i < n;i ++) {
180 test.sentence += s.substr(p,items[i].pos-p);
181 p = items[i].pos+items[i].len;
182 test.pos[i] = test.sentence.size();
183 test.sentence += items[i].candidates[pos[i]];
184 test.len[i] = items[i].candidates[pos[i]].size();
185 if (pos[i]) test.error.insert(i);
187 test.sentence += s.substr(p);
188 test.items = items;
189 tests.push_back(test);
191 cg.done();
193 cerr << "Tests: " << tests.size() << endl;
196 int main(int argc,char **argv)
198 string s;
201 vector<Pattern> patterns;
203 ifstream rules("vspell-check.rules");
204 if (rules.is_open()) {
205 while (getline(rules,s)) {
206 if (s.empty() || s[0] == '#')
207 continue;
208 Pattern pat;
209 if (sscanf(s.c_str(),"%d %d %d %f",
210 &pat.trigram,
211 &pat.normalization,
212 &pat.strich_checking,
213 &pat.penalty) == 4)
214 patterns.push_back(pat);
215 else
216 cerr << "Error pattern " << s << endl;
219 cerr << "Patterns: " << patterns.size() << endl;
222 vspell.init();
224 uint i_pat,n_pat = patterns.size();
225 for (i_pat = 0;i_pat < n_pat;i_pat ++) {
226 Pattern &pat = patterns[i_pat];
227 check_pattern(pat);
230 Pattern pat;
231 while (cin >> s) {
232 if (s == "load") {
233 string filename;
234 cin >> filename;
235 load_corpus(filename.c_str());
236 } else if (s == "empty") {
237 tests.clear();
238 } else if (s == "run" ) {
239 cin >> pat;
240 check_pattern(pat);
241 } else if (s == "save") {
242 string filename;
243 cin >> filename;
244 save_corpus(filename.c_str());
// Scores the syllable-level suggestions against the current test case.
//
// For every entry of `suggestions` the flagged syllable's span is
// converted to UTF-8 offsets and compared with the injected errors of
// `mytest`. Sets mytest->syllable_checked and returns true.
//
// NOTE(review): brace-only and very short lines are missing from this
// listing (see the gaps in the embedded numbering), so block boundaries
// and whether the logging / candidate code is commented out cannot be
// told from here -- confirm against the repository before editing.
249 bool MyText::ui_syllable_check()
251 unsigned i,n = suggestions.size();
252 unsigned ii,nn;
253 for (i = 0;i < n;i ++) {
// Span of the flagged syllable: start offset from `st`, length from the
// syllable's stored string.
254 int from,len;
255 from = st[suggestions[i].id].start;
256 len = strlen(get_sarch()[st[suggestions[i].id].id]);
// Same span expressed through utf8_pos(), to compare with Test::pos/len.
257 int utf8_from,utf8_len;
258 utf8_from = utf8_pos(from);
259 utf8_len = utf8_pos(from+len)-utf8_from;
// Look for an injected error (index present in mytest->error) that
// covers exactly this span; ii == nn afterwards means none matched.
260 nn = mytest->items.size();
261 for (ii = 0;ii < nn;ii ++)
262 if (mytest->error.find(ii) != mytest->error.end() &&
263 utf8_from == mytest->pos[ii] &&
264 utf8_len == mytest->len[ii])
265 break;
// No injected error at this span: count it in `positives`.
266 if (ii == nn) {
267 mytest->positives ++;
269 (*os) << "#Syllable " << utf8_from << "-" << utf8_len
270 << "(" << from << "-" << len << ")" << endl;
271 for (ii = 0;ii < nn;ii ++)
272 (*os) << "# " << mytest->pos[ii] << "-" << mytest->len[ii] << endl;
274 continue;
// The candidate list is fetched but not read afterwards in the visible code.
278 vector<string> candidates;
279 Candidates c;
280 candidates_reset();
281 get_syllable_candidates(get_sarch()[st[suggestions[i].id].id],c);
282 c.get_list(candidates);
// Reached only when the `continue` above was not taken.
284 mytest->corrects ++;
286 mytest->syllable_checked = true;
287 return true;
// Scores the word-level suggestions against the current test case.
//
// For every entry of `suggestions` the flagged word's span (first to last
// syllable) is converted to UTF-8 offsets and compared with the injected
// errors of `mytest`. Sets mytest->word_checked and returns true.
//
// NOTE(review): brace-only and very short lines are missing from this
// listing (see the gaps in the embedded numbering), so block boundaries
// and whether the "#Word" logging is commented out cannot be told from
// here -- confirm against the repository before editing.
290 bool MyText::ui_word_check()
292 unsigned i,n = suggestions.size();
293 int pos,pos2,count;
294 unsigned ii,nn;
296 for (i = 0;i < n;i ++) {
297 // query
// `pos`..`pos2` are the indices into `st` of the word's first and last
// syllable (count syllables in total).
298 count = seg[suggestions[i].id].node->get_syllable_count();
299 pos = (*seg.we)[seg[suggestions[i].id].id].pos;
300 pos2 = pos+count-1;
// Span from the start of the first syllable to the end of the last one.
301 int from,len;
302 from = st[pos].start;
303 len = st[pos2].start+strlen(get_sarch()[st[pos2].id])-from;
// Same span expressed through utf8_pos(), to compare with Test::pos/len.
304 int utf8_from,utf8_len;
305 utf8_from = utf8_pos(from);
306 utf8_len = utf8_pos(from+len)-utf8_from;
// Look for an injected error (index present in mytest->error) that
// covers exactly this span; ii == nn afterwards means none matched.
307 nn = mytest->items.size();
308 for (ii = 0;ii < nn;ii ++)
309 if (mytest->error.find(ii) != mytest->error.end() &&
310 utf8_from == mytest->pos[ii] &&
311 utf8_len == mytest->len[ii])
312 break;
// No injected error at this span: count it in `positives`.
313 if (ii == nn) {
314 mytest->positives ++;
316 (*os) << "#Word " << utf8_from << "-" << utf8_len
317 << "(" << from << "-" << len << ")"
318 << endl;
320 continue;
// The span matches injected error ii: it counts as correct only when the
// word's UTF-8 text (word_to_utf8) equals the group's first candidate;
// otherwise both strings are logged.
323 if (mytest->items[ii].candidates[0] == word_to_utf8(suggestions[i].id).c_str())
324 mytest->corrects ++;
325 else
326 (*os) << "#Word2 " << utf8_from << "-" << utf8_len
327 << "(" << from << "-" << len << ")"
328 << mytest->items[ii].candidates[0] << "-"
329 << word_to_utf8(suggestions[i].id)
330 << endl;
332 mytest->word_checked = true;
333 return true;
336 bool MyText::word_check()
338 bool ret = Text::word_check();
339 return ret;
342 bool MyText::syllable_check()
344 bool ret = Text::syllable_check();
345 return ret;
348 string MyText::word_to_utf8(unsigned seg_id)
350 vector<strid> sylls;
351 string s;
352 seg[seg_id].node->get_syllables(sylls);
353 int i,n = sylls.size();
354 for (i = 0;i < n;i ++) {
355 if (i)
356 s += " ";
357 Syllable syll;
358 syll.parse(get_sarch()[sylls[i]]);
359 s += viet_to_utf8(syll.to_str().c_str());
361 return s;