1 #include <stdlib.h> // -*- tab-width:2 coding: viscii mode: c++ -*-
11 #include <boost/format.hpp>
// Text subclass used by this test driver. It is constructed from a VSpell
// pointer that is forwarded to the Text base, and declares syllable_check(),
// the virtual ui_syllable_check()/ui_word_check() callbacks, and a
// word_to_utf8() helper that takes a segment id and returns a string.
16 class MyText
: public Text
19 MyText(VSpell
* vs
):Text(vs
) {}
22 bool syllable_check();
25 virtual bool ui_syllable_check();
26 virtual bool ui_word_check();
27 string
word_to_utf8(unsigned seg_id
);
// TextFactory implementation: create() returns a MyText allocated with new
// and bound to the given VSpell instance.
// NOTE(review): the result is a raw pointer; who deletes it is not visible here.
30 class MyTextFactory
: public TextFactory
33 Text
* create(VSpell
*vs
) const {
34 return new MyText(vs
);
// File-local factory instance, and the VSpell engine constructed from it.
38 static MyTextFactory myfactory
;
39 static VSpell
vspell(myfactory
);
// List of candidate strings.
// NOTE(review): presumably a member of Item (load_corpus fills
// item.candidates) -- confirm against the enclosing declaration.
44 vector
<string
> candidates
;
// Items is a vector of Item.
47 typedef vector
<Item
> Items
;
// Checker parameters of one pattern. check_pattern() hands them to vspell via
// set_trigram(), set_strict_word_checking(), set_penalty() and set_penalty2().
51 int trigram
,strich_checking
;
52 float penalty
,penalty2
;
// Stream output of a Pattern. The visible part writes "_" followed by
// strich_checking; check_pattern() uses this text in its output file name.
53 friend ostream
& operator << (ostream
&os
,const Pattern
&p
) {
55 << "_" << p
.strich_checking
// Stream input of a Pattern: reads trigram, strich_checking, penalty and
// penalty2, in that order.
60 friend istream
& operator >> (istream
&is
,Pattern
&pat
) {
61 is
>> pat
.trigram
>> pat
.strich_checking
>> pat
.penalty
>> pat
.penalty2
;
// Result counters.
// NOTE(review): presumably members of the per-test record; check_pattern()
// resets mytest->corrects/positives/candidates to 0 before each check.
73 int corrects
,positives
,candidates
;
// Flags reset to false by check_pattern() and set to true by
// ui_syllable_check() / ui_word_check() respectively.
74 bool syllable_checked
,word_checked
;
// Prefix of the report file name; check_pattern() writes to "<prefix>.<pattern>".
80 static string
prefix("pattern");
// Runs the checker over every entry of `tests` using the parameters in `pat`
// and writes a report to a file named "<prefix>.<pat>".
// NOTE(review): `oss` appears both as a stream object (file name) and as a
// pointer (per-test log); its declarations are not visible here -- confirm.
82 void check_pattern(Pattern
&pat
)
// Push the pattern's parameters into the shared vspell engine.
85 vspell
.set_penalty(pat
.penalty
);
86 vspell
.set_penalty2(pat
.penalty2
);
87 vspell
.set_trigram(pat
.trigram
);
88 vspell
.set_strict_word_checking(pat
.strich_checking
);
89 uint i_corpus
,n_corpus
= tests
.size();
90 cerr
<< "Pattern " << pat
;
// Build the report file name from the prefix and the pattern's text form.
92 oss
<< prefix
<< "." << pat
;
// NOTE(review): allocated with new; the matching delete is not visible here.
93 os
= new ofstream(oss
.str().c_str());
// Two '#' header lines: the pattern, then the column names of the result rows.
94 (*os
) << "#Pattern " << pat
<< endl
;
95 (*os
) << "#Output test_errors syllable_check word_check corrects positive_errors candidate errors" << endl
;
98 for (i_corpus
= 0;i_corpus
< n_corpus
;i_corpus
++) {
// `mytest` is the test currently being checked; the MyText callbacks read it.
99 mytest
= &tests
[i_corpus
];
// Fresh per-test log stream; its content is appended after the result row.
101 mytest
->os
= oss
= new ostringstream();
// Reset the counters and stage flags before running the checker.
102 mytest
->corrects
= mytest
->positives
= mytest
->candidates
= 0;
103 mytest
->syllable_checked
= mytest
->word_checked
= false;
104 (*os
) << "#Sentence: " << mytest
->sentence
<< endl
;
105 vspell
.check(mytest
->sentence
.c_str());
// Result row: size of the test's error set, then the two stage flags, ...
106 (*os
) << boost::format("%d %d %d %d %d %d") %
107 mytest
->error
.size() %
108 mytest
->syllable_checked
%
109 mytest
->word_checked
%
// Append the per-test log collected by the callbacks.
115 (*os
) << oss
->str() << endl
;
// Writes test sentences to `filename`, one per line.
// Prints "could not open file" to cerr when the file cannot be opened.
121 void save_corpus(const char *filename
)
123 ofstream
corpus(filename
);
125 if (!corpus
.is_open()) {
126 cerr
<< "could not open file" << endl
;
130 uint i
,n
= tests
.size();
131 for (i
= 0;i
< n
;i
++) {
// NOTE(review): tests are filtered on whether their error set is empty;
// confirm which of the two cases is the one that gets written.
132 if (tests
[i
].error
.empty())
134 corpus
<< tests
[i
].sentence
<< endl
;
// Reads a corpus file line by line and expands every line into test
// sentences appended to `tests`. A line may contain "{...}" groups holding
// alternative candidates; one test is generated per combination of
// candidates. Prints "could not open file" to cerr when the file cannot be
// opened, and the total number of tests at the end.
137 void load_corpus(const char *filename
)
139 ifstream
corpus(filename
);
141 if (!corpus
.is_open()) {
142 cerr
<< "could not open file" << endl
;
147 while (getline(corpus
,s
)) {
// Empty lines and lines starting with '%' are handled specially
// (presumably skipped -- confirm).
148 if (s
.empty() ||s
[0] == '%')
// Locate each "{...}" group in the line.
152 string::size_type p
= 0;
153 while ((p
= s
.find('{',p
)) != string::npos
) {
154 string::size_type p2
= s
.find('}',p
);
155 if (p2
== string::npos
)
// Text of the group without its first and last character
// (assumes item.pos/item.len span the braces -- confirm).
160 string s2
= s
.substr(item
.pos
+1,item
.len
-2);
// Split s2 into candidates; leading ',' characters are consumed between them.
161 while (!s2
.empty()) {
163 if (p
== string::npos
)
165 item
.candidates
.push_back(s2
.substr(0,p
));
167 while (!s2
.empty() && s2
[0] == ',')
170 items
.push_back(item
);
// limits[i] is the number of candidates of item i; pos[i] is the candidate
// chosen for item i in the current combination.
// NOTE(review): confirm both vectors are sized to n before being indexed.
175 vector
<uint
> limits
,pos
;
176 int i
,n
= items
.size();
178 for (i
= 0;i
< n
;i
++)
179 limits
[i
] = items
[i
].candidates
.size();
// Each step of cg yields the next combination in `pos`.
182 while (cg
.step(pos
)) {
187 for (i
= 0;i
< n
;i
++) {
// Copy the text between the previous group and this one, then skip the group.
188 test
.sentence
+= s
.substr(p
,items
[i
].pos
-p
);
189 p
= items
[i
].pos
+items
[i
].len
;
// Record where the chosen candidate starts in the generated sentence,
// append it, and record its length.
190 test
.pos
[i
] = test
.sentence
.size();
191 test
.sentence
+= items
[i
].candidates
[pos
[i
]];
192 test
.len
[i
] = items
[i
].candidates
[pos
[i
]].size();
// Any candidate other than the first one marks item i as an error.
193 if (pos
[i
]) test
.error
.insert(i
);
// Append the text after the last group.
195 test
.sentence
+= s
.substr(p
);
197 tests
.push_back(test
);
201 cerr
<< "Tests: " << tests
.size() << endl
;
// Entry point. Loads patterns from "vspell-check.rules" when that file can be
// opened, then reads commands from standard input, one line at a time.
// Visible commands: "empty", "run", "save", "prefix", plus one that calls
// load_corpus(); anything else is reported as "unknown command:".
204 int main(int argc
,char **argv
)
209 vector<Pattern> patterns;
211 ifstream rules("vspell-check.rules");
212 if (rules.is_open()) {
213 while (getline(rules,s)) {
// Empty lines and lines starting with '#' are handled specially
// (presumably skipped -- confirm).
214 if (s.empty() || s[0] == '#')
217 if (sscanf(s.c_str(),"%d %d %d %f",
220 &pat.strich_checking,
222 patterns.push_back(pat);
// Lines that fail to parse are reported on cerr.
224 cerr << "Error pattern " << s << endl;
227 cerr << "Patterns: " << patterns.size() << endl;
232 uint i_pat,n_pat = patterns.size();
233 for (i_pat = 0;i_pat < n_pat;i_pat ++) {
234 Pattern &pat = patterns[i_pat];
// Command loop: each input line is parsed through an istringstream.
240 while (getline(cin
,line
)) {
243 istringstream
is(line
);
248 load_corpus(filename
.c_str());
249 } else if (s
== "empty") {
251 } else if (s
== "run" ) {
254 } else if (s
== "save") {
257 save_corpus(filename
.c_str());
258 } else if (s
== "prefix") {
261 cerr
<< "unknown command:" << s
<< " ";
262 cerr
<< "done" << endl
;
// Syllable-stage callback. For each suggestion it computes the span of the
// suggested syllable, converts it with utf8_pos(), and compares it with the
// error spans recorded in the current test (`mytest`). It logs to mytest->os
// and finally marks the test as syllable-checked.
266 bool MyText::ui_syllable_check()
268 unsigned i
,n
= suggestions
.size();
270 for (i
= 0;i
< n
;i
++) {
// Start offset of the suggested syllable and the length of its ngram string.
272 from
= (*st
)[suggestions
[i
].id
].start
;
273 len
= strlen(get_ngram()[(*st
)[suggestions
[i
].id
].id
]);
// Same span expressed through utf8_pos(); these values are compared with
// mytest->pos/len (presumably UTF-8 byte offsets -- confirm).
274 int utf8_from
,utf8_len
;
275 utf8_from
= utf8_pos(from
);
276 utf8_len
= utf8_pos(from
+len
)-utf8_from
;
// Look for an item that is in the error set and whose position and length
// match the suggestion exactly.
277 nn
= mytest
->items
.size();
278 for (ii
= 0;ii
< nn
;ii
++)
279 if (mytest
->error
.find(ii
) != mytest
->error
.end() &&
280 utf8_from
== mytest
->pos
[ii
] &&
281 utf8_len
== mytest
->len
[ii
])
// NOTE(review): the condition guarding this counter is not visible here.
284 mytest
->positives
++;
// Log the suggestion span ("##SP"), then the span of every item ("##").
285 (*mytest
->os
) << "##SP " << utf8_from
<< "-" << utf8_len
286 << "(" << from
<< "-" << len
<< ")" << endl
;
287 for (ii
= 0;ii
< nn
;ii
++)
288 (*mytest
->os
) << "## " << mytest
->pos
[ii
] << "-" << mytest
->len
[ii
] << endl
;
// Collect the candidate list for the suggested syllable.
293 vector<string> candidates;
296 get_syllable_candidates(get_ngram()[st[suggestions[i].id].id],c);
297 c.get_list(candidates);
// Record that the syllable stage ran for this test.
301 mytest
->syllable_checked
= true;
// Word-stage callback. It first logs the segmentation to mytest->os, then for
// each suggestion computes the span of the suggested word and compares it
// with the error spans recorded in the current test (`mytest`). Finally it
// marks the test as word-checked.
305 bool MyText::ui_word_check()
307 unsigned i
,n
= suggestions
.size();
// Dump the segmentation: the first syllable of a word is preceded by " ",
// the following ones by "_".
314 for (ii
= 0;ii
< nn
;ii
++) {
315 std::vector
<strid
> syll
;
316 seg
[ii
].node
.node
->get_syllables(syll
);
// This loop declares its own `i`, shadowing the function-level `i`.
317 for (std::vector
<strid
>::size_type i
= 0;i
< syll
.size();i
++) {
318 oss
<< (i
> 0 ? "_" : " ");
319 oss
<< sarch
[syll
[i
]];
323 (*mytest
->os
) << viet_to_utf8(oss
.str().c_str());
325 for (i
= 0;i
< n
;i
++) {
// Syllable count of the suggested word and the position of the word.
327 count
= seg
[suggestions
[i
].id
].node
->get_syllable_count();
328 pos
= (*seg
.we
)[seg
[suggestions
[i
].id
].id
].pos
;
// Span from the start of syllable `pos` to the end of syllable `pos2`.
331 from
= (*st
)[pos
].start
;
332 len
= (*st
)[pos2
].start
+strlen(get_ngram()[(*st
)[pos2
].id
])-from
;
// Same span expressed through utf8_pos().
333 int utf8_from
,utf8_len
;
334 utf8_from
= utf8_pos(from
);
335 utf8_len
= utf8_pos(from
+len
)-utf8_from
;
// Look for an item that is in the error set and whose span lies entirely
// inside the suggested word's span.
336 nn
= mytest
->items
.size();
337 for (ii
= 0;ii
< nn
;ii
++)
338 if (mytest
->error
.find(ii
) != mytest
->error
.end() &&
339 utf8_from
<= mytest
->pos
[ii
] &&
340 utf8_from
+ utf8_len
>= mytest
->pos
[ii
] + mytest
->len
[ii
])
// NOTE(review): the condition guarding this counter is not visible here.
343 mytest
->positives
++;
344 (*mytest
->os
) << "##WP " << utf8_from
<< "-" << utf8_len
345 << "(" << from
<< "-" << len
<< ")"
// Take the suggested span from the test sentence and put the item's first
// candidate in place of the text at item ii, then compare the result with
// the checker's suggestion.
350 string s
= mytest
->sentence
.substr(utf8_from
,utf8_len
);
351 s
.replace(mytest
->pos
[ii
]-utf8_from
,mytest
->len
[ii
],mytest
->items
[ii
].candidates
[0]);
353 if (s
== word_to_utf8(suggestions
[i
].id
).c_str())
356 mytest
->candidates
++;
357 // (*os) << "# " << mytest->sentence << endl;
358 (*mytest
->os
) << boost::format("##WC %d-%d(%d-%d) %s-%s (%s)" ) %
363 mytest
->items
[ii
].candidates
[0] %
364 word_to_utf8(suggestions
[i
].id
) %
// Record that the word stage ran for this test.
369 mytest
->word_checked
= true;
// Calls the base-class Text::word_check() and keeps its result in `ret`.
373 bool MyText::word_check()
375 bool ret
= Text::word_check();
// Calls the base-class Text::syllable_check() and keeps its result in `ret`.
379 bool MyText::syllable_check()
381 bool ret
= Text::syllable_check();
385 string
MyText::word_to_utf8(unsigned seg_id
)
389 seg
[seg_id
].node
->get_syllables(sylls
);
390 int i
,n
= sylls
.size();
391 for (i
= 0;i
< n
;i
++) {
395 syll
.parse(get_ngram()[sylls
[i
]]);
396 s
+= viet_to_utf8(syll
.to_str().c_str());