1 #include <stdlib.h> // -*- tab-width:2 coding: viscii mode: c++ -*-
11 #include <boost/format.hpp>
// MyText: a Text subclass used by this test driver.
// The constructor only forwards the VSpell pointer to the Text constructor.
// It declares syllable_check(), the virtual hooks ui_syllable_check() and
// ui_word_check(), and the helper word_to_utf8(seg_id) which returns a string.
// NOTE(review): this listing omits some original lines (braces, access
// specifiers, and possibly a word_check() declaration, since
// MyText::word_check is defined further down) -- confirm against the full file.
16 class MyText
: public Text
19 MyText(VSpell
* vs
):Text(vs
) {}
22 bool syllable_check();
25 virtual bool ui_syllable_check();
26 virtual bool ui_word_check();
27 string
word_to_utf8(unsigned seg_id
);
// MyTextFactory: a TextFactory whose create() returns a heap-allocated MyText
// built from the given VSpell pointer.
// NOTE(review): who deletes the returned Text is not visible in this listing.
30 class MyTextFactory
: public TextFactory
33 Text
* create(VSpell
*vs
) const {
34 return new MyText(vs
);
// File-local factory instance and the single VSpell object constructed with it.
// vspell is the instance configured and driven by check_pattern().
38 static MyTextFactory myfactory
;
39 static VSpell
vspell(myfactory
);
// List of candidate strings.
// NOTE(review): the enclosing struct header is not shown; presumably this is
// a field of Item, since load_corpus() fills item.candidates -- confirm.
44 vector
<string
> candidates
;
// Items: a vector of Item.
47 typedef vector
<Item
> Items
;
// Fields and stream operators of Pattern (the struct header itself is not
// shown in this listing). check_pattern() passes each field to the matching
// vspell.set_*() call.
51 int trigram
,normalization
,strich_checking
;
52 float penalty
,penalty2
;
// Output operator: writes the pattern's fields with "_" between them.
// Only the normalization and strich_checking parts are visible here.
53 friend ostream
& operator << (ostream
&os
,const Pattern
&p
) {
55 << "_" << p
.normalization
56 << "_" << p
.strich_checking
// Input operator: reads trigram, normalization, strich_checking, penalty and
// penalty2 from the stream, in that order.
61 friend istream
& operator >> (istream
&is
,Pattern
&pat
) {
62 is
>> pat
.trigram
>> pat
.normalization
>> pat
.strich_checking
>> pat
.penalty
>> pat
.penalty2
;
// Per-test result counters and flags (the enclosing struct header is not
// shown). check_pattern() resets all five before each check; the MyText
// ui_*_check() methods increment positives/candidates and set the
// *_checked flags.
74 int corrects
,positives
,candidates
;
75 bool syllable_checked
,word_checked
;
// Prefix of the per-pattern output file name; check_pattern() builds the
// name as prefix + "." + pattern. Defaults to "pattern".
81 static string
prefix("pattern");
// check_pattern: runs every entry of the global `tests` through vspell with
// the settings taken from `pat`, and writes results to a per-pattern output
// stream.
//   pat - the parameter set to apply (trigram, normalization, strict word
//         checking, penalty, penalty2).
// NOTE(review): several original lines (braces, the tail of the format
// argument list) are missing from this listing.
83 void check_pattern(Pattern
&pat
)
// Push the pattern's parameters into the shared vspell instance.
85 vspell
.set_penalty(pat
.penalty
);
86 vspell
.set_penalty2(pat
.penalty2
);
87 vspell
.set_normalization(pat
.normalization
);
88 vspell
.set_trigram(pat
.trigram
);
89 vspell
.set_strict_word_checking(pat
.strich_checking
);
90 uint i_corpus
,n_corpus
= tests
.size();
91 cerr
<< "Pattern " << pat
;
// Output file name is prefix + "." + the pattern as rendered by operator<<.
93 oss
<< prefix
<< "." << pat
;
// NOTE(review): the stream is allocated with new; no matching delete is
// visible in this listing -- confirm it is released.
94 os
= new ofstream(oss
.str().c_str());
95 (*os
) << "#Pattern " << pat
<< endl
;
96 for (i_corpus
= 0;i_corpus
< n_corpus
;i_corpus
++) {
// mytest points at the current test; the MyText methods read and update it.
97 mytest
= &tests
[i_corpus
];
// Reset the per-test counters and flags before checking this sentence.
98 mytest
->corrects
= mytest
->positives
= mytest
->candidates
= 0;
99 mytest
->syllable_checked
= mytest
->word_checked
= false;
100 //(*os) << "#Sentence: " << mytest->sentence << endl;
101 vspell
.check(tests
[i_corpus
].sentence
.c_str());
// One result line per test. The format has six %d fields; only some of the
// arguments (syllable_checked, word_checked, error.size()) appear here.
102 (*os
) << boost::format("%d %d %d %d %d %d") %
103 mytest
->syllable_checked
%
104 mytest
->word_checked
%
106 mytest
->error
.size() %
// save_corpus: writes sentences from the global `tests` to a file, one per line.
//   filename - path of the file to write.
// Prints "could not open file" to cerr when the file cannot be opened.
// NOTE(review): original line 126, between the error.empty() test and the
// write, is missing from this listing, so it is not visible here which tests
// (empty or non-empty error set) are written -- confirm.
114 void save_corpus(const char *filename
)
116 ofstream
corpus(filename
);
118 if (!corpus
.is_open()) {
119 cerr
<< "could not open file" << endl
;
123 uint i
,n
= tests
.size();
124 for (i
= 0;i
< n
;i
++) {
125 if (tests
[i
].error
.empty())
127 corpus
<< tests
[i
].sentence
<< endl
;
// load_corpus: reads a corpus file and expands each line into test sentences
// appended to the global `tests`.
//   filename - path of the corpus file to read.
// Prints "could not open file" to cerr when the file cannot be opened.
// A line contains "{...}" groups; the text inside a group is split into
// candidate strings. For every combination of candidate choices a test
// sentence is built by splicing the chosen candidates into the literal text.
// NOTE(review): many original lines (braces, continue/break statements,
// several declarations and assignments) are missing from this listing.
130 void load_corpus(const char *filename
)
132 ifstream
corpus(filename
);
134 if (!corpus
.is_open()) {
135 cerr
<< "could not open file" << endl
;
140 while (getline(corpus
,s
)) {
// Empty lines and lines starting with '%' get special handling; the action
// (presumably a skip) is on a line not shown here.
141 if (s
.empty() ||s
[0] == '%')
145 string::size_type p
= 0;
// Scan the line for '{' ... '}' groups.
146 while ((p
= s
.find('{',p
)) != string::npos
) {
147 string::size_type p2
= s
.find('}',p
);
148 if (p2
== string::npos
)
// s2 is the substring one character inside item.pos/item.len on each side,
// i.e. the group without its braces if pos/len span the braces (their
// assignment is not shown).
153 string s2
= s
.substr(item
.pos
+1,item
.len
-2);
154 while (!s2
.empty()) {
156 if (p
== string::npos
)
158 item
.candidates
.push_back(s2
.substr(0,p
));
// Leading ',' characters are handled here (the loop body is not shown).
160 while (!s2
.empty() && s2
[0] == ',')
163 items
.push_back(item
);
168 vector
<uint
> limits
,pos
;
169 int i
,n
= items
.size();
// limits[i] is the number of candidates of item i.
// NOTE(review): no resize of limits is visible before this indexed write;
// presumably it happens on a missing line -- confirm.
171 for (i
= 0;i
< n
;i
++)
172 limits
[i
] = items
[i
].candidates
.size();
// Each successful cg.step(pos) supplies the candidate index pos[i] to use
// for item i in the next generated sentence.
175 while (cg
.step(pos
)) {
180 for (i
= 0;i
< n
;i
++) {
// Copy the literal text before item i, then advance p past the item.
181 test
.sentence
+= s
.substr(p
,items
[i
].pos
-p
);
182 p
= items
[i
].pos
+items
[i
].len
;
// Record where the chosen candidate lands in the generated sentence.
183 test
.pos
[i
] = test
.sentence
.size();
184 test
.sentence
+= items
[i
].candidates
[pos
[i
]];
185 test
.len
[i
] = items
[i
].candidates
[pos
[i
]].size();
// A non-zero candidate index marks item i as an error in this test.
186 if (pos
[i
]) test
.error
.insert(i
);
// Append whatever follows the last item.
188 test
.sentence
+= s
.substr(p
);
190 tests
.push_back(test
);
194 cerr
<< "Tests: " << tests
.size() << endl
;
// main: loads patterns from "vspell-check.rules" (when it can be opened),
// then reads commands line by line from standard input.
// Visible command branches: one that calls load_corpus(filename), "empty",
// "run", "save" (calls save_corpus(filename)) and "prefix". Anything else
// prints "unknown command:" followed by the command to cerr.
// NOTE(review): the bodies of most branches and the first command's name
// are on lines missing from this listing.
197 int main(int argc
,char **argv
)
202 vector<Pattern> patterns;
204 ifstream rules("vspell-check.rules");
205 if (rules.is_open()) {
206 while (getline(rules,s)) {
// Empty lines and lines starting with '#' get special handling; the action
// (presumably a skip) is on a line not shown here.
207 if (s.empty() || s[0] == '#')
// NOTE(review): the format has four conversions while Pattern has five
// fields (penalty2 is not among the conversions) -- confirm penalty2 is
// initialized elsewhere.
210 if (sscanf(s.c_str(),"%d %d %d %f",
213 &pat.strich_checking,
215 patterns.push_back(pat);
217 cerr << "Error pattern " << s << endl;
220 cerr << "Patterns: " << patterns.size() << endl;
225 uint i_pat,n_pat = patterns.size();
226 for (i_pat = 0;i_pat < n_pat;i_pat ++) {
227 Pattern &pat = patterns[i_pat];
// Command loop: one command per input line.
233 while (getline(cin
,line
)) {
236 istringstream
is(line
);
241 load_corpus(filename
.c_str());
242 } else if (s
== "empty") {
244 } else if (s
== "run" ) {
247 } else if (s
== "save") {
250 save_corpus(filename
.c_str());
251 } else if (s
== "prefix") {
254 cerr
<< "unknown command:" << s
<< " ";
255 cerr
<< "done" << endl
;
// MyText::ui_syllable_check: walks the current syllable suggestions and
// compares each one against the items of the current test (mytest).
// For every suggestion it computes the span (from, len) in the internal text,
// converts it with utf8_pos() to (utf8_from, utf8_len), and tests it against
// the test items marked as errors, requiring an exact match of position and
// length. It also writes "#"-prefixed trace lines to *os and sets
// mytest->syllable_checked to true.
// NOTE(review): the return statement and the lines between the match
// condition and `positives++` are missing from this listing.
259 bool MyText::ui_syllable_check()
261 unsigned i
,n
= suggestions
.size();
263 for (i
= 0;i
< n
;i
++) {
// Start offset and length of the suggested syllable in the internal text.
265 from
= st
[suggestions
[i
].id
].start
;
266 len
= strlen(get_ngram()[st
[suggestions
[i
].id
].id
]);
// Convert the span with utf8_pos() so it can be compared with mytest->pos/len.
267 int utf8_from
,utf8_len
;
268 utf8_from
= utf8_pos(from
);
269 utf8_len
= utf8_pos(from
+len
)-utf8_from
;
270 nn
= mytest
->items
.size();
// Look for an error item whose position and length equal this span.
271 for (ii
= 0;ii
< nn
;ii
++)
272 if (mytest
->error
.find(ii
) != mytest
->error
.end() &&
273 utf8_from
== mytest
->pos
[ii
] &&
274 utf8_len
== mytest
->len
[ii
])
277 mytest
->positives
++;
// Trace output: the suggestion's span followed by every item's span.
279 (*os) << "#Syllable " << utf8_from << "-" << utf8_len
280 << "(" << from << "-" << len << ")" << endl;
281 for (ii = 0;ii < nn;ii ++)
282 (*os) << "# " << mytest->pos[ii] << "-" << mytest->len[ii] << endl;
// Collect the candidate replacements for this syllable into `candidates`.
288 vector<string> candidates;
291 get_syllable_candidates(get_ngram()[st[suggestions[i].id].id],c);
292 c.get_list(candidates);
296 mytest
->syllable_checked
= true;
// MyText::ui_word_check: walks the current word suggestions and compares
// each one against the items of the current test (mytest).
// For every suggestion it computes the span (from, len) in the internal text
// from the word's syllable positions, converts it with utf8_pos(), and tests
// it against the test items marked as errors; here the suggestion span must
// contain the item span (the syllable check requires equality instead).
// It then rebuilds the covered sentence text with the item replaced by its
// candidates[0] and compares it with word_to_utf8() of the suggestion.
// Sets mytest->word_checked to true.
// NOTE(review): the return statement, the computation of pos2 and the lines
// between the conditions and the counter increments are missing from this
// listing.
300 bool MyText::ui_word_check()
302 unsigned i
,n
= suggestions
.size();
306 for (i
= 0;i
< n
;i
++) {
// Syllable count and first-syllable position of the suggested word.
308 count
= seg
[suggestions
[i
].id
].node
->get_syllable_count();
309 pos
= (*seg
.we
)[seg
[suggestions
[i
].id
].id
].pos
;
// Span runs from the start of syllable `pos` to the end of syllable `pos2`.
312 from
= st
[pos
].start
;
313 len
= st
[pos2
].start
+strlen(get_ngram()[st
[pos2
].id
])-from
;
314 int utf8_from
,utf8_len
;
315 utf8_from
= utf8_pos(from
);
316 utf8_len
= utf8_pos(from
+len
)-utf8_from
;
317 nn
= mytest
->items
.size();
// Look for an error item that lies entirely inside the suggestion's span.
318 for (ii
= 0;ii
< nn
;ii
++)
319 if (mytest
->error
.find(ii
) != mytest
->error
.end() &&
320 utf8_from
<= mytest
->pos
[ii
] &&
321 utf8_from
+ utf8_len
>= mytest
->pos
[ii
] + mytest
->len
[ii
])
324 mytest
->positives
++;
326 (*os) << "#Word " << utf8_from << "-" << utf8_len
327 << "(" << from << "-" << len << ")"
// Take the sentence text under the span and substitute candidates[0] for the
// item, giving the text to compare with the suggested word.
333 string s
= mytest
->sentence
.substr(utf8_from
,utf8_len
);
334 s
.replace(mytest
->pos
[ii
]-utf8_from
,mytest
->len
[ii
],mytest
->items
[ii
].candidates
[0]);
336 if (s
== word_to_utf8(suggestions
[i
].id
).c_str())
339 mytest
->candidates
++;
340 (*os
) << "# " << mytest
->sentence
<< endl
;
341 (*os
) << boost::format("#Word2 %d-%d(%d-%d) %s-%s (%s)" ) %
346 mytest
->items
[ii
].candidates
[0] %
347 word_to_utf8(suggestions
[i
].id
) %
352 mytest
->word_checked
= true;
// MyText::word_check: calls the base-class Text::word_check() and keeps its
// result in `ret`.
// NOTE(review): the rest of the body, including the return, is missing from
// this listing.
356 bool MyText::word_check()
358 bool ret
= Text::word_check();
// MyText::syllable_check: calls the base-class Text::syllable_check() and
// keeps its result in `ret`.
// NOTE(review): the rest of the body, including the return, is missing from
// this listing.
362 bool MyText::syllable_check()
364 bool ret
= Text::syllable_check();
// MyText::word_to_utf8: builds a string for the word of segment `seg_id`.
//   seg_id - index into `seg` of the word to convert.
// It fetches the word's syllables, parses each one from its n-gram string and
// appends viet_to_utf8() of the parsed syllable's string form to `s`.
// NOTE(review): the body continues past the end of this listing; any
// separator between syllables and the return statement are not visible.
368 string
MyText::word_to_utf8(unsigned seg_id
)
372 seg
[seg_id
].node
->get_syllables(sylls
);
373 int i
,n
= sylls
.size();
374 for (i
= 0;i
< n
;i
++) {
378 syll
.parse(get_ngram()[sylls
[i
]]);
379 s
+= viet_to_utf8(syll
.to_str().c_str());