Sentence::tokenize() now uses flex-based ::tokenize()
[vspell.git] / libvspell / spell.cpp
1 #include "config.h" // -*- tab-width: 2 -*-
2 #include <iterator>
3 #include <algorithm>
4 #include <sstream>
5 #include <iostream>
6 #include <fstream>
7 #include "wfst.h"
8 #include "spell.h"
10 using namespace std;
/*
	The process:
	1. Sentence segmentation. (sentences_split)
	2. Separate "words" by spaces. (tokenize)
	3. Punctuation separation. (tokenize/tokenize_punctuation)
	4. Foreign/abbreviation detection.
	5. Proper-name detection.
	6. Generalization (into classes, e.g. number_class, foreign_class ...).
	   Try to generalize all capitalized words.
	6*. Syllable checking. (check1)
	7. Find all possible (misspelled) words. (**) (get_all_words)
	8. "Pre-separate" the sentence into phrases.
	9. Word segmentation. (**)
	10. Find the best segmentation. (segment_best)
	10*. Word checking. (check2)
*/
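
/*
	A minimal driver sketch tying the steps above to the two checkers defined
	in this file. Illustrative only: sentences_split() and the word-segmentation
	step are assumed from the comment above and their exact signatures are
	hypothetical; only spell_check1()/spell_check2() are defined below.

		vector<Sentence> sentences;
		sentences_split(text, sentences);          // step 1 (hypothetical call)
		for (unsigned i = 0; i < sentences.size(); i++) {
			Sentence &st = sentences[i];
			st.tokenize();                           // steps 2-3
			Suggestions syll_sugg, word_sugg;
			Spell::spell_check1(st, syll_sugg);      // step 6*: syllable checking
			Segmentation seg;
			// ... steps 7-10 produce the best segmentation `seg` ...
			Spell::spell_check2(st, seg, word_sugg); // step 10*: word checking
		}
*/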
namespace Spell {

// Syllable checking (step 6* above): flag every syllable whose id is not
// in the dictionary. Single non-alphabetic characters are treated as
// punctuation instead of being reported.
void spell_check1(Sentence &st,Suggestions &sugg)
{
	int i,n = st.get_syllable_count();
	for (i = 0;i < n;i ++) {
		strid id = st[i].get_cid();
		if (sarch.in_dict(id))
			continue;

		st[i].sid = unk_id;

		VocabString s = sarch[id];
		if (strlen(s) == 1 && !viet_isalpha(s[0])) {
			st[i].sid = sarch["<PUNCT>"];
			continue;
		}

		Suggestion _s;
		_s.id = i;
		sugg.push_back(_s);
	}
}
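
/*
	Example use of spell_check1() (illustrative; the Sentence constructor and
	input handling shown here are assumptions, only tokenize() and
	spell_check1() come from this code base):

		Sentence st(input_line);
		st.tokenize();
		Suggestions sugg;
		spell_check1(st, sugg);
		// each Suggestion's id is the index of a syllable that was not
		// found in the dictionary
*/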

// Word checking (step 10* above): for every multi-syllable word in the
// segmentation, follow its syllables down the word tree starting at the
// root; if the walk falls off the tree the word is unknown and a
// suggestion is recorded. cc tracks the syllable offset of the current
// word inside the sentence.
void spell_check2(Sentence &st,Segmentation &seg,Suggestions &sugg)
{
	int i,n = seg.size();
	int cc = 0;

	for (i = 0;i < n;i ++) {
		vector<strid> sylls;
		int len = seg[i].node->get_syllable_count();
		if (len == 1) {
			cc += len;
			continue;
		}

		int start;
		WordNodePtr node(get_root());
		for (start = 0;start < len && node != NULL; start ++)
			node = node->get_next(st[start+cc].cid);

		cc += len;
		if (node == NULL) {
			Suggestion _s;
			_s.id = i;
			sugg.push_back(_s);