fix minitoc/heading error
[vspell.git] / libvspell / propername.cpp
blobdb510f26a27aa369a9dcce8cc12bad19426b4f4c
1 #include "propername.h" // -*- tab-width: 2 -*-
2 #include "dictionary.h"
3 #include "syllable.h"
4 #include <libsrilm/File.h>
5 #include <set>
7 //using namespace Dictionary;
8 using namespace std;
10 bool find_capital_words(Sentence &st,int start,int &pos,int &len);
12 static set<strid> propernames;
14 bool proper_name_init()
16 File ifs("pname","rt");
18 if (ifs.error())
19 return false;
21 char *line;
22 while ((line = ifs.getline()) != NULL) {
23 while (line[strlen(line)-1] == '\n' || line[strlen(line)-1] == '\r' || line[strlen(line)-1] == ' ')
24 line[strlen(line)-1] = 0;
25 propernames.insert(get_sarch()[get_std_syllable(line)]);
27 return true;
31 void mark_proper_name(Sentence &st)
33 // find all possibilities:
34 // a proper name is a name consisting of words,
35 // which are started with a capitalized character.
36 // e.g. Cong hoa Xa hoi Chu nghia Viet Nam
37 // hmm.. these should be treated like other words for spelling checking.
39 // individual name is easier: Nguyen Thai Ngoc Duy
40 // these are usually started with common last name like Nguyen, Tran, ...
41 // there are 2-5 words in a name.
43 // there are 2 approaches:
44 // one tries to find contiguous capitalized word then check for proper name
45 // other tries to find last name first and find the rest.
46 // which one is better?
48 // if one started with a capital word, then we should check the rest with
49 // our proper name list.
51 // here i choose the first approach so that i can check for other proper
52 // names in addition to personal names.
54 int pos = 0,len = 0,npos,nlen;
55 while (1) {
56 if (!find_capital_words(st,pos+len,npos,nlen))
57 break;
59 // check for last name
60 if (last_names.find(st[npos].id) != last_names.end()) {
61 // mark as a personal name.
62 st[npos].span = nlen; // skip next nlen syllables.
63 st[npos].cid = get_sarch()["<propername>"];//proper_name_id;
66 // check for other proper names
67 pos = npos;
68 len = nlen;
73 void mark_proper_name(const Sentence &sent,set<WordEntry> &we)
75 int i,n;
76 int start = -1;
77 n = sent.get_syllable_count();
79 for (i = 0;i < n;i ++) {
80 if (propernames.find(sent[i].get_cid()) != propernames.end()) {
81 if (start == -1)
82 start = i;
83 } else {
84 if (start != -1) {
85 if (i-start > 1) {
86 WordEntry e;
87 e.pos = start;
88 e.len = i-start;
89 e.fuzid = 0;
90 e.node = get_special_node(PROPER_NAME_ID);
91 we.insert(e);
93 start = -1;
98 // i == n at the moment
99 if (start != -1) {
100 WordEntry e;
101 e.pos = start;
102 e.len = i-start;
103 e.fuzid = 0;
104 e.node = get_special_node(PROPER_NAME_ID);
105 we.insert(e);
106 start = -1;
110 bool find_capital_words(Sentence &st,int start,int &pos,int &len)
112 int i, sz = st.get_syllable_count();
113 bool found = false;
114 string sent = st.get();
115 for (i = start;i < sz;i ++) {
116 if (viet_isupper(sent[st[i].start])) {
117 if (!found) {
118 found = true;
119 pos = i;
120 len = 1;
121 } else {
122 len ++;
124 } else {
125 if (found)
126 return true;
129 return found;
132 bool is_first_capitalized_word(const char *input)
134 uint i,n = strlen(input);
135 // check for upper-case chars
136 for (i = n-1;i >= 0;i --)
137 if (viet_toupper(input[i]) == input[i])
138 break;
139 return i == 0;
142 bool is_all_capitalized_word(const char *input)
144 uint i,n = strlen(input);
145 // check for upper-case chars
146 for (i = n-1;i >= 0;i --)
147 if (viet_toupper(input[i]) != input[i])
148 return false;
149 return true;
152 bool is_lower_cased_word(const char *input)
154 uint i,n = strlen(input);
155 // check for upper-case chars
156 for (i = 0;i < n;i ++)
157 if (viet_toupper(input[i]) == input[i])
158 return false;
159 return true;
162 bool is_first_capitalized_word(const strid_string &s)
164 if (s.empty())
165 return true;
167 uint i,n = s.size();
169 if (is_first_capitalized_word(sarch[s[0]])) {
170 for (i = 1;i < n;i ++)
171 if (!is_lower_cased_word(sarch[s[i]]))
172 return false;
173 return true;
176 return false;
179 bool is_all_capitalized_word(const strid_string &s)
181 if (s.empty())
182 return true;
184 uint i,n = s.size();
186 if (is_all_capitalized_word(sarch[s[0]])) {
187 for (i = 1;i < n;i ++)
188 if (!is_all_capitalized_word(sarch[s[i]]))
189 return false;
190 return true;
193 return false;
196 bool is_lower_cased_word(const strid_string &s)
198 if (s.empty())
199 return true;
201 uint i,n = s.size();
203 if (is_lower_cased_word(sarch[s[0]])) {
204 for (i = 1;i < n;i ++)
205 if (!is_lower_cased_word(sarch[s[i]]))
206 return false;
207 return true;
210 return false;