1 #include "propername.h" // -*- tab-width: 2 -*-
2 #include "dictionary.h"
4 #include <libsrilm/File.h>
7 //using namespace Dictionary;
// Forward declaration of find_capital_words(), whose definition appears
// further down in this file.
//   st    - sentence to scan
//   start - syllable index at which the scan begins
//   pos, len - passed by reference; mark_proper_name(Sentence&) hands in
//              uninitialised variables and reads them after the call, so
//              they appear to be outputs (TODO confirm against the full body).
10 bool find_capital_words(Sentence
&st
,int start
,int &pos
,int &len
);
// File-local set of strid values. proper_name_init() inserts into it and
// mark_proper_name(const Sentence&, set<WordEntry>&) looks up syllable
// cids in it.
12 static set
<strid
> propernames
;
// Fills `propernames` from the file "pname" (opened in "rt" mode).
// Each line read is stripped of trailing '\n', '\r' and ' ' characters,
// passed through get_std_syllable(), looked up in get_sarch(), and the
// resulting value is inserted into `propernames`.
// NOTE(review): the return statements, the declaration of `line` and any
// error check on the File object are not visible in this excerpt.
14 bool proper_name_init()
16 File
ifs("pname","rt");
// Read until getline() yields NULL.
22 while ((line
= ifs
.getline()) != NULL
) {
// Strip trailing newline, carriage return and space characters in place.
// NOTE(review): strlen(line)-1 is computed in size_t; for an empty line
// (or one that becomes empty after stripping) it wraps around and the
// index falls outside the buffer. A length check before indexing looks
// necessary here - confirm and fix in the full source.
23 while (line
[strlen(line
)-1] == '\n' || line
[strlen(line
)-1] == '\r' || line
[strlen(line
)-1] == ' ')
24 line
[strlen(line
)-1] = 0;
// Register the normalised syllable's id as a proper name.
25 propernames
.insert(get_sarch()[get_std_syllable(line
)]);
// Marks personal names inside `st`.
// Visible logic: find_capital_words() is asked for a run of capitalised
// words starting at syllable pos+len; if the id of the syllable at npos is
// found in `last_names`, that syllable's span is set to nlen and its cid is
// set to the id get_sarch() returns for "<propername>".
// NOTE(review): the enclosing loop, the handling of a false return from
// find_capital_words() and the declaration of `last_names` are not visible
// in this excerpt.
31 void mark_proper_name(Sentence &st)
33 // Find all possibilities:
34 // a proper name is a name consisting of words,
35 // each of which starts with a capital letter,
36 // e.g. Cong hoa Xa hoi Chu nghia Viet Nam.
37 // Hmm... these should be treated like other words for spelling checking.
39 // Individual (personal) names are easier: Nguyen Thai Ngoc Duy.
40 // These usually start with a common last name such as Nguyen, Tran, ...
41 // There are 2-5 words in a name.
43 // There are two approaches:
44 // one finds contiguous capitalized words and then checks for a proper name;
45 // the other finds a last name first and then looks for the rest.
46 // Which one is better?
48 // If a run starts with a capitalized word, the rest should be checked
49 // against our proper name list.
51 // The first approach is chosen here so that other proper
52 // names can be checked in addition to personal names.
// npos/nlen are declared without initialisers; they only hold meaningful
// values after find_capital_words() has written to them.
54 int pos = 0,len = 0,npos,nlen;
56 if (!find_capital_words(st,pos+len,npos,nlen))
59 // check whether the first word of the run is a known last name
60 if (last_names.find(st[npos].id) != last_names.end()) {
61 // mark as a personal name.
62 st[npos].span = nlen; // skip next nlen syllables.
63 st[npos].cid = get_sarch()["<propername>"];//proper_name_id;
66 // check for other proper names
// Overload that works on a const Sentence and a set<WordEntry>.
// Visible logic: walks every syllable of `sent`, tests whether its cid is
// present in `propernames`, and in two places assigns
// get_special_node(PROPER_NAME_ID) to e.node.
// NOTE(review): presumably the entries `e` are inserted into `we`, but the
// declarations of n, i and e and the insertion itself are not visible in
// this excerpt - TODO confirm.
73 void mark_proper_name(const Sentence
&sent
,set
<WordEntry
> &we
)
77 n
= sent
.get_syllable_count();
79 for (i
= 0;i
< n
;i
++) {
// Is this syllable's cid one of the ids loaded by proper_name_init()?
80 if (propernames
.find(sent
[i
].get_cid()) != propernames
.end()) {
90 e
.node
= get_special_node(PROPER_NAME_ID
);
98 // i == n at the moment
// Same node assignment again; from the comment above this appears to run
// after the loop has finished (surrounding code not visible).
104 e
.node
= get_special_node(PROPER_NAME_ID
);
// Scans the syllables of `st` from index `start` up to the syllable count,
// testing with viet_isupper() the character of the sentence text located
// at each syllable's `start` offset.
// NOTE(review): what is written to pos/len and what is returned is not
// visible in this excerpt.
110 bool find_capital_words(Sentence
&st
,int start
,int &pos
,int &len
)
112 int i
, sz
= st
.get_syllable_count();
// Local string holding the result of st.get().
114 string sent
= st
.get();
115 for (i
= start
;i
< sz
;i
++) {
// Only the single character at the syllable's start offset is examined.
116 if (viet_isupper(sent
[st
[i
].start
])) {
// C-string variant: scans `input` from its last character towards the
// first, looking for a character that viet_toupper() leaves unchanged.
// NOTE(review): the statements that follow the `if` (and therefore the
// returned value) are not visible in this excerpt.
132 bool is_first_capitalized_word(const char *input
)
134 uint i
,n
= strlen(input
);
135 // check for upper-case chars
// NOTE(review): if `uint` is an unsigned type (as the name suggests),
// `i >= 0` can never be false, so the loop only stops through its body;
// when no character matches, i wraps around past 0 and input[i] is read
// out of bounds. For an empty string n-1 wraps as well. Confirm and fix
// in the full source (e.g. count down with `i-- > 0`).
136 for (i
= n
-1;i
>= 0;i
--)
137 if (viet_toupper(input
[i
]) == input
[i
])
// C-string variant: scans `input` from its last character towards the
// first, looking for a character that viet_toupper() would change.
// NOTE(review): the statements that follow the `if` (and therefore the
// returned value) are not visible in this excerpt.
142 bool is_all_capitalized_word(const char *input
)
144 uint i
,n
= strlen(input
);
145 // look for a character that is not already upper-case
// NOTE(review): if `uint` is an unsigned type (as the name suggests),
// `i >= 0` can never be false; when every character passes the test, i
// wraps around past 0 and input[i] is read out of bounds. For an empty
// string n-1 wraps as well. Confirm and fix in the full source.
146 for (i
= n
-1;i
>= 0;i
--)
147 if (viet_toupper(input
[i
]) != input
[i
])
// C-string variant: scans `input` from the first character to the last,
// looking for a character that viet_toupper() leaves unchanged.
// NOTE(review): the statements that follow the `if` (and therefore the
// returned value) are not visible in this excerpt. Whether caseless
// characters such as digits also satisfy the test depends on
// viet_toupper() - TODO confirm.
152 bool is_lower_cased_word(const char *input
)
154 uint i
,n
= strlen(input
);
155 // check for upper-case chars
156 for (i
= 0;i
< n
;i
++)
157 if (viet_toupper(input
[i
]) == input
[i
])
// strid_string variant: the first element of `s` is resolved through
// `sarch` and tested with is_first_capitalized_word(); every following
// element (indices 1..n-1) is tested with is_lower_cased_word().
// NOTE(review): the declarations of i, n and sarch and the return
// statements are not visible in this excerpt. s[0] is read before any
// visible emptiness check - confirm callers never pass an empty string.
162 bool is_first_capitalized_word(const strid_string
&s
)
169 if (is_first_capitalized_word(sarch
[s
[0]])) {
// Look for a later element that is not lower-cased.
170 for (i
= 1;i
< n
;i
++)
171 if (!is_lower_cased_word(sarch
[s
[i
]]))
// strid_string variant: the first element of `s` is resolved through
// `sarch` and tested with is_all_capitalized_word(); every following
// element (indices 1..n-1) gets the same test.
// NOTE(review): the declarations of i, n and sarch and the return
// statements are not visible in this excerpt. s[0] is read before any
// visible emptiness check - confirm callers never pass an empty string.
179 bool is_all_capitalized_word(const strid_string
&s
)
186 if (is_all_capitalized_word(sarch
[s
[0]])) {
// Look for a later element that is not fully capitalized.
187 for (i
= 1;i
< n
;i
++)
188 if (!is_all_capitalized_word(sarch
[s
[i
]]))
// strid_string variant: the first element of `s` is resolved through
// `sarch` and tested with is_lower_cased_word(); every following element
// (indices 1..n-1) gets the same test.
// NOTE(review): the declarations of i, n and sarch and the return
// statements are not visible in this excerpt. s[0] is read before any
// visible emptiness check - confirm callers never pass an empty string.
196 bool is_lower_cased_word(const strid_string
&s
)
203 if (is_lower_cased_word(sarch
[s
[0]])) {
// Look for a later element that is not lower-cased.
204 for (i
= 1;i
< n
;i
++)
205 if (!is_lower_cased_word(sarch
[s
[i
]]))