1 #include "sentence.h" // -*- tab-width: 2 -*-
5 #include <boost/format.hpp>
// Splits the text in _input into pieces that are appended to output.
// NOTE(review): only some of this function's lines are visible here (the
// declarations of `tokens` and `str` and several braces are not), so the
// comments below describe the visible statements only.
8 void sentences_split(const string
&_input
,vector
<string
> &output
)
// Tokenize the raw input with the global ::tokenize helper.
11 ::tokenize(_input
,tokens
);
13 int i
,n
= tokens
.size();
// Walk over every token produced above.
18 for (i
= 0;i
< n
;i
++) {
19 if (tokens
[i
].is_token
) {
20 int jj
,nn
= tokens
[i
].value
.size();
// A one-character token that is one of ?!()[];:., is treated specially.
21 if (nn
== 1 && strchr("?!()[];:.,",tokens
[i
].value
[0]))
// The text accumulated in str is emitted as one output entry.
// NOTE(review): presumably this happens only for the punctuation case above;
// the lines in between are not visible -- confirm.
24 output
.push_back(str
);
// Append the current token's text to the accumulator.
30 str
+= tokens
[i
].value
;
// Emit the accumulated text.
// NOTE(review): presumably the trailing remainder after the loop -- confirm.
34 output
.push_back(str
);
37 // candidates are ? ! .
// NOTE(review): the statements below look like a separate scan-based variant
// that works on a copy of _input; whether they are compiled is not visible here.
39 string input = _input;
45 while (run && !input.empty()) {
// Emit everything up to and including position npos, then drop it from input.
47 output.push_back(input.substr(0,npos+1));
48 input.erase(0,npos+1);
// Find the next candidate terminator ('?', '!' or '.') at or after pos.
53 npos = input.find_first_of("?!.",pos);
54 if (npos == string::npos) break;
// The terminator must be followed by a space ...
58 if (!(npos + 1 < len)) break;
59 if (input[npos+1] != ' ') continue;
// ... and then by a character for which viet_isupper() is true.
61 if (!(npos + 2 < len)) break;
62 if (viet_isupper(input[npos+2])) { split = true; continue; }
// Emit whatever is left in input.
68 output.push_back(input);
72 void Sentence::standardize()
77 Split punctuation off the token
79 \param ret a sequence of tokens
81 Here is summary from SATZ tokenize.l:
82 LN = letters and numbers
83 LNS = letters and numbers and some: .:'$%-\/& and 0x7F
85 SC = single characters: LN + #_;!?@*+=~|^&,:$%\ 0x7F ( ) [ ] { } < > "
86 WS = white space (space tab new line)
88 INV = invisible (out of 32-127)
92 OPEN_SINGLE_QUOTE [\`]
93 CLOSE_SINGLE_QUOTE [\']
94 RIGHT_PAREN [\"\)\]\}\>\']
96 <p> <s> </p> </s> --> do nothing (**end**)
97 SENTENCE_FINAL+RIGHT_PAREN* .) ?) !) ?" . ? !
99 OPEN_SINGLE_QUOTE+ ` `` ```
100 CLOSE_SINGLE_QUOTE+ ' '' '''
101 LNS+LN all end with a letter or a number
103 SC --> token. c (fallback)
104 WS|NL --> ignore. should be replaced
107 Should we use flex or hand code?
108 Flex is less error-prone, but it's hard to specify Vietnamese letters.
109 Hand code is all right, but hard to extend later.
111 Choose flex for i'm lazy ;)
// Scans s for punctuation characters and appends pieces of s to ret.
// \param s   the string to scan
// \param ret receives the pieces (text before a punctuation mark, and the mark itself)
// NOTE(review): the bodies of several branches and the updates of pos/start
// are not visible here; comments describe only the visible statements.
114 void Sentence::tokenize_punctuation(const string
&s
,vector
<string
> &ret
)
116 unsigned int pos
= 0,start
= 0;
118 unsigned int len
= s
.size();
119 while (start
< len
) {
// Locate the next punctuation character at or after pos.
121 npos
= s
.find_first_of("!#()'\";:.,?/",pos
);
122 if (npos
== string::npos
)
125 if (npos
< len
) { // TODO: some checks here
// A '.' or ',' directly followed by a digit is not split off.
127 if ((s
[npos
] == '.' || s
[npos
] == ',') &&
128 (npos
+1 < len
&& s
[npos
+1] >= '0' && s
[npos
+1] <= '9'))
129 continue; // skip the dot/comma
// A '/' directly followed by a digit is checked the same way.
// NOTE(review): the action taken for this case is not visible here.
132 if ((s
[npos
] == '/') &&
133 (npos
+1 < len
&& s
[npos
+1] >= '0' && s
[npos
+1] <= '9'))
136 // only split dot when it's in the end.
137 if (s
[npos
] == '.' && npos
+1 != len
)
// Emit the text between start and the punctuation mark ...
143 ret
.push_back(s
.substr(start
,npos
-start
));
// ... followed by the punctuation mark itself as a one-character piece.
145 ret
.push_back(s
.substr(npos
,1));
151 Convert a string to a sequence of tokens
// Tokenizes the member sent_ and fills syllables with one entry per accepted token.
// NOTE(review): the declarations of `tokens` and `sy`, the loop exits and the
// closing braces are not visible here; comments describe the visible statements only.
154 void Sentence::tokenize()
// Break sent_ into tokens with the global ::tokenize helper.
157 ::tokenize(sent_
,tokens
);
159 int i
,n
= tokens
.size();
166 for (i
= 0;i
< n
;i
++) {
167 if (tokens
[i
].is_token
) {
// NOTE(review): the next four statements look like an alternative way of
// filling sy (looking up the raw value and the value prefixed with "6");
// whether they are compiled is not visible here.
169 char *viet_token = viet_to_viscii(tokens[i].value.c_str());
171 sy.id = get_sarch()[tokens[i].value];
172 sy.cid = get_sarch()[string("6")+tokens[i].value];
173 syllables.push_back(sy);
176 const char *viet_token
= tokens
[i
].value
.c_str();
177 int jj
,nn
= strlen(viet_token
);
// Look for a character that viet_isalpha() or viet_isdigit() accepts.
178 for (jj
= 0;jj
< nn
;jj
++)
179 if (viet_isalpha(viet_token
[jj
]) || viet_isdigit(viet_token
[jj
])) {
180 string s
= viet_token
;
// id is looked up from the token text as-is.
181 sy
.id
= get_sarch()[s
];
// cid is looked up from the result of get_std_syllable() on the same text.
182 sy
.cid
= get_sarch()[get_std_syllable(s
)];
183 syllables
.push_back(sy
);
// Advance the running start offset by the length of this token's text.
188 sy
.start
+= tokens
[i
].value
.size();
// Stream output for a Sentence: writes one formatted entry per syllable to os.
// NOTE(review): the use of `cc`, any separators and the return statement are
// not visible here.
197 ostream
& operator <<(ostream
&os
, const Sentence
&st
)
199 int cc
,i
,n
= st
.get_syllable_count();
200 for (cc
= i
= 0;i
< n
;i
++) {
// Each entry is "text(id-cid[cid text])": the get_sarch() lookup of id,
// the raw id and cid values, then the get_sarch() lookup of cid.
202 os
<< boost::format("%s(%d-%d[%s])") % get_sarch()[st
[i
].id
] % st
[i
].id
% st
[i
].cid
% get_sarch()[st
[i
].cid
];
204 //os << st.prob << endl;
210 std::string& Sentence::const_iterator::operator++()
214 std::string Sentence::const_iterator::operator++(int)
// Writes this segmentation of sentence st to os.
// NOTE(review): the declarations of `i`, `n` and `sarch`, any separators
// between items and the return statement are not visible here.
220 std::ostream
& Segmentation::pretty_print(std::ostream
&os
,const Sentence
&st
)
// Outer loop: one iteration per element of this segmentation.
224 for (i
= 0;i
< n
;i
++) {
// nn is the len field of the current element.
227 int ii
,nn
= (*this)[i
].len
;
// Inner loop: visit the len consecutive sentence positions starting at pos.
228 for (ii
= 0;ii
< nn
;ii
++) {
// Print the sarch lookup of the id stored at that sentence position.
231 os
<< sarch
[st
[(*this)[i
].pos
+ii
].get_id()];