terrible bug in PenaltyDAG and Penalty2DAG.
[vspell.git] / libvspell / sentence.cpp
blob87d309559d05330aed9c17038b9d48af3574f4af
1 #include "sentence.h" // -*- tab-width: 2 -*-
2 #include "spell.h"
3 #include "syllable.h"
4 #include "tokenize.h"
5 #include <boost/format.hpp>
6 using namespace std;
8 void sentences_split(const string &_input,vector<string> &output)
10 Tokens tokens;
11 ::tokenize(_input,tokens);
13 int i,n = tokens.size();
15 string str;
16 bool flush = false;
18 for (i = 0;i < n;i ++) {
19 if (tokens[i].is_token) {
20 int jj,nn = tokens[i].value.size();
21 if (nn == 1 && strchr("?!()[];:.,",tokens[i].value[0]))
22 flush = true;
23 else if (flush) {
24 output.push_back(str);
25 str = "";
26 flush = false;
30 str += tokens[i].value;
33 if (!str.empty())
34 output.push_back(str);
37 // candidates are ? ! .
38 // . is ambiguous
39 string input = _input;
41 int npos,pos = 0,len;
42 bool run = true;
43 bool split = false;
45 while (run && !input.empty()) {
46 if (split) {
47 output.push_back(input.substr(0,npos+1));
48 input.erase(0,npos+1);
49 pos = 0;
50 split = false;
53 npos = input.find_first_of("?!.",pos);
54 if (npos == string::npos) break;
56 len = input.size();
58 if (!(npos + 1 < len)) break;
59 if (input[npos+1] != ' ') continue;
61 if (!(npos + 2 < len)) break;
62 if (viet_isupper(input[npos+2])) { split = true; continue; }
64 pos = npos+1;
67 if (!input.empty())
68 output.push_back(input);
// NOTE(review): only the signature of Sentence::standardize() is visible
// here; no statements follow it before the next comment block, so the
// function appears to have an empty body -- confirm.
72 void Sentence::standardize()
76 /**
77 Split punctuation off the token
78 \param s input token
79 \param ret a sequence of tokens
81 Here is summary from SATZ tokenize.l:
82 LN = letters and numbers
83 LNS = letters and numbers and some: .:'$%-\/& and 0x7F
84 A = apostrophe '
85 SC = single characters: LN + #_;!?@*+=~|^&,:$%\ 0x7F ( ) [ ] { } < > "
86 WS = white space (space tab new line)
87 NL = new line
88 INV = invisible (out of 32-127)
90 SENTENCE_FINAL [.?!]
91 HYPHEN [\-]
92 OPEN_SINGLE_QUOTE [\`]
93 CLOSE_SINGLE_QUOTE [\']
94 RIGHT_PAREN [\"\)\]\}\>\']
96 <p> <s> </p> </s> --> do nothing (**end**)
97 SENTENCE_FINAL+RIGHT_PAREN* .) ?) !) ?" . ? !
98 HYPHEN+ - -- ---
99 OPEN_SINGLE_QUOTE+ ` `` ```
100 CLOSE_SINGLE_QUOTE+ ' '' '''
101 LNS+LN all end with a letter or a number
102 LN+A he' ll run
103 SC --> token. c (fallback)
104 WS|NL --> ignore. should be replaced
105 WSNL+ --> token.
107 Should we use flex or hand code?
108 Flex is less error-prone, but it's hard to specify Vietnamese letters.
109 Hand code is all right, but hard to extend later.
111 Choose flex for i'm lazy ;)
114 void Sentence::tokenize_punctuation(const string &s,vector<string> &ret)
116 unsigned int pos = 0,start = 0;
117 unsigned int npos;
118 unsigned int len = s.size();
119 while (start < len) {
120 if (pos < len) {
121 npos = s.find_first_of("!#()'\";:.,?/",pos);
122 if (npos == string::npos)
123 npos = len;
124 pos = npos+1;
125 if (npos < len) { // TODO: some checks here
126 // floating point
127 if ((s[npos] == '.' || s[npos] == ',') &&
128 (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9'))
129 continue; // skip the dot/comma
131 // date
132 if ((s[npos] == '/') &&
133 (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9'))
134 continue;
136 // only split dot when it's in the end.
137 if (s[npos] == '.' && npos+1 != len)
138 continue;
140 } else
141 npos = len;
143 ret.push_back(s.substr(start,npos-start));
144 if (npos < len)
145 ret.push_back(s.substr(npos,1));
146 start = npos+1;
151 Convert a string to a sequence of tokens
154 void Sentence::tokenize()
156 Tokens tokens;
157 ::tokenize(sent_,tokens);
159 int i,n = tokens.size();
161 Syllable sy;
162 sy.span = 1;
163 sy.sent_ = this;
164 sy.start = 0;
166 for (i = 0;i < n;i ++) {
167 if (tokens[i].is_token) {
169 char *viet_token = viet_to_viscii(tokens[i].value.c_str());
170 if (!viet_token) {
171 sy.id = get_sarch()[tokens[i].value];
172 sy.cid = get_sarch()[string("6")+tokens[i].value];
173 syllables.push_back(sy);
174 } else {
176 const char *viet_token = tokens[i].value.c_str();
177 int jj,nn = strlen(viet_token);
178 for (jj = 0;jj < nn;jj ++)
179 if (viet_isalpha(viet_token[jj]) || viet_isdigit(viet_token[jj])) {
180 string s = viet_token;
181 sy.id = get_sarch()[s];
182 sy.cid = get_sarch()[get_std_syllable(s)];
183 syllables.push_back(sy);
184 break;
186 /*}*/
188 sy.start += tokens[i].value.size();
194 Dump a Sentence
197 ostream& operator <<(ostream &os, const Sentence &st)
199 int cc,i,n = st.get_syllable_count();
200 for (cc = i = 0;i < n;i ++) {
201 if (i) os << " ";
202 os << boost::format("%s(%d-%d[%s])") % get_sarch()[st[i].id] % st[i].id % st[i].cid % get_sarch()[st[i].cid];
204 //os << st.prob << endl;
206 return os;
// NOTE(review): only the signature of the prefix increment is visible
// here; no statements (in particular no return) follow it. If the body
// really is empty, calling this function is undefined behaviour because
// it is declared to return std::string& -- confirm, then implement or
// remove.
210 std::string& Sentence::const_iterator::operator++()
// NOTE(review): only the signature of the postfix increment is visible
// here; no statements (in particular no return) follow it. If the body
// really is empty, calling this function is undefined behaviour because
// it is declared to return std::string -- confirm, then implement or
// remove.
214 std::string Sentence::const_iterator::operator++(int)
220 std::ostream& Segmentation::pretty_print(std::ostream &os,const Sentence &st)
222 int i,n = size();
223 VocabIndex id;
224 for (i = 0;i < n;i ++) {
225 if (i)
226 os << " ";
227 int ii,nn = (*this)[i].len;
228 for (ii = 0;ii < nn;ii ++) {
229 if (ii)
230 os << "_";
231 os << sarch[st[(*this)[i].pos+ii].get_id()];
234 return os;