libvspell/sentence.cpp

   1 #include "sentence.h"                                           // -*- tab-width: 2 -*-
   2 #include "spell.h"
   3 #include "syllable.h"
   4 #include "tokenize.h"
   5 #include <boost/format.hpp>
   6 using namespace std;
   7
   8 void sentences_split(const string &_input,vector<string> &output)
   9 {
  10         Tokens tokens;
  11         ::tokenize(_input,tokens);
  12
  13         int i,n = tokens.size();
  14
  15         string str;
  16         bool flush = false;
  17
  18         for (i = 0;i < n;i ++) {
  19                 if (tokens[i].is_token) {
  20                         int jj,nn = tokens[i].value.size();
  21                         if (nn == 1 && strchr("?!()[];:.,",tokens[i].value[0]))
  22                                 flush = true;
  23                         else if (flush) {
  24                                 output.push_back(str);
  25                                 str = "";
  26                                 flush = false;
  27
  28                         }
  29                 }
  30                 str += tokens[i].value;
  31         }
  32
  33         if (!str.empty())
  34                 output.push_back(str);
  35
  36         /*
  37   // candidates are ? ! .
  38   // . is ambiguous
  39   string input = _input;
  40
  41   int npos,pos = 0,len;
  42   bool run = true;
  43   bool split = false;
  44
  45   while (run && !input.empty()) {
  46     if (split) {
  47       output.push_back(input.substr(0,npos+1));
  48       input.erase(0,npos+1);
  49       pos = 0;
  50       split = false;
  51     }
  52
  53     npos = input.find_first_of("?!.",pos);
  54     if (npos == string::npos) break;
  55
  56     len = input.size();
  57
  58     if (!(npos + 1 < len)) break;
  59     if (input[npos+1] != ' ') continue;
  60
  61     if (!(npos + 2 < len)) break;
  62     if (viet_isupper(input[npos+2])) { split = true; continue; }
  63
  64     pos = npos+1;
  65   }
  66
  67   if (!input.empty())
  68     output.push_back(input);
  69         */
  70 }
  71
  72 void Sentence::standardize()
  73 {
  74 }
  75
  76 /**
  77  Split punctuation off the token
  78  \param s input token
  79  \param ret a sequence of tokens
  80
  81  Here is summary from SATZ tokenize.l:
  82  LN = letters and numbers
  83  LNS = letters and numbers and some: .:'$%-\/& and 0x7F
  84  A = apostrophe '
  85  SC = single characters: LN + #_;!?@*+=~|^&,:$%\ 0x7F ( ) [ ] { } < > "
  86  WS = white space (space tab new line)
  87  NL = new line
  88  INV = invisible (out of 32-127)
  89
  90  SENTENCE_FINAL                 [.?!]
  91  HYPHEN                         [\-]
  92  OPEN_SINGLE_QUOTE              [\`]
  93  CLOSE_SINGLE_QUOTE             [\']
  94  RIGHT_PAREN                     [\"\)\]\}\>\']
  95
  96  <p> <s> </p> </s> --> do nothing (**end**)
  97  SENTENCE_FINAL+RIGHT_PAREN*               .) ?) !)  ?" . ? !
  98  HYPHEN+                                   -  -- ---
  99  OPEN_SINGLE_QUOTE+                        `  `` ```
 100  CLOSE_SINGLE_QUOTE+                       '  '' '''
 101  LNS+LN                                    all end with a letter or a number
 102  LN+A                                      he' ll run
 103  SC                --> token.              c (fallback)
 104  WS|NL             --> ignore.             should be replaced
 105  WSNL+             --> token.
 106
 107  Should we use flex or hand code?
 108  Flex is less error-prone, but it's hard to specify Vietnamese letters.
 109  Hand code is all right, but hard to extend later.
 110
 111  Choose flex for i'm lazy ;)
 112 */
 113
 114 void Sentence::tokenize_punctuation(const string &s,vector<string> &ret)
 115 {
 116         unsigned int pos = 0,start = 0;
 117         unsigned int npos;
 118         unsigned int len = s.size();
 119         while (start < len) {
 120                 if (pos < len) {
 121                         npos = s.find_first_of("!#()'\";:.,?/",pos);
 122                         if (npos == string::npos)
 123                                 npos = len;
 124                         pos = npos+1;
 125                         if (npos < len) {                                       // TODO: some checks here
 126                                 // floating point
 127                                 if ((s[npos] == '.' || s[npos] == ',') &&
 128                                                 (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9'))
 129                                         continue;                                                       // skip the dot/comma
 130
 131                                 // date
 132                                 if ((s[npos] == '/') &&
 133                                                 (npos+1 < len && s[npos+1] >= '0' && s[npos+1] <= '9'))
 134                                         continue;
 135
 136                                 // only split dot when it's in the end.
 137                                 if (s[npos] == '.' && npos+1 != len)
 138                                         continue;
 139                         }
 140                 } else
 141                         npos = len;
 142
 143                 ret.push_back(s.substr(start,npos-start));
 144                 if (npos < len)
 145                         ret.push_back(s.substr(npos,1));
 146                 start = npos+1;
 147         }
 148 }
 149
 150 /**
 151  Convert a string to a sequence of token
 152 */
 153
 154 void Sentence::tokenize()
 155 {
 156         Tokens tokens;
 157         ::tokenize(sent_,tokens);
 158
 159         int i,n = tokens.size();
 160
 161         Syllable sy;
 162         sy.span = 1;
 163         sy.sent_ = this;
 164         sy.start = 0;
 165
 166         for (i = 0;i < n;i ++) {
 167                 if (tokens[i].is_token) {
 168                         /*
 169                                 char *viet_token = viet_to_viscii(tokens[i].value.c_str());
 170                                 if (!viet_token) {
 171                                 sy.id = get_sarch()[tokens[i].value];
 172                                 sy.cid = get_sarch()[string("6")+tokens[i].value];
 173                                 syllables.push_back(sy);
 174                                 } else {
 175                         */
 176                         const char *viet_token = tokens[i].value.c_str();
 177                         int jj,nn = strlen(viet_token);
 178                         for (jj = 0;jj < nn;jj ++)
 179                                 if (viet_isalpha(viet_token[jj]) || viet_isdigit(viet_token[jj])) {
 180                                         string s = viet_token;
 181                                         sy.id = get_sarch()[s];
 182                                         sy.cid = get_sarch()[get_std_syllable(s)];
 183                                         syllables.push_back(sy);
 184                                         break;
 185                                 }
 186                         /*}*/
 187                 }
 188                 sy.start += tokens[i].value.size();
 189         }
 190 }
 191
 192
 193 /**
 194          Dump a Sentence
 195 */
 196
 197 ostream& operator <<(ostream &os, const Sentence &st)
 198 {
 199         int cc,i,n = st.get_syllable_count();
 200         for (cc = i = 0;i < n;i ++) {
 201                 if (i) os << " ";
 202                 os << boost::format("%s(%d-%d[%s])") % get_sarch()[st[i].id] % st[i].id % st[i].cid % get_sarch()[st[i].cid];
 203         }
 204         //os << st.prob << endl;
 205
 206         return os;
 207 }
 208
 209 /*
 210         std::string& Sentence::const_iterator::operator++()
 211         {
 212         }
 213
 214         std::string Sentence::const_iterator::operator++(int)
 215         {
 216         }
 217 */
 218
 219
 220 std::ostream& Segmentation::pretty_print(std::ostream &os,const Sentence &st)
 221 {
 222         int i,n = size();
 223         VocabIndex id;
 224         for (i = 0;i < n;i ++) {
 225                 if (i)
 226                         os << " ";
 227                 int ii,nn = (*this)[i].len;
 228                 for (ii = 0;ii < nn;ii ++) {
 229                         if (ii)
 230                                 os << "_";
 231                         os << sarch[st[(*this)[i].pos+ii].get_id()];
 232                 }
 233         }
 234         return os;
 235 }
 236