third_party/hunspell/src/parsers/textparser.cxx

   1 #include <cstdlib>
   2 #include <cstring>
   3 #include <cstdio>
   4 #include <ctype.h>
   5
   6 #include "../hunspell/csutil.hxx"
   7 #include "textparser.hxx"
   8
   9 #ifndef W32
  10 using namespace std;
  11 #endif
  12
  // HTML named character entities from the ISO-8859-1 range that the parser
  // accepts as part of a word. get_latin1() tests them in table order and
  // returns the first entry that is a prefix of the text at the cursor.
  static const char * LATIN1[] = {
      // upper-case forms
      "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",
      "&Egrave;", "&Ecirc;",  "&Igrave;", "&Iuml;",
      "&ETH;",    "&Ntilde;", "&Ograve;", "&Oslash;",
      "&Ugrave;", "&THORN;",
      // lower-case forms
      "&agrave;", "&atilde;", "&aring;",  "&aelig;",
      "&egrave;", "&ecirc;",  "&igrave;", "&iuml;",
      "&eth;",    "&ntilde;", "&ograve;", "&oslash;",
      "&ugrave;", "&thorn;",
      "&yuml;"
  };

  // Number of entries in the LATIN1 table.
  #define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))
  48
  49 TextParser::TextParser() {
  50         init((char *) NULL);
  51 }
  52
  53 TextParser::TextParser(const char * wordchars)
  54 {
  55         init(wordchars);
  56 }
  57
  58 TextParser::TextParser(unsigned short * wordchars, int len)
  59 {
  60         init(wordchars, len);
  61 }
  62
  63 TextParser::~TextParser()
  64 {
  65 }
  66
  67 int TextParser::is_wordchar(char * w)
  68 {
  69         if (*w == '\0') return 0;
  70         if (utf8) {
  71                 w_char wc;
  72                 unsigned short idx;
  73                 u8_u16(&wc, 1, w);
  74                 idx = (wc.h << 8) + wc.l;
  75                 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));
  76         } else {
  77                 return wordcharacters[(*w + 256) % 256];
  78         }
  79 }
  80
  81 const char * TextParser::get_latin1(char * s)
  82 {
  83         if (s[0] == '&') {
  84                 unsigned int i = 0;
  85                 while ((i < LATIN1_LEN) &&
  86                         strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
  87                 if (i != LATIN1_LEN) return LATIN1[i];
  88         }
  89         return NULL;
  90 }
  91
  92 void TextParser::init(const char * wordchars)
  93 {
  94         for (int i = 0; i < MAXPREVLINE; i++) {
  95                 line[i][0] = '\0';
  96         }
  97         actual = 0;
  98         head = 0;
  99         token = 0;
 100         state = 0;
 101         utf8 = 0;
 102         checkurl = 0;
 103         unsigned int j;
 104         for (j = 0; j < 256; j++) {
 105                 wordcharacters[j] = 0;
 106         }
 107         if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
 108         for (j = 0; j < strlen(wordchars); j++) {
 109                 wordcharacters[(wordchars[j] + 256) % 256] = 1;
 110         }
 111 }
 112
 113 void TextParser::init(unsigned short * wc, int len)
 114 {
 115         for (int i = 0; i < MAXPREVLINE; i++) {
 116                 line[i][0] = '\0';
 117         }
 118         actual = 0;
 119         head = 0;
 120         token = 0;
 121         state = 0;
 122         utf8 = 1;
 123         checkurl = 0;
 124         wordchars_utf16 = wc;
 125         wclen = len;
 126 }
 127
 128 int TextParser::next_char(char * line, int * pos) {
 129         if (*(line + *pos) == '\0') return 1;
 130         if (utf8) {
 131             if (*(line + *pos) >> 7) {
 132                 // jump to next UTF-8 character
 133                 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
 134             } else {
 135                 (*pos)++;
 136             }
 137         } else (*pos)++;
 138         return 0;
 139 }
 140
 141 void TextParser::put_line(char * word)
 142 {
 143         actual = (actual + 1) % MAXPREVLINE;
 144         strcpy(line[actual], word);
 145         token = 0;
 146         head = 0;
 147         check_urls();
 148 }
 149
 150 char * TextParser::get_prevline(int n)
 151 {
 152         return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
 153 }
 154
 155 char * TextParser::get_line()
 156 {
 157         return get_prevline(0);
 158 }
 159
 160 char * TextParser::next_token()
 161 {
 162         const char * latin1;
 163
 164         for (;;) {
 165                 switch (state)
 166                 {
 167                 case 0: // non word chars
 168                         if (is_wordchar(line[actual] + head)) {
 169                                 state = 1;
 170                                 token = head;
 171                         } else if ((latin1 = get_latin1(line[actual] + head))) {
 172                                 state = 1;
 173                                 token = head;
 174                                 head += strlen(latin1);
 175                         }
 176                         break;
 177                 case 1: // wordchar
 178                         if ((latin1 = get_latin1(line[actual] + head))) {
 179                                 head += strlen(latin1);
 180                         } else if (! is_wordchar(line[actual] + head)) {
 181                                 state = 0;
 182                                 char * t = alloc_token(token, &head);
 183                                 if (t) return t;
 184                         }
 185                         break;
 186                 }
 187                 if (next_char(line[actual], &head)) return NULL;
 188         }
 189 }
 190
 191 int TextParser::get_tokenpos()
 192 {
 193         return token;
 194 }
 195
 196 int TextParser::change_token(const char * word)
 197 {
 198         if (word) {
 199                 char * r = mystrdup(line[actual] + head);
 200                 strcpy(line[actual] + token, word);
 201                 strcat(line[actual], r);
 202                 head = token;
 203                 free(r);
 204                 return 1;
 205         }
 206         return 0;
 207 }
 208
 209 void TextParser::check_urls()
 210 {
 211         int url_state = 0;
 212         int url_head = 0;
 213         int url_token = 0;
 214         int url = 0;
 215         for (;;) {
 216                 switch (url_state)
 217                 {
 218                 case 0: // non word chars
 219                         if (is_wordchar(line[actual] + url_head)) {
 220                                 url_state = 1;
 221                                 url_token = url_head;
 222                         // Unix path
 223                         } else if (*(line[actual] + url_head) == '/') {
 224                                 url_state = 1;
 225                                 url_token = url_head;
 226                                 url = 1;
 227                         }
 228                         break;
 229                 case 1: // wordchar
 230                         char ch = *(line[actual] + url_head);
 231                         // e-mail address
 232                         if ((ch == '@') ||
 233                             // MS-DOS, Windows path
 234                             (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
 235                             // URL
 236                             (strncmp(line[actual] + url_head, "://", 3) == 0)) {
 237                                 url = 1;
 238                         } else if (! (is_wordchar(line[actual] + url_head) ||
 239                           (ch == '-') || (ch == '_') || (ch == '\\') ||
 240                           (ch == '.') || (ch == ':') || (ch == '/') ||
 241                           (ch == '~') || (ch == '%') || (ch == '*') ||
 242                           (ch == '$') || (ch == '[') || (ch == ']') ||
 243                           (ch == '?') || (ch == '!') ||
 244                           ((ch >= '0') && (ch <= '9')))) {
 245                                 url_state = 0;
 246                                 if (url == 1) {
 247                                         for (int i = url_token; i < url_head; i++) {
 248                                                 *(urlline + i) = 1;
 249                                         }
 250                                 }
 251                                 url = 0;
 252                         }
 253                         break;
 254                 }
 255                 *(urlline + url_head) = 0;
 256                 if (next_char(line[actual], &url_head)) return;
 257         }
 258 }
 259
 260 int TextParser::get_url(int token_pos, int * head)
 261 {
 262         for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
 263         return checkurl ? 0 : urlline[token_pos];
 264 }
 265
 266 void TextParser::set_url_checking(int check)
 267 {
 268         checkurl = check;
 269 }
 270
 271
 272 char * TextParser::alloc_token(int token, int * head)
 273 {
 274     if (get_url(token, head)) return NULL;
 275     char * t = (char *) malloc(*head - token + 1);
 276     if (t) {
 277         t[*head - token] = '\0';
 278         strncpy(t, line[actual] + token, *head - token);
 279         // remove colon for Finnish and Swedish language
 280         if (t[*head - token - 1] == ':') {
 281             t[*head - token - 1] = '\0';
 282             if (!t[0]) {
 283                 free(t);
 284                 return NULL;
 285             }
 286         }
 287         return t;
 288     }
 289     fprintf(stderr,"Error - Insufficient Memory\n");
 290     return NULL;
 291 }