third_party/hunspell/src/parsers/htmlparser.cxx

   1 #include <cstdlib>
   2 #include <cstring>
   3 #include <cstdio>
   4 #include <ctype.h>
   5
   6 #include "../hunspell/csutil.hxx"
   7 #include "htmlparser.hxx"
   8
   9
  10 #ifndef W32
  11 using namespace std;
  12 #endif
  13
  14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
  15
  16 static const char * PATTERN[][2] = {
  17         { "<script", "</script>" },
  18         { "<style", "</style>" },
  19         { "<code", "</code>" },
  20         { "<samp", "</samp>" },
  21         { "<kbd", "</kbd>" },
  22         { "<var", "</var>" },
  23         { "<listing", "</listing>" },
  24         { "<address", "</address>" },
  25         { "<pre", "</pre>" },
  26         { "<!--", "-->" },
  27         { "<[cdata[", "]]>" }, // XML comment
  28         { "<", ">" }
  29 };
  30
  31 #define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char *) * 2))
  32
  33 static const char * PATTERN2[][2] = {
  34         { "<img", "alt=" }, // ALT and TITLE attrib handled spec.
  35         { "<img", "title=" },
  36         { "<a ", "title=" }
  37 };
  38
  39 #define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char *) * 2))
  40
  41 HTMLParser::HTMLParser(const char * wordchars)
  42 {
  43         init(wordchars);
  44 }
  45
  46 HTMLParser::HTMLParser(unsigned short * wordchars, int len)
  47 {
  48         init(wordchars, len);
  49 }
  50
  51 HTMLParser::~HTMLParser()
  52 {
  53 }
  54
  55
  56 int HTMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
  57 {
  58         for (unsigned int i = 0; i < len; i++) {
  59                 char * j = line[actual] + head;
  60                 const char * k = p[i][column];
  61                 while ((*k != '\0') && (tolower(*j) == *k)) {
  62                         j++;
  63                         k++;
  64                 }
  65                 if (*k == '\0') return i;
  66         }
  67         return -1;
  68 }
  69
  70 /*
  71  * HTML parser
  72  *
  73  */
  74
  75
  76 char * HTMLParser::next_token()
  77 {
  78         const char * latin1;
  79
  80         for (;;) {
  81                 //fprintf(stderr, "%d:%c:%s\n", state, line[actual][head], line[actual]);
  82                 //getch();
  83                 switch (state)
  84                 {
  85                 case ST_NON_WORD: // non word chars
  86                         prevstate = ST_NON_WORD;
  87                         if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
  88                                 checkattr = 0;
  89                                 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
  90                                         checkattr = 1;
  91                                 }
  92                                 state = ST_TAG;
  93                         } else if (is_wordchar(line[actual] + head)) {
  94                                 state = ST_WORD;
  95                                 token = head;
  96                         } else if ((latin1 = get_latin1(line[actual] + head))) {
  97                                 state = ST_WORD;
  98                                 token = head;
  99                                 head += strlen(latin1);
 100                         } else if (line[actual][head] == '&') {
 101                                 state = ST_CHAR_ENTITY;
 102                         }
 103                         break;
 104                 case ST_WORD: // wordchar
 105                         if ((latin1 = get_latin1(line[actual] + head))) {
 106                                 head += strlen(latin1);
 107                         } else if (! is_wordchar(line[actual] + head)) {
 108                                 state = prevstate;
 109                                 char * t = alloc_token(token, &head);
 110                                 if (t) return t;
 111                         }
 112                         break;
 113                 case ST_TAG: // comment, labels, etc
 114                         int i;
 115                         if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
 116                                 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
 117                                         checkattr = 2;
 118                         } else if ((checkattr > 0) && (line[actual][head] == '>')) {
 119                                         state = ST_NON_WORD;
 120                         } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
 121                                 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
 122                                         state = ST_NON_WORD;
 123                                         head += strlen(PATTERN[pattern_num][1]) - 1;
 124                         } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
 125                                 ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
 126                                 quotmark = line[actual][head];
 127                                 state = ST_ATTRIB;
 128                         }
 129                         break;
 130                 case ST_ATTRIB: // non word chars
 131                         prevstate = ST_ATTRIB;
 132                         if (line[actual][head] == quotmark) {
 133                                 state = ST_TAG;
 134                                 if (checkattr == 2) checkattr = 1;
 135                          // for IMG ALT
 136                         } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
 137                                 state = ST_WORD;
 138                                 token = head;
 139                         } else if (line[actual][head] == '&') {
 140                                 state = ST_CHAR_ENTITY;
 141                         }
 142                         break;
 143                 case ST_CHAR_ENTITY: // SGML element
 144                         if ((tolower(line[actual][head]) == ';')) {
 145                                 state = prevstate;
 146                                 head--;
 147                         }
 148                 }
 149                 if (next_char(line[actual], &head)) return NULL;
 150         }
 151 }