src/dom/sgml/scanner.c

   1 /* SGML token scanner utilities */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9
  10 #include "elinks.h"
  11
  12 #include "dom/scanner.h"
  13 #include "dom/sgml/scanner.h"
  14 #include "dom/string.h"
  15 #include "util/error.h"
  16
  17
  18 /* Bitmap entries for the SGML character groups used in the scanner table */
  19
  20 enum sgml_char_group {
  21         SGML_CHAR_ENTITY        = (1 << 1),
  22         SGML_CHAR_IDENT         = (1 << 2),
  23         SGML_CHAR_NEWLINE       = (1 << 3),
  24         SGML_CHAR_WHITESPACE    = (1 << 4),
  25         SGML_CHAR_NOT_TEXT      = (1 << 5),
  26         SGML_CHAR_NOT_ATTRIBUTE = (1 << 6),
  27 };
  28
  29 static struct dom_scan_table_info sgml_scan_table_info[] = {
  30         DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  31         DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  32         DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  33         /* For the octal number impared (me including) \241 is 161 --jonas */
  34         DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  35
  36         DOM_SCAN_TABLE_STRING("-_:.",    SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  37         DOM_SCAN_TABLE_STRING("#",       SGML_CHAR_ENTITY),
  38         DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE),
  39         DOM_SCAN_TABLE_STRING("\f\n",    SGML_CHAR_NEWLINE),
  40         DOM_SCAN_TABLE_STRING("<&",      SGML_CHAR_NOT_TEXT),
  41         DOM_SCAN_TABLE_STRING("<=>",     SGML_CHAR_NOT_ATTRIBUTE),
  42
  43         DOM_SCAN_TABLE_END,
  44 };
  45
  46 #define SGML_STRING_MAP(str, type, family) \
  47         { INIT_DOM_STRING(str, -1), SGML_TOKEN_##type, SGML_TOKEN_##family }
  48
  49 static struct dom_scanner_string_mapping sgml_string_mappings[] = {
  50         SGML_STRING_MAP("--",             NOTATION_COMMENT,       NOTATION),
  51         SGML_STRING_MAP("ATTLIST",        NOTATION_ATTLIST,       NOTATION),
  52         SGML_STRING_MAP("DOCTYPE",        NOTATION_DOCTYPE,       NOTATION),
  53         SGML_STRING_MAP("ELEMENT",        NOTATION_ELEMENT,       NOTATION),
  54         SGML_STRING_MAP("ENTITY",         NOTATION_ENTITY,        NOTATION),
  55
  56         SGML_STRING_MAP("xml",            PROCESS_XML,            PROCESS),
  57         SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET, PROCESS),
  58
  59         DOM_STRING_MAP_END,
  60 };
  61
  62 static struct dom_scanner_token *scan_sgml_tokens(struct dom_scanner *scanner);
  63
  64 struct dom_scanner_info sgml_scanner_info = {
  65         sgml_string_mappings,
  66         sgml_scan_table_info,
  67         scan_sgml_tokens,
  68 };
  69
  70 #define check_sgml_table(c, bit)        (sgml_scanner_info.scan_table[(c)] & (bit))
  71
  72 #define scan_sgml(scanner, s, bit)                                      \
  73         while ((s) < (scanner)->end && check_sgml_table(*(s), bit)) (s)++;
  74
  75 #define is_sgml_ident(c)        check_sgml_table(c, SGML_CHAR_IDENT)
  76 #define is_sgml_entity(c)       check_sgml_table(c, SGML_CHAR_ENTITY)
  77 #define is_sgml_space(c)        check_sgml_table(c, SGML_CHAR_WHITESPACE)
  78 #define is_sgml_newline(c)      check_sgml_table(c, SGML_CHAR_NEWLINE)
  79 #define is_sgml_text(c)         !check_sgml_table(c, SGML_CHAR_NOT_TEXT)
  80 #define is_sgml_token_start(c)  check_sgml_table(c, SGML_CHAR_TOKEN_START)
  81 #define is_sgml_attribute(c)    !check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE)
  82
  83 static inline void
  84 skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
  85 {
  86         unsigned char *pos = *string;
  87
  88         if (!scanner->count_lines) {
  89                 scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE);
  90         } else {
  91                 while (pos < scanner->end && is_sgml_space(*pos)) {
  92                         if (is_sgml_newline(*pos))
  93                                 scanner->lineno++;
  94                         pos++;
  95                 }
  96         }
  97
  98         *string = pos;
  99 }
 100
 101
 102 /* Text token scanning */
 103
 104 /* I think it is faster to not check the table here --jonas */
 105 #define foreach_sgml_cdata(scanner, str)                                \
 106         for (; ((str) < (scanner)->end && *(str) != '<' && *(str) != '&'); (str)++)
 107
 108 static inline void
 109 scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 110 {
 111         unsigned char *string = scanner->position;
 112         unsigned char first_char = *string;
 113         enum sgml_token_type type = SGML_TOKEN_GARBAGE;
 114         int real_length = -1;
 115
 116         /* In scan_sgml_tokens() we check that first_char != '<' */
 117         assert(first_char != '<' && scanner->state == SGML_STATE_TEXT);
 118
 119         token->string.string = string++;
 120
 121         if (first_char == '&') {
 122                 if (is_sgml_entity(*string)) {
 123                         scan_sgml(scanner, string, SGML_CHAR_ENTITY);
 124                         type = SGML_TOKEN_ENTITY;
 125                         token->string.string++;
 126                         real_length = string - token->string.string;
 127                 }
 128
 129                 foreach_sgml_cdata (scanner, string) {
 130                         if (*string == ';') {
 131                                 string++;
 132                                 break;
 133                         }
 134                 }
 135
 136         } else {
 137                 if (is_sgml_space(first_char)) {
 138                         skip_sgml_space(scanner, &string);
 139                         type = string < scanner->end && is_sgml_text(*string)
 140                              ? SGML_TOKEN_TEXT : SGML_TOKEN_SPACE;
 141                 } else {
 142                         type = SGML_TOKEN_TEXT;
 143                 }
 144
 145                 foreach_sgml_cdata (scanner, string) {
 146                         /* m33p */;
 147                 }
 148         }
 149
 150         token->type = type;
 151         token->string.length = real_length >= 0 ? real_length : string - token->string.string;
 152         token->precedence = get_sgml_precedence(type);
 153         scanner->position = string;
 154 }
 155
 156
 157 /* Element scanning */
 158
 159 /* Check whether it is safe to skip the @token when looking for @skipto. */
 160 static inline int
 161 check_sgml_precedence(int type, int skipto)
 162 {
 163         return get_sgml_precedence(type) <= get_sgml_precedence(skipto);
 164 }
 165
 166 /* Skip until @skipto is found, without taking precedence into account. */
 167 static inline unsigned char *
 168 skip_sgml_chars(struct dom_scanner *scanner, unsigned char *string,
 169                 unsigned char skipto)
 170 {
 171         int newlines;
 172
 173         assert(string >= scanner->position && string <= scanner->end);
 174
 175         if (!scanner->count_lines) {
 176                 size_t length = scanner->end - string;
 177
 178                 return memchr(string, skipto, length);
 179         }
 180
 181         for (newlines = 0; string < scanner->end; string++) {
 182                 if (is_sgml_newline(*string))
 183                         newlines++;
 184                 if (*string == skipto) {
 185                         /* Only count newlines if we actually find the
 186                          * requested char. Else callers are assumed to discard
 187                          * the scanning. */
 188                         scanner->lineno += newlines;
 189                         return string;
 190                 }
 191         }
 192
 193         return NULL;
 194 }
 195
 196 /* XXX: Only element or ``in tag'' precedence is handled correctly however
 197  * using this function for CDATA or text would be overkill. */
 198 static inline unsigned char *
 199 skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char skipto,
 200           int check_quoting)
 201 {
 202         unsigned char *pos = *string;
 203
 204         for (; pos < scanner->end; pos++) {
 205                 if (*pos == skipto) {
 206                         *string = pos + 1;
 207                         return pos;
 208                 }
 209
 210                 if (!check_sgml_precedence(*pos, skipto))
 211                         break;
 212
 213                 if (check_quoting && isquote(*pos)) {
 214                         unsigned char *end;
 215
 216                         end = skip_sgml_chars(scanner, pos + 1, *pos);
 217                         if (end) pos = end;
 218
 219                 } else if (scanner->count_lines && is_sgml_newline(*pos)) {
 220                         scanner->lineno++;
 221                 }
 222         }
 223
 224         *string = pos;
 225         return NULL;
 226 }
 227
 228 static inline int
 229 skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
 230 {
 231         unsigned char *pos = *string;
 232         int length = 0;
 233
 234         for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
 235                 /* It is always safe to access index -2 and -1 here since we
 236                  * are supposed to have '<!--' before this is called. We do
 237                  * however need to check that the '-->' are not overlapping any
 238                  * preceeding '-'. */
 239                 if (pos[-2] == '-' && pos[-1] == '-' && &pos[-2] >= *string) {
 240                         length = pos - *string - 2;
 241                         pos++;
 242                         break;
 243                 }
 244         }
 245
 246         if (!pos) {
 247                 pos = scanner->end;
 248                 length = pos - *string;
 249         }
 250
 251         *string = pos;
 252         return length;
 253 }
 254
 255 static inline int
 256 skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
 257 {
 258         unsigned char *pos = *string;
 259         int length = 0;
 260
 261         for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
 262                 /* It is always safe to access index -2 and -1 here since we
 263                  * are supposed to have '<![CDATA[' before this is called. */
 264                 if (pos[-2] == ']' && pos[-1] == ']') {
 265                         length = pos - *string - 2;
 266                         pos++;
 267                         break;
 268                 }
 269         }
 270
 271         if (!pos) {
 272                 pos = scanner->end;
 273                 length = pos - *string;
 274         }
 275
 276         *string = pos;
 277         return length;
 278 }
 279
 280 #define scan_sgml_attribute(scanner, str)                               \
 281         while ((str) < (scanner)->end && is_sgml_attribute(*(str)))     \
 282                (str)++;
 283
 284 static inline void
 285 scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 286 {
 287         unsigned char *string = scanner->position;
 288         unsigned char first_char = *string;
 289         enum sgml_token_type type = SGML_TOKEN_GARBAGE;
 290         int real_length = -1;
 291
 292         token->string.string = string++;
 293
 294         if (first_char == '<') {
 295                 skip_sgml_space(scanner, &string);
 296
 297                 if (string == scanner->end) {
 298                         /* Prevent out of bound access. */
 299
 300                 } else if (scanner->state == SGML_STATE_ELEMENT) {
 301                         /* Already inside an element so insert a tag end token
 302                          * and continue scanning in next iteration. */
 303                         string--;
 304                         real_length = 0;
 305                         type = SGML_TOKEN_TAG_END;
 306                         scanner->state = SGML_STATE_TEXT;
 307
 308                 } else if (is_sgml_ident(*string)) {
 309                         token->string.string = string;
 310                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 311
 312                         real_length = string - token->string.string;
 313
 314                         skip_sgml_space(scanner, &string);
 315                         if (string < scanner->end && *string == '>') {
 316                                 type = SGML_TOKEN_ELEMENT;
 317                                 string++;
 318                         } else {
 319                                 scanner->state = SGML_STATE_ELEMENT;
 320                                 type = SGML_TOKEN_ELEMENT_BEGIN;
 321                         }
 322
 323                 } else if (*string == '!') {
 324                         unsigned char *ident;
 325                         enum sgml_token_type base = SGML_TOKEN_NOTATION;
 326
 327                         string++;
 328                         skip_sgml_space(scanner, &string);
 329                         token->string.string = ident = string;
 330
 331                         if (string + 1 < scanner->end
 332                             && string[0] == '-' && string[1] == '-') {
 333                                 string += 2;
 334                                 type = SGML_TOKEN_NOTATION_COMMENT;
 335                                 token->string.string = string;
 336                                 real_length = skip_sgml_comment(scanner, &string);
 337                                 assert(real_length >= 0);
 338
 339                         } else if (string + 6 < scanner->end
 340                                    && !memcmp(string, "[CDATA[", 7)) {
 341
 342                                 string += 7;
 343                                 type = SGML_TOKEN_CDATA_SECTION;
 344                                 token->string.string = string;
 345                                 real_length = skip_sgml_cdata_section(scanner, &string);
 346                                 assert(real_length >= 0);
 347
 348                         } else {
 349                                 skip_sgml_space(scanner, &string);
 350                                 type = map_dom_scanner_string(scanner, ident, string, base);
 351                                 skip_sgml(scanner, &string, '>', 0);
 352                         }
 353
 354                 } else if (*string == '?') {
 355                         unsigned char *pos;
 356                         enum sgml_token_type base = SGML_TOKEN_PROCESS;
 357
 358                         string++;
 359                         skip_sgml_space(scanner, &string);
 360                         token->string.string = pos = string;
 361                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 362
 363                         type = map_dom_scanner_string(scanner, pos, string, base);
 364
 365                         scanner->state = SGML_STATE_PROC_INST;
 366
 367                 } else if (*string == '/') {
 368                         string++;
 369                         skip_sgml_space(scanner, &string);
 370
 371                         if (string == scanner->end) {
 372                                 /* Prevent out of bound access. */
 373
 374                         } else if (is_sgml_ident(*string)) {
 375                                 token->string.string = string;
 376                                 scan_sgml(scanner, string, SGML_CHAR_IDENT);
 377                                 real_length = string - token->string.string;
 378
 379                                 type = SGML_TOKEN_ELEMENT_END;
 380                                 skip_sgml(scanner, &string, '>', 1);
 381
 382                         } else if (*string == '>') {
 383                                 string++;
 384                                 real_length = 0;
 385                                 type = SGML_TOKEN_ELEMENT_END;
 386                         }
 387
 388                         if (type != SGML_TOKEN_GARBAGE)
 389                                 scanner->state = SGML_STATE_TEXT;
 390
 391                 } else {
 392                         /* Alien < > stuff so ignore it */
 393                         skip_sgml(scanner, &string, '>', 0);
 394                 }
 395
 396         } else if (first_char == '=') {
 397                 type = '=';
 398
 399         } else if (first_char == '?' || first_char == '>') {
 400                 if (first_char == '?') {
 401                         skip_sgml(scanner, &string, '>', 0);
 402                 }
 403
 404                 type = SGML_TOKEN_TAG_END;
 405                 assert(scanner->state == SGML_STATE_ELEMENT);
 406                 scanner->state = SGML_STATE_TEXT;
 407
 408         } else if (first_char == '/') {
 409                 if (string == scanner->end) {
 410                         /* Prevent out of bound access. */
 411
 412                 } else if (*string == '>') {
 413                         string++;
 414                         real_length = 0;
 415                         type = SGML_TOKEN_ELEMENT_EMPTY_END;
 416                         assert(scanner->state == SGML_STATE_ELEMENT);
 417                         scanner->state = SGML_STATE_TEXT;
 418                 } else if (is_sgml_attribute(*string)) {
 419                         scan_sgml_attribute(scanner, string);
 420                         type = SGML_TOKEN_ATTRIBUTE;
 421                         if (string[-1] == '/' && string[0] == '>')
 422                                 string--;
 423                 }
 424
 425         } else if (isquote(first_char)) {
 426                 unsigned char *string_end = skip_sgml_chars(scanner, string, first_char);
 427
 428                 if (string_end) {
 429                         /* We don't want the delimiters in the token */
 430                         token->string.string++;
 431                         real_length = string_end - token->string.string;
 432                         string = string_end + 1;
 433                         type = SGML_TOKEN_STRING;
 434
 435                 } else if (string < scanner->end
 436                            && is_sgml_attribute(*string)) {
 437
 438                         token->string.string++;
 439                         scan_sgml_attribute(scanner, string);
 440                         type = SGML_TOKEN_ATTRIBUTE;
 441                 }
 442
 443         } else if (is_sgml_attribute(first_char)) {
 444                 if (is_sgml_ident(first_char)) {
 445                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 446                         type = SGML_TOKEN_IDENT;
 447                 }
 448
 449                 if (string < scanner->end
 450                     && is_sgml_attribute(*string)) {
 451                         scan_sgml_attribute(scanner, string);
 452                         type = SGML_TOKEN_ATTRIBUTE;
 453                         if (string[-1] == '/' && string[0] == '>')
 454                                 string--;
 455                 }
 456         }
 457
 458         token->type = type;
 459         token->string.length = real_length >= 0 ? real_length : string - token->string.string;
 460         token->precedence = get_sgml_precedence(type);
 461         scanner->position = string;
 462 }
 463
 464
 465 /* Processing instruction data scanning */
 466
 467 static inline void
 468 scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 469 {
 470         unsigned char *string = scanner->position;
 471
 472         token->string.string = string;
 473
 474         /* Figure out where the processing instruction ends. This doesn't use
 475          * skip_sgml() since we MUST ignore precedence here to allow '<' inside
 476          * the data part to be skipped correctly. */
 477         for ( ; (string = skip_sgml_chars(scanner, string, '>')); string++) {
 478                 if (string[-1] == '?') {
 479                         string++;
 480                         break;
 481                 }
 482         }
 483
 484         if (!string) string = scanner->end;
 485
 486         token->type = SGML_TOKEN_PROCESS_DATA;
 487         token->string.length = string - token->string.string - 2;
 488         token->precedence = get_sgml_precedence(token->type);
 489         scanner->position = string;
 490         scanner->state = SGML_STATE_TEXT;
 491 }
 492
 493
 494 /* Scanner multiplexor */
 495
 496 static struct dom_scanner_token *
 497 scan_sgml_tokens(struct dom_scanner *scanner)
 498 {
 499         struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS;
 500         struct dom_scanner_token *current;
 501
 502         if (!begin_dom_token_scanning(scanner))
 503                 return get_dom_scanner_token(scanner);
 504
 505         /* Scan tokens until we fill the table */
 506         for (current = scanner->table + scanner->tokens;
 507              current < table_end && scanner->position < scanner->end;
 508              current++) {
 509                 if (scanner->state == SGML_STATE_ELEMENT
 510                     || (*scanner->position == '<'
 511                         && scanner->state != SGML_STATE_PROC_INST)) {
 512                         skip_sgml_space(scanner, &scanner->position);
 513                         if (scanner->position >= scanner->end) break;
 514
 515                         scan_sgml_element_token(scanner, current);
 516
 517                         /* Shall we scratch this token? */
 518                         if (current->type == SGML_TOKEN_SKIP) {
 519                                 current--;
 520                         }
 521
 522                 } else if (scanner->state == SGML_STATE_TEXT) {
 523                         scan_sgml_text_token(scanner, current);
 524
 525                 } else {
 526                         skip_sgml_space(scanner, &scanner->position);
 527                         scan_sgml_proc_inst_token(scanner, current);
 528                 }
 529         }
 530
 531         return end_dom_token_scanning(scanner, current);
 532 }