src/dom/sgml/scanner.c

   1 /* SGML token scanner utilities */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9
  10 #include "elinks.h"
  11
  12 #include "dom/scanner.h"
  13 #include "dom/sgml/scanner.h"
  14 #include "dom/string.h"
  15 #include "util/error.h"
  16
  17
  18 /* Bitmap entries for the SGML character groups used in the scanner table */
  19
  20 enum sgml_char_group {
  21         SGML_CHAR_ENTITY        = (1 << 1),
  22         SGML_CHAR_IDENT         = (1 << 2),
  23         SGML_CHAR_NEWLINE       = (1 << 3),
  24         SGML_CHAR_WHITESPACE    = (1 << 4),
  25         SGML_CHAR_NOT_TEXT      = (1 << 5),
  26         SGML_CHAR_NOT_ATTRIBUTE = (1 << 6),
  27 };
  28
  29 static struct dom_scan_table_info sgml_scan_table_info[] = {
  30         DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  31         DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  32         DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  33         /* For the octal number impared (me including) \241 is 161 --jonas */
  34         DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  35
  36         DOM_SCAN_TABLE_STRING("-_:.",    SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
  37         DOM_SCAN_TABLE_STRING("#",       SGML_CHAR_ENTITY),
  38         DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE),
  39         DOM_SCAN_TABLE_STRING("\f\n",    SGML_CHAR_NEWLINE),
  40         DOM_SCAN_TABLE_STRING("<&",      SGML_CHAR_NOT_TEXT),
  41         DOM_SCAN_TABLE_STRING("<=>",     SGML_CHAR_NOT_ATTRIBUTE),
  42
  43         DOM_SCAN_TABLE_END,
  44 };
  45
  46 #define SGML_STRING_MAP(str, type, family) \
  47         { STATIC_DOM_STRING(str), SGML_TOKEN_##type, SGML_TOKEN_##family }
  48
  49 static struct dom_scanner_string_mapping sgml_string_mappings[] = {
  50         SGML_STRING_MAP("--",             NOTATION_COMMENT,       NOTATION),
  51         SGML_STRING_MAP("ATTLIST",        NOTATION_ATTLIST,       NOTATION),
  52         SGML_STRING_MAP("DOCTYPE",        NOTATION_DOCTYPE,       NOTATION),
  53         SGML_STRING_MAP("ELEMENT",        NOTATION_ELEMENT,       NOTATION),
  54         SGML_STRING_MAP("ENTITY",         NOTATION_ENTITY,        NOTATION),
  55
  56         SGML_STRING_MAP("xml",            PROCESS_XML,            PROCESS),
  57         SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET, PROCESS),
  58
  59         DOM_STRING_MAP_END,
  60 };
  61
  62 static struct dom_scanner_token *scan_sgml_tokens(struct dom_scanner *scanner);
  63
  64 struct dom_scanner_info sgml_scanner_info = {
  65         sgml_string_mappings,
  66         sgml_scan_table_info,
  67         scan_sgml_tokens,
  68 };
  69
  70 #define check_sgml_table(c, bit)        (sgml_scanner_info.scan_table[(c)] & (bit))
  71
  72 #define scan_sgml(scanner, s, bit)                                      \
  73         while ((s) < (scanner)->end && check_sgml_table(*(s), bit)) (s)++;
  74
  75 #define is_sgml_ident(c)        check_sgml_table(c, SGML_CHAR_IDENT)
  76 #define is_sgml_entity(c)       check_sgml_table(c, SGML_CHAR_ENTITY)
  77 #define is_sgml_space(c)        check_sgml_table(c, SGML_CHAR_WHITESPACE)
  78 #define is_sgml_newline(c)      check_sgml_table(c, SGML_CHAR_NEWLINE)
  79 #define is_sgml_text(c)         !check_sgml_table(c, SGML_CHAR_NOT_TEXT)
  80 #define is_sgml_token_start(c)  check_sgml_table(c, SGML_CHAR_TOKEN_START)
  81 #define is_sgml_attribute(c)    !check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE)
  82
  83 static inline void
  84 skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
  85 {
  86         unsigned char *pos = *string;
  87
  88         if (!scanner->count_lines) {
  89                 scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE);
  90         } else {
  91                 while (pos < scanner->end && is_sgml_space(*pos)) {
  92                         if (is_sgml_newline(*pos))
  93                                 scanner->lineno++;
  94                         pos++;
  95                 }
  96         }
  97
  98         *string = pos;
  99 }
 100
 101 #define check_sgml_incomplete(scanner, string) \
 102         ((scanner)->check_complete \
 103          && (scanner)->incomplete \
 104          && (string) == (scanner)->end)
 105
 106 static void
 107 set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
 108 {
 109         size_t left = scanner->end - scanner->position;
 110
 111         assert(left > 0);
 112
 113         token->type = SGML_TOKEN_INCOMPLETE;
 114         set_dom_string(&token->string, scanner->position, left);
 115
 116         /* Stop the scanning. */
 117         scanner->position = scanner->end;
 118 }
 119
 120
 121 static inline int
 122 check_sgml_error(struct dom_scanner *scanner)
 123 {
 124         unsigned int found_error = scanner->found_error;
 125
 126         /* Toggle if we found an error previously. */
 127         scanner->found_error = 0;
 128
 129         return scanner->detect_errors && !found_error;
 130 }
 131
 132 static unsigned char *
 133 get_sgml_error_end(struct dom_scanner *scanner, enum sgml_token_type type,
 134                    unsigned char *end)
 135 {
 136         switch (type) {
 137         case SGML_TOKEN_CDATA_SECTION:
 138         case SGML_TOKEN_NOTATION_ATTLIST:
 139         case SGML_TOKEN_NOTATION_DOCTYPE:
 140         case SGML_TOKEN_NOTATION_ELEMENT:
 141                 if (scanner->position + 9 < end)
 142                         end = scanner->position + 9;
 143                 break;
 144
 145         case SGML_TOKEN_NOTATION_COMMENT:
 146                 /* Just include the '<!--' part. */
 147                 if (scanner->position + 4 < end)
 148                         end = scanner->position + 4;
 149                 break;
 150
 151         case SGML_TOKEN_NOTATION_ENTITY:
 152                 if (scanner->position + 6 < end)
 153                         end = scanner->position + 6;
 154                 break;
 155
 156         case SGML_TOKEN_PROCESS_XML:
 157                 if (scanner->position + 5 < end)
 158                         end = scanner->position + 5;
 159                 break;
 160
 161         case SGML_TOKEN_PROCESS_XML_STYLESHEET:
 162                 if (scanner->position + 16 < end)
 163                         end = scanner->position + 16;
 164                 break;
 165
 166         default:
 167                 break;
 168         }
 169
 170         return end;
 171 }
 172
 173
 174 static struct dom_scanner_token *
 175 set_sgml_error(struct dom_scanner *scanner, unsigned char *end)
 176 {
 177         struct dom_scanner_token *token = scanner->current;
 178         struct dom_scanner_token *next;
 179
 180         assert(!scanner->found_error);
 181
 182         if (scanner->current >= scanner->table + DOM_SCANNER_TOKENS) {
 183                 scanner->found_error = 1;
 184                 next = NULL;
 185
 186         } else {
 187                 scanner->current++;
 188                 next = scanner->current;
 189                 copy_struct(next, token);
 190         }
 191
 192         token->type = SGML_TOKEN_ERROR;
 193         token->lineno = scanner->lineno;
 194         set_dom_string(&token->string, scanner->position, end - scanner->position);
 195
 196         return next;
 197 }
 198
 199
 200 /* Text token scanning */
 201
 202 /* I think it is faster to not check the table here --jonas */
 203 #define foreach_sgml_cdata(scanner, str)                                \
 204         for (; ((str) < (scanner)->end && *(str) != '<' && *(str) != '&'); (str)++)
 205
 206 static inline void
 207 scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 208 {
 209         unsigned char *string = scanner->position;
 210         unsigned char first_char = *string;
 211         enum sgml_token_type type = SGML_TOKEN_GARBAGE;
 212         int real_length = -1;
 213
 214         /* In scan_sgml_tokens() we check that first_char != '<' */
 215         assert(first_char != '<' && scanner->state == SGML_STATE_TEXT);
 216
 217         token->string.string = string++;
 218
 219         if (first_char == '&') {
 220                 int complete = 0;
 221
 222                 if (is_sgml_entity(*string)) {
 223                         scan_sgml(scanner, string, SGML_CHAR_ENTITY);
 224                         type = SGML_TOKEN_ENTITY;
 225                         token->string.string++;
 226                         real_length = string - token->string.string;
 227                 }
 228
 229                 foreach_sgml_cdata (scanner, string) {
 230                         if (*string == ';') {
 231                                 complete = 1;
 232                                 string++;
 233                                 break;
 234                         }
 235                 }
 236
 237                 /* We want the biggest possible text token. */
 238                 if (!complete) {
 239                         if (check_sgml_incomplete(scanner, string)) {
 240                                 set_sgml_incomplete(scanner, token);
 241                                 return;
 242                         }
 243
 244                         if (check_sgml_error(scanner)) {
 245                                 token = set_sgml_error(scanner, string);
 246                                 if (!token)
 247                                         return;
 248                         }
 249                 }
 250
 251         } else {
 252                 if (is_sgml_space(first_char)) {
 253                         if (scanner->count_lines
 254                             && is_sgml_newline(first_char))
 255                                 scanner->lineno++;
 256
 257                         skip_sgml_space(scanner, &string);
 258                         type = string < scanner->end && is_sgml_text(*string)
 259                              ? SGML_TOKEN_TEXT : SGML_TOKEN_SPACE;
 260                 } else {
 261                         type = SGML_TOKEN_TEXT;
 262                 }
 263
 264                 if (scanner->count_lines) {
 265                         foreach_sgml_cdata (scanner, string) {
 266                                 if (is_sgml_newline(*string))
 267                                         scanner->lineno++;
 268                         }
 269                 } else {
 270                         foreach_sgml_cdata (scanner, string) {
 271                                 /* m33p */;
 272                         }
 273                 }
 274
 275                 /* We want the biggest possible text token. */
 276                 if (check_sgml_incomplete(scanner, string)) {
 277                         set_sgml_incomplete(scanner, token);
 278                         return;
 279                 }
 280         }
 281
 282         token->type = type;
 283         token->string.length = real_length >= 0 ? real_length : string - token->string.string;
 284         token->precedence = get_sgml_precedence(type);
 285         scanner->position = string;
 286 }
 287
 288
 289 /* Element scanning */
 290
 291 /* Check whether it is safe to skip the @token when looking for @skipto. */
 292 static inline int
 293 check_sgml_precedence(int type, int skipto)
 294 {
 295         return get_sgml_precedence(type) <= get_sgml_precedence(skipto);
 296 }
 297
 298 /* Skip until @skipto is found, without taking precedence into account. */
 299 static inline unsigned char *
 300 skip_sgml_chars(struct dom_scanner *scanner, unsigned char *string,
 301                 unsigned char skipto)
 302 {
 303         int newlines;
 304
 305         assert(string >= scanner->position && string <= scanner->end);
 306
 307         if (!scanner->count_lines) {
 308                 size_t length = scanner->end - string;
 309
 310                 return memchr(string, skipto, length);
 311         }
 312
 313         for (newlines = 0; string < scanner->end; string++) {
 314                 if (is_sgml_newline(*string))
 315                         newlines++;
 316                 if (*string == skipto) {
 317                         /* Only count newlines if we actually find the
 318                          * requested char. Else callers are assumed to discard
 319                          * the scanning. */
 320                         scanner->lineno += newlines;
 321                         return string;
 322                 }
 323         }
 324
 325         return NULL;
 326 }
 327
 328 /* XXX: Only element or ``in tag'' precedence is handled correctly however
 329  * using this function for CDATA or text would be overkill. */
 330 static inline unsigned char *
 331 skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char skipto,
 332           int check_quoting)
 333 {
 334         unsigned char *pos = *string;
 335
 336         for (; pos < scanner->end; pos++) {
 337                 if (*pos == skipto) {
 338                         *string = pos + 1;
 339                         return pos;
 340                 }
 341
 342                 if (!check_sgml_precedence(*pos, skipto))
 343                         break;
 344
 345                 if (check_quoting && isquote(*pos)) {
 346                         unsigned char *end;
 347
 348                         end = skip_sgml_chars(scanner, pos + 1, *pos);
 349                         if (end) pos = end;
 350
 351                 } else if (scanner->count_lines && is_sgml_newline(*pos)) {
 352                         scanner->lineno++;
 353                 }
 354         }
 355
 356         *string = pos;
 357         return NULL;
 358 }
 359
 360 static inline int
 361 skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string,
 362                   int *possibly_incomplete)
 363 {
 364         unsigned char *pos = *string;
 365         int length = 0;
 366
 367         for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
 368                 /* It is always safe to access index -2 and -1 here since we
 369                  * are supposed to have '<!--' before this is called. We do
 370                  * however need to check that the '-->' are not overlapping any
 371                  * preceeding '-'. Additionally also handle the quirky '--!>'
 372                  * end sometimes found. */
 373                 if (pos[-2] == '-') {
 374                         if (pos[-1] == '-' && &pos[-2] >= *string) {
 375                                 length = pos - *string - 2;
 376                                 *possibly_incomplete = 0;
 377                                 pos++;
 378                                 break;
 379                         } else if (pos[-1] == '!' && pos[-3] == '-' && &pos[-3] >= *string) {
 380                                 length = pos - *string - 3;
 381                                 *possibly_incomplete = 0;
 382                                 pos++;
 383                                 break;
 384                         }
 385                 }
 386         }
 387
 388         if (!pos) {
 389                 pos = scanner->end;
 390                 /* The token is incomplete but set the length to handle tag
 391                  * tag soup graciously. */
 392                 *possibly_incomplete = 1;
 393                 length = pos - *string;
 394         }
 395
 396         *string = pos;
 397         return length;
 398 }
 399
 400 static inline int
 401 skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string,
 402                         int *possibly_incomplete)
 403 {
 404         unsigned char *pos = *string;
 405         int length = 0;
 406
 407         for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
 408                 /* It is always safe to access index -2 and -1 here since we
 409                  * are supposed to have '<![CDATA[' before this is called. */
 410                 if (pos[-2] == ']' && pos[-1] == ']') {
 411                         length = pos - *string - 2;
 412                         *possibly_incomplete = 0;
 413                         pos++;
 414                         break;
 415                 }
 416         }
 417
 418         if (!pos) {
 419                 pos = scanner->end;
 420                 /* The token is incomplete but set the length to handle tag
 421                  * soup graciously. */
 422                 *possibly_incomplete = 1;
 423                 length = pos - *string;
 424         }
 425
 426         *string = pos;
 427         return length;
 428 }
 429
 430 #define scan_sgml_attribute(scanner, str)                               \
 431         while ((str) < (scanner)->end && is_sgml_attribute(*(str)))     \
 432                (str)++;
 433
 434 static inline void
 435 scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 436 {
 437         unsigned char *string = scanner->position;
 438         unsigned char first_char = *string;
 439         enum sgml_token_type type = SGML_TOKEN_GARBAGE;
 440         int real_length = -1;
 441         int possibly_incomplete = 1;
 442         enum sgml_scanner_state scanner_state = scanner->state;
 443
 444         token->string.string = string++;
 445
 446         if (first_char == '<') {
 447                 skip_sgml_space(scanner, &string);
 448
 449                 if (scanner->state == SGML_STATE_ELEMENT) {
 450                         /* Already inside an element so insert a tag end token
 451                          * and continue scanning in next iteration. */
 452                         type = SGML_TOKEN_TAG_END;
 453                         scanner_state = SGML_STATE_TEXT;
 454
 455                         /* We are creating a 'virtual' that has no source. */
 456                         possibly_incomplete = 0;
 457                         string = token->string.string;
 458                         real_length = 0;
 459
 460                 } else if (string == scanner->end) {
 461                         /* It is incomplete so prevent out of bound acess to
 462                          * the scanned string. */
 463
 464                 } else if (is_sgml_ident(*string)) {
 465                         token->string.string = string;
 466                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 467
 468                         real_length = string - token->string.string;
 469
 470                         skip_sgml_space(scanner, &string);
 471                         if (string < scanner->end && *string == '>') {
 472                                 type = SGML_TOKEN_ELEMENT;
 473                                 string++;
 474
 475                                 /* We found the end. */
 476                                 possibly_incomplete = 0;
 477
 478                         } else {
 479                                 /* Was any space skipped? */
 480                                 if (is_sgml_space(string[-1])) {
 481                                         /* We found the end. */
 482                                         possibly_incomplete = 0;
 483                                 }
 484                                 type = SGML_TOKEN_ELEMENT_BEGIN;
 485                                 scanner_state = SGML_STATE_ELEMENT;
 486                         }
 487
 488                 } else if (*string == '!') {
 489                         unsigned char *ident;
 490                         enum sgml_token_type base = SGML_TOKEN_NOTATION;
 491
 492                         string++;
 493                         skip_sgml_space(scanner, &string);
 494                         token->string.string = ident = string;
 495
 496                         if (string + 1 < scanner->end
 497                             && string[0] == '-' && string[1] == '-') {
 498                                 string += 2;
 499                                 type = SGML_TOKEN_NOTATION_COMMENT;
 500                                 token->string.string = string;
 501                                 real_length = skip_sgml_comment(scanner, &string,
 502                                                                 &possibly_incomplete);
 503                                 assert(real_length >= 0);
 504
 505                         } else if (string + 6 < scanner->end
 506                                    && !memcmp(string, "[CDATA[", 7)) {
 507
 508                                 string += 7;
 509                                 type = SGML_TOKEN_CDATA_SECTION;
 510                                 token->string.string = string;
 511                                 real_length = skip_sgml_cdata_section(scanner, &string,
 512                                                                       &possibly_incomplete);
 513                                 assert(real_length >= 0);
 514
 515                         } else {
 516                                 scan_sgml(scanner, string, SGML_CHAR_IDENT);
 517                                 type = map_dom_scanner_string(scanner, ident, string, base);
 518                                 if (skip_sgml(scanner, &string, '>', 0)) {
 519                                         /* We found the end. */
 520                                         possibly_incomplete = 0;
 521                                 }
 522                         }
 523
 524                 } else if (*string == '?') {
 525                         unsigned char *pos;
 526                         enum sgml_token_type base = SGML_TOKEN_PROCESS;
 527
 528                         string++;
 529                         skip_sgml_space(scanner, &string);
 530                         token->string.string = pos = string;
 531                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 532
 533                         type = map_dom_scanner_string(scanner, pos, string, base);
 534
 535                         scanner_state = SGML_STATE_PROC_INST;
 536
 537                         real_length = string - token->string.string;
 538                         skip_sgml_space(scanner, &string);
 539
 540                         /* Make '<?xml ' cause the right kind of error. */
 541                         if (is_sgml_space(string[-1])
 542                             && string < scanner->end) {
 543                                 /* We found the end. */
 544                                 possibly_incomplete = 0;
 545                         }
 546
 547                         if (scanner->check_complete && scanner->incomplete) {
 548                                 /* We need to fit both the process target token
 549                                  * and the process data token into the scanner
 550                                  * table. */
 551                                 if (token + 1 >= scanner->table + DOM_SCANNER_TOKENS) {
 552                                         possibly_incomplete = 1;
 553
 554                                 } else if (!possibly_incomplete) {
 555                                         /* FIXME: We do this twice. */
 556                                         for (pos = string + 1;
 557                                              (pos = skip_sgml_chars(scanner, pos, '>'));
 558                                              pos++) {
 559                                                 if (pos[-1] == '?')
 560                                                         break;
 561                                         }
 562                                         if (!pos)
 563                                                 possibly_incomplete = 1;
 564                                 }
 565
 566                                 if (possibly_incomplete)
 567                                         string = scanner->end;
 568                         }
 569
 570                 } else if (*string == '/') {
 571                         string++;
 572                         skip_sgml_space(scanner, &string);
 573
 574                         if (string == scanner->end) {
 575                                 /* Prevent out of bound access. */
 576
 577                         } else if (is_sgml_ident(*string)) {
 578                                 token->string.string = string;
 579                                 scan_sgml(scanner, string, SGML_CHAR_IDENT);
 580                                 real_length = string - token->string.string;
 581
 582                                 type = SGML_TOKEN_ELEMENT_END;
 583                                 if (skip_sgml(scanner, &string, '>', 1)) {
 584                                         /* We found the end. */
 585                                         possibly_incomplete = 0;
 586                                 }
 587
 588                         } else if (*string == '>') {
 589                                 string++;
 590                                 real_length = 0;
 591                                 type = SGML_TOKEN_ELEMENT_END;
 592
 593                                 /* We found the end. */
 594                                 possibly_incomplete = 0;
 595                         }
 596
 597                         if (type != SGML_TOKEN_GARBAGE) {
 598                                 scanner_state = SGML_STATE_TEXT;
 599                         }
 600
 601                 } else {
 602                         /* Alien < > stuff so ignore it */
 603                         if (skip_sgml(scanner, &string, '>', 0)) {
 604                                 /* We found the end. */
 605                                 possibly_incomplete = 0;
 606                         }
 607                 }
 608
 609         } else if (first_char == '=') {
 610                 type = '=';
 611                 /* We found the end. */
 612                 possibly_incomplete = 0;
 613
 614         } else if (first_char == '?' || first_char == '>') {
 615                 if (first_char == '?') {
 616                         if (skip_sgml(scanner, &string, '>', 0)) {
 617                                 /* We found the end. */
 618                                 possibly_incomplete = 0;
 619                         }
 620                 } else {
 621                         assert(first_char == '>');
 622
 623                         /* We found the end. */
 624                         possibly_incomplete = 0;
 625                 }
 626
 627                 type = SGML_TOKEN_TAG_END;
 628                 assert(scanner->state == SGML_STATE_ELEMENT);
 629                 scanner_state = SGML_STATE_TEXT;
 630
 631         } else if (first_char == '/') {
 632                 /* We allow '/' inside elements and only consider it as an end
 633                  * tag if immediately preceeds the '>' char. This is to allow
 634                  *
 635                  *      '<form action=/ >'      where '/' is part of a path and
 636                  *      '<form action=a />'     where '/>' is truely a tag end
 637                  *
 638                  * For stricter parsing we should always require attribute
 639                  * values to be quoted.
 640                  */
 641                 if (string == scanner->end) {
 642                         /* Prevent out of bound access. */
 643
 644                 } else if (*string == '>') {
 645                         string++;
 646                         real_length = 0;
 647                         type = SGML_TOKEN_ELEMENT_EMPTY_END;
 648                         assert(scanner->state == SGML_STATE_ELEMENT);
 649                         scanner_state = SGML_STATE_TEXT;
 650
 651                         /* We found the end. */
 652                         possibly_incomplete = 0;
 653
 654                 } else if (is_sgml_attribute(*string)) {
 655                         scan_sgml_attribute(scanner, string);
 656                         type = SGML_TOKEN_ATTRIBUTE;
 657                         if (string[-1] == '/' && string[0] == '>') {
 658                                 string--;
 659                                 /* We found the end. */
 660                                 possibly_incomplete = 0;
 661                         }
 662                 }
 663
 664         } else if (isquote(first_char)) {
 665                 unsigned char *string_end = skip_sgml_chars(scanner, string, first_char);
 666
 667                 if (string_end) {
 668                         /* We don't want the delimiters in the token */
 669                         token->string.string++;
 670                         real_length = string_end - token->string.string;
 671                         string = string_end + 1;
 672                         type = SGML_TOKEN_STRING;
 673
 674                         /* We found the end. */
 675                         possibly_incomplete = 0;
 676
 677                 } else if (scanner->check_complete && scanner->incomplete) {
 678                         /* Force an incomplete token. */
 679                         string = scanner->end;
 680
 681                 } else if (string < scanner->end
 682                            && is_sgml_attribute(*string)) {
 683                         token->string.string++;
 684                         scan_sgml_attribute(scanner, string);
 685                         type = SGML_TOKEN_ATTRIBUTE;
 686                 }
 687
 688         } else if (is_sgml_attribute(first_char)) {
 689                 if (is_sgml_ident(first_char)) {
 690                         scan_sgml(scanner, string, SGML_CHAR_IDENT);
 691                         type = SGML_TOKEN_IDENT;
 692                 }
 693
 694                 if (string < scanner->end
 695                     && is_sgml_attribute(*string)) {
 696                         scan_sgml_attribute(scanner, string);
 697                         type = SGML_TOKEN_ATTRIBUTE;
 698                         if (string[-1] == '/' && string[0] == '>') {
 699                                 /* We found the end. */
 700                                 possibly_incomplete = 0;
 701                                 string--;
 702                         }
 703                 }
 704         }
 705
 706         if (possibly_incomplete) {
 707                 if (check_sgml_incomplete(scanner, string)) {
 708                         set_sgml_incomplete(scanner, token);
 709                         return;
 710                 }
 711
 712                 if (check_sgml_error(scanner) && string == scanner->end) {
 713                         unsigned char *end;
 714
 715                         end = get_sgml_error_end(scanner, type, string);
 716                         token = set_sgml_error(scanner, end);
 717                         if (!token)
 718                                 return;
 719                 }
 720         }
 721
 722         /* Only apply the state change if the token was not abandoned because
 723          * it was incomplete. */
 724         scanner->state = scanner_state;
 725
 726         token->type = type;
 727         token->string.length = real_length >= 0 ? real_length : string - token->string.string;
 728         token->precedence = get_sgml_precedence(type);
 729         scanner->position = string;
 730 }
 731
 732
 733 /* Processing instruction data scanning */
 734
 735 static inline void
 736 scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 737 {
 738         unsigned char *string = scanner->position;
 739         /* The length can be empty for '<??>'. */
 740         ssize_t length = -1;
 741
 742         token->string.string = string++;
 743
 744         /* Figure out where the processing instruction ends. This doesn't use
 745          * skip_sgml() since we MUST ignore precedence here to allow '<' inside
 746          * the data part to be skipped correctly. */
 747         for ( ; (string = skip_sgml_chars(scanner, string, '>')); string++) {
 748                 if (string[-1] == '?') {
 749                         string++;
 750                         length = string - token->string.string - 2;
 751                         break;
 752                 }
 753         }
 754
 755         if (!string) {
 756                 /* Makes the next succeed when checking for incompletion, and
 757                  * puts the rest of the text within the token. */
 758                 string = scanner->end;
 759
 760                 if (check_sgml_incomplete(scanner, string)) {
 761                         set_sgml_incomplete(scanner, token);
 762                         return;
 763                 }
 764
 765                 if (check_sgml_error(scanner)) {
 766                         token = set_sgml_error(scanner, string);
 767                         if (!token)
 768                                 return;
 769                 }
 770         }
 771
 772         token->type = SGML_TOKEN_PROCESS_DATA;
 773         token->string.length = length >= 0 ? length : string - token->string.string;
 774         token->precedence = get_sgml_precedence(token->type);
 775         scanner->position = string;
 776         scanner->state = SGML_STATE_TEXT;
 777 }
 778
 779
 780 /* Scanner multiplexor */
 781
 782 static struct dom_scanner_token *
 783 scan_sgml_tokens(struct dom_scanner *scanner)
 784 {
 785         struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS;
 786
 787         if (!begin_dom_token_scanning(scanner))
 788                 return get_dom_scanner_token(scanner);
 789
 790         /* Scan tokens until we fill the table */
 791         for (scanner->current = scanner->table + scanner->tokens;
 792              scanner->current < table_end && scanner->position < scanner->end;
 793              scanner->current++) {
 794                 if (scanner->state == SGML_STATE_ELEMENT
 795                     || (*scanner->position == '<'
 796                         && scanner->state != SGML_STATE_PROC_INST)) {
 797                         skip_sgml_space(scanner, &scanner->position);
 798                         if (scanner->position >= scanner->end) break;
 799
 800                         scan_sgml_element_token(scanner, scanner->current);
 801
 802                         /* Shall we scratch this token? */
 803                         if (scanner->current->type == SGML_TOKEN_SKIP) {
 804                                 scanner->current--;
 805                         }
 806
 807                 } else if (scanner->state == SGML_STATE_TEXT) {
 808                         scan_sgml_text_token(scanner, scanner->current);
 809
 810                 } else {
 811                         scan_sgml_proc_inst_token(scanner, scanner->current);
 812                 }
 813         }
 814
 815         return end_dom_token_scanning(scanner, scanner->current);
 816 }