src/document/css/scanner.c

   1 /* CSS token scanner utilities */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9
  10 #include "elinks.h"
  11
  12 #include "document/css/scanner.h"
  13 #include "util/error.h"
  14 #include "util/scanner.h"
  15 #include "util/string.h"
  16
  17
  18 /* Bitmap entries for the CSS character groups used in the scanner table */
  19
  20 enum css_char_group {
  21         CSS_CHAR_ALPHA          = (1 << 0),
  22         CSS_CHAR_DIGIT          = (1 << 1),
  23         CSS_CHAR_HEX_DIGIT      = (1 << 2),
  24         CSS_CHAR_IDENT          = (1 << 3),
  25         CSS_CHAR_IDENT_START    = (1 << 4),
  26         CSS_CHAR_NEWLINE        = (1 << 5),
  27         CSS_CHAR_NON_ASCII      = (1 << 6),
  28         CSS_CHAR_SGML_MARKUP    = (1 << 7),
  29         CSS_CHAR_TOKEN          = (1 << 8),
  30         CSS_CHAR_TOKEN_START    = (1 << 9),
  31         CSS_CHAR_WHITESPACE     = (1 << 10),
  32 };
  33
/* Description of which css_char_group bits each character carries. Entries
 * either tag a range of characters (SCAN_TABLE_RANGE) or every character of
 * a string (SCAN_TABLE_STRING); the list is closed by SCAN_TABLE_END.
 * NOTE(review): the SCAN_TABLE_* macros are not defined in this file
 * (presumably util/scanner.h); confirm whether the bits of overlapping
 * entries are OR-ed together before reordering anything here. */
static const struct scan_table_info css_scan_table_info[] = {
        /* Decimal digits are also hex digits and may continue an identifier. */
        SCAN_TABLE_RANGE("0", '9', CSS_CHAR_DIGIT | CSS_CHAR_HEX_DIGIT | CSS_CHAR_IDENT),
        /* Letters: A-F/a-f additionally count as hex digits. */
        SCAN_TABLE_RANGE("A", 'F', CSS_CHAR_HEX_DIGIT),
        SCAN_TABLE_RANGE("A", 'Z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
        SCAN_TABLE_RANGE("a", 'f', CSS_CHAR_HEX_DIGIT),
        SCAN_TABLE_RANGE("a", 'z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
        /* For the octal-number impaired (me included): \241 is 161 in
         * decimal. --jonas */
        SCAN_TABLE_RANGE("\241", 255, CSS_CHAR_NON_ASCII | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),

        /* NOTE(review): the whitespace string ends in an explicit \000;
         * whether the NUL byte itself gets tagged depends on how
         * SCAN_TABLE_STRING measures its argument -- confirm. */
        SCAN_TABLE_STRING(" \f\n\r\t\v\000", CSS_CHAR_WHITESPACE),
        SCAN_TABLE_STRING("\f\n\r",      CSS_CHAR_NEWLINE),
        /* '-' may continue an identifier but is not listed as a start char. */
        SCAN_TABLE_STRING("-",           CSS_CHAR_IDENT),
        /* Characters that begin a multi-char token handled individually in
         * scan_css_token(). '.' is also tagged CSS_CHAR_TOKEN below, and
         * scan_css_token() tests CSS_CHAR_TOKEN first. */
        SCAN_TABLE_STRING(".#@!\"'<-/|^$*",      CSS_CHAR_TOKEN_START),
        /* Unicode escape (that we do not handle yet) + other special chars */
        SCAN_TABLE_STRING("\\_",         CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
        /* This should contain the most commonly used char tokens like ':'
         * and maybe a few garbage chars that people might put in their CSS
         * code */
        SCAN_TABLE_STRING("[({})];:,.>+~",       CSS_CHAR_TOKEN),
        /* Characters that make up "<!--", "-->", "<![CDATA[" and "]]>". */
        SCAN_TABLE_STRING("<![CDATA]->", CSS_CHAR_SGML_MARKUP),

        SCAN_TABLE_END,
};
  56
/* Known CSS names and the token types they map to. Each row holds the name,
 * the specific token type reported for it, and the general token type of the
 * context it is valid in (dimension unit, function name or @-keyword).
 * scan_css_token() passes that general type as the last argument of
 * map_scanner_string(); presumably the general type is returned unchanged
 * when no row matches -- TODO confirm against map_scanner_string().
 * NOTE(review): whether the lookup depends on the order of the rows is not
 * visible here; keep the order unless confirmed. */
static const struct scanner_string_mapping css_string_mappings[] = {
        /* Units following a number */
        { "Hz",         CSS_TOKEN_FREQUENCY,    CSS_TOKEN_DIMENSION },
        { "cm",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "deg",        CSS_TOKEN_ANGLE,        CSS_TOKEN_DIMENSION },
        { "em",         CSS_TOKEN_EM,           CSS_TOKEN_DIMENSION },
        { "ex",         CSS_TOKEN_EX,           CSS_TOKEN_DIMENSION },
        { "grad",       CSS_TOKEN_ANGLE,        CSS_TOKEN_DIMENSION },
        { "in",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "kHz",        CSS_TOKEN_FREQUENCY,    CSS_TOKEN_DIMENSION },
        { "mm",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "ms",         CSS_TOKEN_TIME,         CSS_TOKEN_DIMENSION },
        { "pc",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "pt",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "px",         CSS_TOKEN_LENGTH,       CSS_TOKEN_DIMENSION },
        { "rad",        CSS_TOKEN_ANGLE,        CSS_TOKEN_DIMENSION },
        { "s",          CSS_TOKEN_TIME,         CSS_TOKEN_DIMENSION },

        /* Function names followed by '(' */
        { "rgb",        CSS_TOKEN_RGB,          CSS_TOKEN_FUNCTION },
        { "url",        CSS_TOKEN_URL,          CSS_TOKEN_FUNCTION },

        /* Identifiers following '@' */
        { "charset",    CSS_TOKEN_AT_CHARSET,   CSS_TOKEN_AT_KEYWORD },
        { "font-face",  CSS_TOKEN_AT_FONT_FACE, CSS_TOKEN_AT_KEYWORD },
        { "import",     CSS_TOKEN_AT_IMPORT,    CSS_TOKEN_AT_KEYWORD },
        { "media",      CSS_TOKEN_AT_MEDIA,     CSS_TOKEN_AT_KEYWORD },
        { "page",       CSS_TOKEN_AT_PAGE,      CSS_TOKEN_AT_KEYWORD },

        /* Terminating entry */
        { NULL, CSS_TOKEN_NONE, CSS_TOKEN_NONE },
};
  85
/* Declared up front because css_scanner_info refers to it; the definition is
 * further down in this file. */
static struct scanner_token *scan_css_tokens(struct scanner *scanner);

/* The CSS scanner description: the string mapping table, the character
 * class description and the routine that scans tokens into the scanner's
 * table, given in struct scanner_info's positional member order.
 * NOTE(review): the check_css_table() macro in this file reads
 * css_scanner_info.scan_table, a member that is not initialized here; it is
 * presumably filled in elsewhere from css_scan_table_info -- confirm. */
struct scanner_info css_scanner_info = {
        css_string_mappings,
        css_scan_table_info,
        scan_css_tokens,
};
  93
  94 #define check_css_table(c, bit) (css_scanner_info.scan_table[(c)] & (bit))
  95
  96 #define scan_css(scanner, s, bit)                                       \
  97         while ((s) < (scanner)->end && check_css_table(*(s), bit)) (s)++;
  98
  99 #define scan_back_css(scanner, s, bit)                                  \
 100         while ((s) >= (scanner)->string && check_css_table(*(s), bit)) (s)--;
 101
 102 #define is_css_ident_start(c)   check_css_table(c, CSS_CHAR_IDENT_START)
 103 #define is_css_ident(c)         check_css_table(c, CSS_CHAR_IDENT)
 104 #define is_css_digit(c)         check_css_table(c, CSS_CHAR_DIGIT)
 105 #define is_css_hexdigit(c)      check_css_table(c, CSS_CHAR_HEX_DIGIT)
 106 #define is_css_char_token(c)    check_css_table(c, CSS_CHAR_TOKEN)
 107 #define is_css_token_start(c)   check_css_table(c, CSS_CHAR_TOKEN_START)
 108
 109
 110 #define skip_css(scanner, s, skipto)                                    \
 111         while (s < (scanner)->end                                       \
 112                && *(s) != (skipto)                                      \
 113                && check_css_precedence(*(s), skipto)) {                 \
 114                 if (isquote(*(s))) {                                    \
 115                         int size = (scanner)->end - (s);                \
 116                         unsigned char *end = memchr(s + 1, *(s), size); \
 117                                                                         \
 118                         if (end) (s) = end;                             \
 119                 }                                                       \
 120                 (s)++;                                                  \
 121         }
 122
 123
 124 static inline void
 125 scan_css_token(struct scanner *scanner, struct scanner_token *token)
 126 {
 127         unsigned char *string = scanner->position;
 128         unsigned char first_char = *string;
 129         enum css_token_type type = CSS_TOKEN_GARBAGE;
 130         int real_length = -1;
 131
 132         assert(first_char);
 133         token->string = string++;
 134
 135         if (is_css_char_token(first_char)) {
 136                 type = first_char;
 137
 138         } else if (is_css_digit(first_char) || first_char == '.') {
 139                 scan_css(scanner, string, CSS_CHAR_DIGIT);
 140
 141                 /* First scan the full number token */
 142                 if (*string == '.') {
 143                         string++;
 144
 145                         if (is_css_digit(*string)) {
 146                                 type = CSS_TOKEN_NUMBER;
 147                                 scan_css(scanner, string, CSS_CHAR_DIGIT);
 148                         }
 149                 }
 150
 151                 /* Check what kind of number we have */
 152                 if (*string == '%') {
 153                         if (first_char != '.')
 154                                 type = CSS_TOKEN_PERCENTAGE;
 155                         string++;
 156
 157                 } else if (!is_css_ident_start(*string)) {
 158                         type = CSS_TOKEN_NUMBER;
 159
 160                 } else {
 161                         unsigned char *ident = string;
 162
 163                         scan_css(scanner, string, CSS_CHAR_IDENT);
 164                         type = map_scanner_string(scanner, ident, string,
 165                                                   CSS_TOKEN_DIMENSION);
 166                 }
 167
 168         } else if (is_css_ident_start(first_char)) {
 169                 scan_css(scanner, string, CSS_CHAR_IDENT);
 170
 171                 if (*string == '(') {
 172                         unsigned char *function_end = string + 1;
 173
 174                         /* Make sure that we have an ending ')' */
 175                         skip_css(scanner, function_end, ')');
 176                         if (*function_end == ')') {
 177                                 type = map_scanner_string(scanner, token->string,
 178                                                 string, CSS_TOKEN_FUNCTION);
 179
 180                                 /* If it is not a known function just skip the
 181                                  * how arg stuff so we don't end up generating
 182                                  * a lot of useless tokens. */
 183                                 if (type == CSS_TOKEN_FUNCTION) {
 184                                         string = function_end;
 185
 186                                 } else if (type == CSS_TOKEN_URL) {
 187                                         /* Extracting the URL first removes any
 188                                          * leading or ending whitespace and
 189                                          * then see if the url is given in a
 190                                          * string. If that is the case the
 191                                          * string delimiters are also trimmed.
 192                                          * This is not totally correct because
 193                                          * we should of course handle escape
 194                                          * sequences .. but that will have to
 195                                          * be fixed later.  */
 196                                         unsigned char *from = string + 1;
 197                                         unsigned char *to = function_end - 1;
 198
 199                                         scan_css(scanner, from, CSS_CHAR_WHITESPACE);
 200                                         scan_back_css(scanner, to, CSS_CHAR_WHITESPACE);
 201
 202                                         if (isquote(*from)) from++;
 203                                         if (isquote(*to)) to--;
 204
 205                                         token->string = from;
 206                                         real_length = to - from + 1;
 207                                         assert(real_length >= 0);
 208                                         string = function_end;
 209                                 }
 210
 211                                 assert(type != CSS_TOKEN_RGB || *string == '(');
 212                                 assert(type != CSS_TOKEN_URL || *string == ')');
 213                                 assert(type != CSS_TOKEN_FUNCTION || *string == ')');
 214                         }
 215
 216                         string++;
 217
 218                 } else {
 219                         type = CSS_TOKEN_IDENT;
 220                 }
 221
 222         } else if (!is_css_token_start(first_char)) {
 223                 /* TODO: Better composing of error tokens. For now we just
 224                  * split them down into char tokens */
 225
 226         } else if (first_char == '#') {
 227                 /* Check whether it is hexcolor or hash token */
 228                 if (is_css_hexdigit(*string)) {
 229                         int hexdigits;
 230
 231                         scan_css(scanner, string, CSS_CHAR_HEX_DIGIT);
 232
 233                         /* Check that the hexdigit sequence is either 3 or 6
 234                          * chars and it isn't just start of some non-hex ident
 235                          * string. */
 236                         hexdigits = string - token->string - 1;
 237                         if ((hexdigits == 3 || hexdigits == 6)
 238                             && !is_css_ident(*string)) {
 239                                 type = CSS_TOKEN_HEX_COLOR;
 240                         } else {
 241                                 scan_css(scanner, string, CSS_CHAR_IDENT);
 242                                 type = CSS_TOKEN_HASH;
 243                         }
 244
 245                 } else if (is_css_ident(*string)) {
 246                         /* Not *_ident_start() because hashes are #<name>. */
 247                         scan_css(scanner, string, CSS_CHAR_IDENT);
 248                         type = CSS_TOKEN_HASH;
 249                 }
 250
 251         } else if (first_char == '@') {
 252                 /* Compose token containing @<ident> */
 253                 if (is_css_ident_start(*string)) {
 254                         unsigned char *ident = string;
 255
 256                         /* Scan both ident start and ident */
 257                         scan_css(scanner, string, CSS_CHAR_IDENT);
 258                         type = map_scanner_string(scanner, ident, string,
 259                                                   CSS_TOKEN_AT_KEYWORD);
 260                 }
 261
 262         } else if (first_char == '*') {
 263                 if (*string == '=') {
 264                         type = CSS_TOKEN_SELECT_CONTAINS;
 265                         string++;
 266                 } else {
 267                         type = CSS_TOKEN_IDENT;
 268                 }
 269
 270         } else if (first_char == '^') {
 271                 if (*string == '=') {
 272                         type = CSS_TOKEN_SELECT_BEGIN;
 273                         string++;
 274                 }
 275
 276         } else if (first_char == '$') {
 277                 if (*string == '=') {
 278                         type = CSS_TOKEN_SELECT_END;
 279                         string++;
 280                 }
 281
 282         } else if (first_char == '|') {
 283                 if (*string == '=') {
 284                         type = CSS_TOKEN_SELECT_HYPHEN_LIST;
 285                         string++;
 286                 }
 287
 288         } else if (first_char == '!') {
 289                 scan_css(scanner, string, CSS_CHAR_WHITESPACE);
 290                 if (!strncasecmp(string, "important", 9)) {
 291                         type = CSS_TOKEN_IMPORTANT;
 292                         string += 9;
 293                 }
 294
 295         } else if (isquote(first_char)) {
 296                 /* TODO: Escaped delimiters --jonas */
 297                 int size = scanner->end - string;
 298                 unsigned char *string_end = memchr(string, first_char, size);
 299
 300                 if (string_end) {
 301                         /* We don't want the delimiters in the token */
 302                         token->string++;
 303                         real_length = string_end - token->string;
 304                         string = string_end + 1;
 305                         type = CSS_TOKEN_STRING;
 306                 }
 307
 308         } else if (first_char == '<' || first_char == '-') {
 309                 /* Try to navigate SGML tagsoup */
 310
 311                 if (*string == '/') {
 312                         /* Some kind of SGML tag end ... better bail out screaming */
 313                         type = CSS_TOKEN_NONE;
 314
 315                 } else {
 316                         unsigned char *sgml = string;
 317
 318                         /* Skip anything looking like SGML "<!--" and "-->"
 319                          * comments + <![CDATA[ and ]]> notations. */
 320                         scan_css(scanner, sgml, CSS_CHAR_SGML_MARKUP);
 321
 322                         if (sgml - string >= 2
 323                             && ((first_char == '<' && *string == '!')
 324                                 || (first_char == '-' && sgml[-1] == '>'))) {
 325                                 type = CSS_TOKEN_SKIP;
 326                                 string = sgml;
 327                         }
 328                 }
 329
 330         } else if (first_char == '/') {
 331                 /* Comments */
 332                 if (*string == '*') {
 333                         type = CSS_TOKEN_SKIP;
 334
 335                         for (string++; string < scanner->end; string++)
 336                                 if (*string == '*' && string[1] == '/') {
 337                                         string += 2;
 338                                         break;
 339                                 }
 340                 }
 341
 342         } else {
 343                 INTERNAL("Someone forgot to put code for recognizing tokens "
 344                          "which start with '%c'.", first_char);
 345         }
 346
 347         token->type = type;
 348         token->length = real_length > 0 ? real_length : string - token->string;
 349         token->precedence = get_css_precedence(type);
 350         scanner->position = string;
 351 }
 352
 353 static struct scanner_token *
 354 scan_css_tokens(struct scanner *scanner)
 355 {
 356         struct scanner_token *table_end = scanner->table + SCANNER_TOKENS;
 357         struct scanner_token *current;
 358
 359         if (!begin_token_scanning(scanner))
 360                 return get_scanner_token(scanner);
 361
 362         /* Scan tokens until we fill the table */
 363         for (current = scanner->table + scanner->tokens;
 364              current < table_end && scanner->position < scanner->end;
 365              current++) {
 366                 scan_css(scanner, scanner->position, CSS_CHAR_WHITESPACE);
 367                 if (scanner->position >= scanner->end) break;
 368
 369                 scan_css_token(scanner, current);
 370
 371                 /* Did some one scream for us to end the madness? */
 372                 if (current->type == CSS_TOKEN_NONE) {
 373                         scanner->position = NULL;
 374                         current--;
 375                         break;
 376                 }
 377
 378                 /* Shall we scratch this token? */
 379                 if (current->type == CSS_TOKEN_SKIP) {
 380                         current--;
 381                 }
 382         }
 383
 384         return end_token_scanning(scanner, current);
 385 }