grafthistory: support curl
[elinks/elinks-j605.git] / src / document / css / scanner.c
blob0781968c3c4f3b1921f06bf3c263f8dca3ee0855
1 /* CSS token scanner utilities */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdio.h>
8 #include <string.h>
10 #include "elinks.h"
12 #include "document/css/scanner.h"
13 #include "util/error.h"
14 #include "util/scanner.h"
15 #include "util/string.h"
/* Bitmap entries for the CSS character groups used in the scanner table.
 * Every group owns one bit, so a single table entry can place a character
 * in several groups and check_css_table() can test membership with a mask. */
enum css_char_group {
	CSS_CHAR_ALPHA		= (1 << 0),	/* A-Z and a-z */
	CSS_CHAR_DIGIT		= (1 << 1),	/* 0-9 */
	CSS_CHAR_HEX_DIGIT	= (1 << 2),	/* 0-9, A-F and a-f */
	CSS_CHAR_IDENT		= (1 << 3),	/* May appear inside an identifier */
	CSS_CHAR_IDENT_START	= (1 << 4),	/* May begin an identifier */
	CSS_CHAR_NEWLINE	= (1 << 5),	/* \f, \n and \r */
	CSS_CHAR_NON_ASCII	= (1 << 6),	/* Bytes \241 through 255 */
	CSS_CHAR_SGML_MARKUP	= (1 << 7),	/* Chars of <!-- --> and <![CDATA[ ]]> */
	CSS_CHAR_TOKEN		= (1 << 8),	/* Is a complete token by itself */
	CSS_CHAR_TOKEN_START	= (1 << 9),	/* Begins a specially scanned token */
	CSS_CHAR_WHITESPACE	= (1 << 10),	/* Blank chars, including NUL */
};
34 static const struct scan_table_info css_scan_table_info[] = {
35 SCAN_TABLE_RANGE("0", '9', CSS_CHAR_DIGIT | CSS_CHAR_HEX_DIGIT | CSS_CHAR_IDENT),
36 SCAN_TABLE_RANGE("A", 'F', CSS_CHAR_HEX_DIGIT),
37 SCAN_TABLE_RANGE("A", 'Z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
38 SCAN_TABLE_RANGE("a", 'f', CSS_CHAR_HEX_DIGIT),
39 SCAN_TABLE_RANGE("a", 'z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
40 /* For the octal number impared (me including) \241 is 161 --jonas */
41 SCAN_TABLE_RANGE("\241", 255, CSS_CHAR_NON_ASCII | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
43 SCAN_TABLE_STRING(" \f\n\r\t\v\000", CSS_CHAR_WHITESPACE),
44 SCAN_TABLE_STRING("\f\n\r", CSS_CHAR_NEWLINE),
45 SCAN_TABLE_STRING("-", CSS_CHAR_IDENT),
46 SCAN_TABLE_STRING(".#@!\"'<-/|^$*", CSS_CHAR_TOKEN_START),
47 /* Unicode escape (that we do not handle yet) + other special chars */
48 SCAN_TABLE_STRING("\\_", CSS_CHAR_IDENT | CSS_CHAR_IDENT_START),
49 /* This should contain mostly used char tokens like ':' and maybe a few
50 * garbage chars that people might put in their CSS code */
51 SCAN_TABLE_STRING("[({})];:,.>+~", CSS_CHAR_TOKEN),
52 SCAN_TABLE_STRING("<![CDATA]->", CSS_CHAR_SGML_MARKUP),
54 SCAN_TABLE_END,
57 static const struct scanner_string_mapping css_string_mappings[] = {
58 { "Hz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION },
59 { "cm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
60 { "deg", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION },
61 { "em", CSS_TOKEN_EM, CSS_TOKEN_DIMENSION },
62 { "ex", CSS_TOKEN_EX, CSS_TOKEN_DIMENSION },
63 { "grad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION },
64 { "in", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
65 { "kHz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION },
66 { "mm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
67 { "ms", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION },
68 { "pc", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
69 { "pt", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
70 { "px", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION },
71 { "rad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION },
72 { "s", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION },
74 { "rgb", CSS_TOKEN_RGB, CSS_TOKEN_FUNCTION },
75 { "url", CSS_TOKEN_URL, CSS_TOKEN_FUNCTION },
77 { "charset", CSS_TOKEN_AT_CHARSET, CSS_TOKEN_AT_KEYWORD },
78 { "font-face", CSS_TOKEN_AT_FONT_FACE, CSS_TOKEN_AT_KEYWORD },
79 { "import", CSS_TOKEN_AT_IMPORT, CSS_TOKEN_AT_KEYWORD },
80 { "media", CSS_TOKEN_AT_MEDIA, CSS_TOKEN_AT_KEYWORD },
81 { "page", CSS_TOKEN_AT_PAGE, CSS_TOKEN_AT_KEYWORD },
83 { NULL, CSS_TOKEN_NONE, CSS_TOKEN_NONE },
86 static struct scanner_token *scan_css_tokens(struct scanner *scanner);
88 struct scanner_info css_scanner_info = {
89 css_string_mappings,
90 css_scan_table_info,
91 scan_css_tokens,
/* Non-zero when character @c belongs to (any of) the group(s) in @bit.
 * @c is used as an array index, so pass an unsigned char value. */
#define	check_css_table(c, bit)	(css_scanner_info.scan_table[(c)] & (bit))

/* Advance @s while it stays below (scanner)->end and points at a character
 * of group @bit. @s is modified in place and evaluated several times, so it
 * has to be a plain lvalue. Wrapped in do/while(0) so the macro acts as one
 * statement, also under an unbraced if/else. */
#define	scan_css(scanner, s, bit)					\
	do {								\
		while ((s) < (scanner)->end				\
		       && check_css_table(*(s), bit))			\
			(s)++;						\
	} while (0)

/* Move @s backwards while it has not passed (scanner)->string and points at
 * a character of group @bit. The caller has to make sure that some character
 * outside of @bit precedes @s, otherwise @s ends up in front of the buffer. */
#define	scan_back_css(scanner, s, bit)					\
	do {								\
		while ((s) >= (scanner)->string				\
		       && check_css_table(*(s), bit))			\
			(s)--;						\
	} while (0)

/* Shorthands for testing a character against one group */
#define	is_css_ident_start(c)	check_css_table(c, CSS_CHAR_IDENT_START)
#define	is_css_ident(c)		check_css_table(c, CSS_CHAR_IDENT)
#define	is_css_digit(c)		check_css_table(c, CSS_CHAR_DIGIT)
#define	is_css_hexdigit(c)	check_css_table(c, CSS_CHAR_HEX_DIGIT)
#define	is_css_char_token(c)	check_css_table(c, CSS_CHAR_TOKEN)
#define	is_css_token_start(c)	check_css_table(c, CSS_CHAR_TOKEN_START)
/* Advance @s until it reaches (scanner)->end, points at the @skipto
 * character, or check_css_precedence() tells us to stop. Quoted strings are
 * jumped over as a whole so a @skipto character inside quotes is ignored; an
 * unterminated quote is skipped as a single character.
 *
 * The search for the closing quote starts one past the opening quote, so it
 * may cover only (end - s - 1) bytes; using (end - s) would make memchr()
 * look at the byte at (scanner)->end, outside the scanned range.
 *
 * @s is modified in place and evaluated several times, so it has to be a
 * plain lvalue. */
#define	skip_css(scanner, s, skipto)					\
	do {								\
		while ((s) < (scanner)->end				\
		       && *(s) != (skipto)				\
		       && check_css_precedence(*(s), skipto)) {		\
			if (isquote(*(s))) {				\
				int skip_css_size_			\
					= (scanner)->end - (s) - 1;	\
				unsigned char *skip_css_end_		\
					= memchr((s) + 1, *(s),		\
						 skip_css_size_);	\
									\
				if (skip_css_end_) (s) = skip_css_end_;	\
			}						\
			(s)++;						\
		}							\
	} while (0)
124 static inline void
125 scan_css_token(struct scanner *scanner, struct scanner_token *token)
127 unsigned char *string = scanner->position;
128 unsigned char first_char = *string;
129 enum css_token_type type = CSS_TOKEN_GARBAGE;
130 int real_length = -1;
132 assert(first_char);
133 token->string = string++;
135 if (is_css_char_token(first_char)) {
136 type = first_char;
138 } else if (is_css_digit(first_char) || first_char == '.') {
139 scan_css(scanner, string, CSS_CHAR_DIGIT);
141 /* First scan the full number token */
142 if (*string == '.') {
143 string++;
145 if (is_css_digit(*string)) {
146 type = CSS_TOKEN_NUMBER;
147 scan_css(scanner, string, CSS_CHAR_DIGIT);
151 /* Check what kind of number we have */
152 if (*string == '%') {
153 if (first_char != '.')
154 type = CSS_TOKEN_PERCENTAGE;
155 string++;
157 } else if (!is_css_ident_start(*string)) {
158 type = CSS_TOKEN_NUMBER;
160 } else {
161 unsigned char *ident = string;
163 scan_css(scanner, string, CSS_CHAR_IDENT);
164 type = map_scanner_string(scanner, ident, string,
165 CSS_TOKEN_DIMENSION);
168 } else if (is_css_ident_start(first_char)) {
169 scan_css(scanner, string, CSS_CHAR_IDENT);
171 if (*string == '(') {
172 unsigned char *function_end = string + 1;
174 /* Make sure that we have an ending ')' */
175 skip_css(scanner, function_end, ')');
176 if (*function_end == ')') {
177 type = map_scanner_string(scanner, token->string,
178 string, CSS_TOKEN_FUNCTION);
180 /* If it is not a known function just skip the
181 * how arg stuff so we don't end up generating
182 * a lot of useless tokens. */
183 if (type == CSS_TOKEN_FUNCTION) {
184 string = function_end;
186 } else if (type == CSS_TOKEN_URL) {
187 /* Extracting the URL first removes any
188 * leading or ending whitespace and
189 * then see if the url is given in a
190 * string. If that is the case the
191 * string delimiters are also trimmed.
192 * This is not totally correct because
193 * we should of course handle escape
194 * sequences .. but that will have to
195 * be fixed later. */
196 unsigned char *from = string + 1;
197 unsigned char *to = function_end - 1;
199 scan_css(scanner, from, CSS_CHAR_WHITESPACE);
200 scan_back_css(scanner, to, CSS_CHAR_WHITESPACE);
202 if (isquote(*from)) from++;
203 if (isquote(*to)) to--;
205 token->string = from;
206 real_length = to - from + 1;
207 assert(real_length >= 0);
208 string = function_end;
211 assert(type != CSS_TOKEN_RGB || *string == '(');
212 assert(type != CSS_TOKEN_URL || *string == ')');
213 assert(type != CSS_TOKEN_FUNCTION || *string == ')');
216 string++;
218 } else {
219 type = CSS_TOKEN_IDENT;
222 } else if (!is_css_token_start(first_char)) {
223 /* TODO: Better composing of error tokens. For now we just
224 * split them down into char tokens */
226 } else if (first_char == '#') {
227 /* Check whether it is hexcolor or hash token */
228 if (is_css_hexdigit(*string)) {
229 int hexdigits;
231 scan_css(scanner, string, CSS_CHAR_HEX_DIGIT);
233 /* Check that the hexdigit sequence is either 3 or 6
234 * chars and it isn't just start of some non-hex ident
235 * string. */
236 hexdigits = string - token->string - 1;
237 if ((hexdigits == 3 || hexdigits == 6)
238 && !is_css_ident(*string)) {
239 type = CSS_TOKEN_HEX_COLOR;
240 } else {
241 scan_css(scanner, string, CSS_CHAR_IDENT);
242 type = CSS_TOKEN_HASH;
245 } else if (is_css_ident(*string)) {
246 /* Not *_ident_start() because hashes are #<name>. */
247 scan_css(scanner, string, CSS_CHAR_IDENT);
248 type = CSS_TOKEN_HASH;
251 } else if (first_char == '@') {
252 /* Compose token containing @<ident> */
253 if (is_css_ident_start(*string)) {
254 unsigned char *ident = string;
256 /* Scan both ident start and ident */
257 scan_css(scanner, string, CSS_CHAR_IDENT);
258 type = map_scanner_string(scanner, ident, string,
259 CSS_TOKEN_AT_KEYWORD);
262 } else if (first_char == '*') {
263 if (*string == '=') {
264 type = CSS_TOKEN_SELECT_CONTAINS;
265 string++;
266 } else {
267 type = CSS_TOKEN_IDENT;
270 } else if (first_char == '^') {
271 if (*string == '=') {
272 type = CSS_TOKEN_SELECT_BEGIN;
273 string++;
276 } else if (first_char == '$') {
277 if (*string == '=') {
278 type = CSS_TOKEN_SELECT_END;
279 string++;
282 } else if (first_char == '|') {
283 if (*string == '=') {
284 type = CSS_TOKEN_SELECT_HYPHEN_LIST;
285 string++;
288 } else if (first_char == '!') {
289 scan_css(scanner, string, CSS_CHAR_WHITESPACE);
290 if (!strncasecmp(string, "important", 9)) {
291 type = CSS_TOKEN_IMPORTANT;
292 string += 9;
295 } else if (isquote(first_char)) {
296 /* TODO: Escaped delimiters --jonas */
297 int size = scanner->end - string;
298 unsigned char *string_end = memchr(string, first_char, size);
300 if (string_end) {
301 /* We don't want the delimiters in the token */
302 token->string++;
303 real_length = string_end - token->string;
304 string = string_end + 1;
305 type = CSS_TOKEN_STRING;
308 } else if (first_char == '<' || first_char == '-') {
309 /* Try to navigate SGML tagsoup */
311 if (*string == '/') {
312 /* Some kind of SGML tag end ... better bail out screaming */
313 type = CSS_TOKEN_NONE;
315 } else {
316 unsigned char *sgml = string;
318 /* Skip anything looking like SGML "<!--" and "-->"
319 * comments + <![CDATA[ and ]]> notations. */
320 scan_css(scanner, sgml, CSS_CHAR_SGML_MARKUP);
322 if (sgml - string >= 2
323 && ((first_char == '<' && *string == '!')
324 || (first_char == '-' && sgml[-1] == '>'))) {
325 type = CSS_TOKEN_SKIP;
326 string = sgml;
330 } else if (first_char == '/') {
331 /* Comments */
332 if (*string == '*') {
333 type = CSS_TOKEN_SKIP;
335 for (string++; string < scanner->end; string++)
336 if (*string == '*' && string[1] == '/') {
337 string += 2;
338 break;
342 } else {
343 INTERNAL("Someone forgot to put code for recognizing tokens "
344 "which start with '%c'.", first_char);
347 token->type = type;
348 token->length = real_length > 0 ? real_length : string - token->string;
349 token->precedence = get_css_precedence(type);
350 scanner->position = string;
353 static struct scanner_token *
354 scan_css_tokens(struct scanner *scanner)
356 struct scanner_token *table_end = scanner->table + SCANNER_TOKENS;
357 struct scanner_token *current;
359 if (!begin_token_scanning(scanner))
360 return get_scanner_token(scanner);
362 /* Scan tokens until we fill the table */
363 for (current = scanner->table + scanner->tokens;
364 current < table_end && scanner->position < scanner->end;
365 current++) {
366 scan_css(scanner, scanner->position, CSS_CHAR_WHITESPACE);
367 if (scanner->position >= scanner->end) break;
369 scan_css_token(scanner, current);
371 /* Did some one scream for us to end the madness? */
372 if (current->type == CSS_TOKEN_NONE) {
373 scanner->position = NULL;
374 current--;
375 break;
378 /* Shall we scratch this token? */
379 if (current->type == CSS_TOKEN_SKIP) {
380 current--;
384 return end_token_scanning(scanner, current);