8322 nl: misleading-indentation
[unleashed/tickless.git] / usr / src / cmd / iconv / scanner.c
blob5c536952823a844c9efea1e1f66034f0a28ae5a1
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
17 * This file contains the "scanner", which tokenizes charmap files
18 * for iconv for processing by the higher level grammar processor.
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <limits.h>
25 #include <string.h>
26 #include <widec.h>
27 #include <sys/types.h>
28 #include <assert.h>
29 #include "charmap.h"
30 #include "parser.tab.h"
32 int com_char = '#';
33 int esc_char = '\\';
34 int mb_cur_min = 1;
35 int mb_cur_max = MB_LEN_MAX;
36 int lineno = 1;
37 int warnings = 0;
38 static int nextline;
39 static FILE *input = stdin;
40 static const char *filename = "<stdin>";
41 static int instring = 0;
42 static int escaped = 0;
45 * Token space ... grows on demand.
47 static char *token = NULL;
48 static int tokidx;
49 static int toksz = 0;
50 static int hadtok = 0;
53 * The last keyword seen. This is useful to trigger the special lexer rules
54 * for "copy" and also collating symbols and elements.
56 int last_kw = 0;
57 static int category = T_END;
59 static struct token {
60 int id;
61 const char *name;
62 } keywords[] = {
63 { T_COM_CHAR, "comment_char" },
64 { T_ESC_CHAR, "escape_char" },
65 { T_END, "END" },
68 * These are keywords used in the charmap file. Note that
69 * Solaris orginally used angle brackets to wrap some of them,
70 * but we removed that to simplify our parser. The first of these
71 * items are "global items."
73 { T_CHARMAP, "CHARMAP" },
74 { T_WIDTH, "WIDTH" },
75 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" },
77 { -1, NULL },
81 * These special words are only used in a charmap file, enclosed in <>.
83 static struct token symwords[] = {
84 { T_COM_CHAR, "comment_char" },
85 { T_ESC_CHAR, "escape_char" },
86 { T_CODE_SET, "code_set_name" },
87 { T_MB_CUR_MAX, "mb_cur_max" },
88 { T_MB_CUR_MIN, "mb_cur_min" },
89 { -1, NULL },
92 static int categories[] = {
93 T_CHARMAP,
97 void
98 reset_scanner(const char *fname)
100 if (fname == NULL) {
101 filename = "<stdin>";
102 input = stdin;
103 } else {
104 if (input != stdin)
105 (void) fclose(input);
106 if ((input = fopen(fname, "r")) == NULL) {
107 perror(fname);
108 exit(1);
110 filename = fname;
112 com_char = '#';
113 esc_char = '\\';
114 instring = 0;
115 escaped = 0;
116 lineno = 1;
117 nextline = 1;
118 tokidx = 0;
119 last_kw = 0;
120 category = T_END;
/*
 * hex(x) yields the value (0-15) of the hexadecimal digit character x; the
 * caller is expected to have checked x with isxdigit() first.
 * isodigit(x) is nonzero when x is one of the characters '0' through '7'.
 *
 * Every use of the argument is parenthesised so that an expression argument
 * expands correctly.  The argument is still evaluated more than once, so it
 * must not have side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
127 static int
128 scanc(void)
130 int c;
132 c = getc(input);
133 lineno = nextline;
134 if (c == '\n') {
135 nextline++;
137 return (c);
140 static void
141 unscanc(int c)
143 if (c == '\n') {
144 nextline--;
146 if (ungetc(c, input) < 0) {
147 yyerror(_("ungetc failed"));
/*
 * Read exactly two hexadecimal digits and return the byte they encode,
 * most significant digit first.  Anything else is reported through
 * yyerror() as "malformed hex digit".
 */
static int
scan_hex_byte(void)
{
	int value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		int digit = scanc();

		if (!isxdigit(digit)) {
			yyerror(_("malformed hex digit"));
			return (0);
		}
		value = (value << 4) | hex(digit);
	}
	return (value);
}
/*
 * Read a decimal byte constant of two or three digits and return its value.
 * A third character that is not a digit is pushed back for the next read.
 *
 * Fewer than two digits is reported as "malformed decimal digit".  Three
 * digits can spell values up to 999, which do not fit in a byte; those are
 * rejected here rather than being silently truncated when the caller
 * stores the result in a char.
 */
static int
scan_dec_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isdigit(c1)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isdigit(c2)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	b = (b * 10) + (c2 - '0');

	/* the third digit is optional */
	c3 = scanc();
	if (!isdigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 10) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("decimal value out of range"));
		return (0);
	}
	return (b);
}
/*
 * Read an octal byte constant of two or three digits and return its value.
 * A third character that is not an octal digit is pushed back for the next
 * read.
 *
 * Fewer than two digits is reported as "malformed octal digit".  Three
 * octal digits can spell values up to 0777, which do not fit in a byte;
 * those are rejected here rather than being silently truncated when the
 * caller stores the result in a char.
 */
static int
scan_oct_byte(void)
{
	int c1, c2, c3;
	int b;

	c1 = scanc();
	if (!isodigit(c1)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = c1 - '0';

	c2 = scanc();
	if (!isodigit(c2)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	b = (b * 8) + (c2 - '0');

	/* the third digit is optional */
	c3 = scanc();
	if (!isodigit(c3)) {
		unscanc(c3);
	} else {
		b = (b * 8) + (c3 - '0');
	}

	if (b > UCHAR_MAX) {
		yyerror(_("octal value out of range"));
		return (0);
	}
	return (b);
}
231 void
232 add_tok(int c)
234 if ((tokidx + 1) >= toksz) {
235 toksz += 64;
236 if ((token = realloc(token, toksz)) == NULL) {
237 yyerror(_("out of memory"));
238 tokidx = 0;
239 toksz = 0;
240 return;
244 token[tokidx++] = (char)c;
245 token[tokidx] = 0;
248 static int
249 get_byte(void)
251 int c;
253 if ((c = scanc()) != esc_char) {
254 unscanc(c);
255 return (EOF);
257 c = scanc();
259 switch (c) {
260 case 'd':
261 case 'D':
262 return (scan_dec_byte());
263 case 'x':
264 case 'X':
265 return (scan_hex_byte());
266 case '0':
267 case '1':
268 case '2':
269 case '3':
270 case '4':
271 case '5':
272 case '6':
273 case '7':
274 /* put the character back so we can get it */
275 unscanc(c);
276 return (scan_oct_byte());
277 default:
278 unscanc(c);
279 unscanc(esc_char);
280 return (EOF);
/*
 * Translate the character that followed the escape character.  The letters
 * n, r, t, f, v, b and a map to the control characters they name in C;
 * every other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const char letters[] = "nrtfvba";
	static const char controls[] = "\n\r\t\f\v\b\a";
	int i;

	for (i = 0; letters[i] != '\0'; i++) {
		if (c == letters[i])
			return (controls[i]);
	}
	return (c);
}
308 get_wide(void)
310 /* NB: yylval.mbs[0] is the length */
311 char *mbs = &yylval.mbs[1];
312 int mbi = 0;
313 int c;
315 mbs[mbi] = 0;
316 if (mb_cur_max > MB_LEN_MAX) {
317 yyerror(_("max multibyte character size too big"));
318 return (T_NULL);
320 for (;;) {
321 if ((c = get_byte()) == EOF)
322 break;
323 if (mbi == mb_cur_max) {
324 unscanc(c);
325 yyerror(_("length > mb_cur_max"));
326 return (T_NULL);
328 mbs[mbi++] = c;
329 mbs[mbi] = 0;
332 /* result in yylval.mbs */
333 mbs[-1] = mbi;
334 return (T_CHAR);
338 get_symbol(void)
340 int c;
342 while ((c = scanc()) != EOF) {
343 if (escaped) {
344 escaped = 0;
345 if (c == '\n')
346 continue;
347 add_tok(get_escaped(c));
348 continue;
350 if (c == esc_char) {
351 escaped = 1;
352 continue;
354 if (c == '\n') { /* well that's strange! */
355 yyerror(_("unterminated symbolic name"));
356 continue;
358 if (c == '>') { /* end of symbol */
361 * This restarts the token from the beginning
362 * the next time we scan a character. (This
363 * token is complete.)
366 if (token == NULL) {
367 yyerror(_("missing symbolic name"));
368 return (T_NULL);
370 tokidx = 0;
373 * A few symbols are handled as keywords outside
374 * of the normal categories.
376 if (category == T_END) {
377 int i;
378 for (i = 0; symwords[i].name != 0; i++) {
379 if (strcmp(token, symwords[i].name) ==
380 0) {
381 last_kw = symwords[i].id;
382 return (last_kw);
386 /* its an undefined symbol */
387 yylval.token = strdup(token);
388 if (yylval.token == NULL) {
389 perror("malloc");
390 exit(1);
392 token = NULL;
393 toksz = 0;
394 tokidx = 0;
395 return (T_SYMBOL);
397 add_tok(c);
400 yyerror(_("unterminated symbolic name"));
401 return (EOF);
405 static int
406 consume_token(void)
408 int len = tokidx;
409 int i;
411 tokidx = 0;
412 if (token == NULL)
413 return (T_NULL);
416 * this one is special, because we don't want it to alter the
417 * last_kw field.
419 if (strcmp(token, "...") == 0) {
420 return (T_ELLIPSIS);
423 /* search for reserved words first */
424 for (i = 0; keywords[i].name; i++) {
425 int j;
426 if (strcmp(keywords[i].name, token) != 0) {
427 continue;
430 last_kw = keywords[i].id;
432 /* clear the top level category if we're done with it */
433 if (last_kw == T_END) {
434 category = T_END;
437 /* set the top level category if we're changing */
438 for (j = 0; categories[j]; j++) {
439 if (categories[j] != last_kw)
440 continue;
441 category = last_kw;
444 return (keywords[i].id);
447 /* maybe its a numeric constant? */
448 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
449 char *eptr;
450 yylval.num = strtol(token, &eptr, 10);
451 if (*eptr != 0)
452 yyerror(_("malformed number"));
453 return (T_NUMBER);
457 * A single lone character is treated as a character literal.
458 * To avoid duplication of effort, we stick in the charmap.
460 if (len == 1) {
461 yylval.mbs[0] = 1; /* length */
462 yylval.mbs[1] = token[0];
463 yylval.mbs[2] = '\0';
464 return (T_CHAR);
467 /* anything else is treated as a symbolic name */
468 yylval.token = strdup(token);
469 token = NULL;
470 toksz = 0;
471 tokidx = 0;
472 return (T_NAME);
/*
 * Discard input up to and including the next newline.  Reaching end of
 * input first is reported through errf() as "missing newline".
 */
void
scan_to_eol(void)
{
	int c;

	for (;;) {
		c = scanc();
		if (c == '\n')
			break;
		if (c == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(c == '\n');
}
490 yylex(void)
492 int c;
494 while ((c = scanc()) != EOF) {
496 /* special handling for quoted string */
497 if (instring) {
498 if (escaped) {
499 escaped = 0;
501 /* if newline, just eat and forget it */
502 if (c == '\n')
503 continue;
505 if (strchr("xXd01234567", c)) {
506 unscanc(c);
507 unscanc(esc_char);
508 return (get_wide());
510 yylval.mbs[0] = 1; /* length */
511 yylval.mbs[1] = get_escaped(c);
512 yylval.mbs[2] = '\0';
513 return (T_CHAR);
515 if (c == esc_char) {
516 escaped = 1;
517 continue;
519 switch (c) {
520 case '<':
521 return (get_symbol());
522 case '>':
523 /* oops! should generate syntax error */
524 return (T_GT);
525 case '"':
526 instring = 0;
527 return (T_QUOTE);
528 default:
529 yylval.mbs[0] = 1; /* length */
530 yylval.mbs[1] = c;
531 yylval.mbs[2] = '\0';
532 return (T_CHAR);
536 /* escaped characters first */
537 if (escaped) {
538 escaped = 0;
539 if (c == '\n') {
540 /* eat the newline */
541 continue;
543 hadtok = 1;
544 if (tokidx) {
545 /* an escape mid-token is nonsense */
546 return (T_NULL);
549 /* numeric escapes are treated as wide characters */
550 if (strchr("xXd01234567", c)) {
551 unscanc(c);
552 unscanc(esc_char);
553 return (get_wide());
556 add_tok(get_escaped(c));
557 continue;
560 /* if it is the escape charter itself note it */
561 if (c == esc_char) {
562 escaped = 1;
563 continue;
566 /* remove from the comment char to end of line */
567 if (c == com_char) {
568 while (c != '\n') {
569 if ((c = scanc()) == EOF) {
570 /* end of file without newline! */
571 return (EOF);
574 assert(c == '\n');
575 if (!hadtok) {
577 * If there were no tokens on this line,
578 * then just pretend it didn't exist at all.
580 continue;
582 hadtok = 0;
583 return (T_NL);
586 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
588 * These are all token delimiters. If there
589 * is a token already in progress, we need to
590 * process it.
592 unscanc(c);
593 return (consume_token());
596 switch (c) {
597 case '\n':
598 if (!hadtok) {
600 * If the line was completely devoid of tokens,
601 * then just ignore it.
603 continue;
605 /* we're starting a new line, reset the token state */
606 hadtok = 0;
607 return (T_NL);
608 case ',':
609 hadtok = 1;
610 return (T_COMMA);
611 case ';':
612 hadtok = 1;
613 return (T_SEMI);
614 case '(':
615 hadtok = 1;
616 return (T_LPAREN);
617 case ')':
618 hadtok = 1;
619 return (T_RPAREN);
620 case '>':
621 hadtok = 1;
622 return (T_GT);
623 case '<':
624 /* symbol start! */
625 hadtok = 1;
626 return (get_symbol());
627 case ' ':
628 case '\t':
629 /* whitespace, just ignore it */
630 continue;
631 case '"':
632 hadtok = 1;
633 instring = 1;
634 return (T_QUOTE);
635 default:
636 hadtok = 1;
637 add_tok(c);
638 continue;
641 return (EOF);
644 void
645 yyerror(const char *msg)
647 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
648 filename, lineno, msg);
649 exit(1);
652 void
653 errf(const char *fmt, ...)
655 char *msg;
657 va_list va;
658 va_start(va, fmt);
659 (void) vasprintf(&msg, fmt, va);
660 va_end(va);
662 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
663 filename, lineno, msg);
664 free(msg);
665 exit(1);
668 void
669 warn(const char *fmt, ...)
671 char *msg;
673 va_list va;
674 va_start(va, fmt);
675 (void) vasprintf(&msg, fmt, va);
676 va_end(va);
678 (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
679 filename, lineno, msg);
680 free(msg);
681 warnings++;