2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
17 * This file contains the "scanner", which tokenizes charmap files
18 * for iconv for processing by the higher level grammar processor.
27 #include <sys/types.h>
30 #include "parser.tab.h"
35 int mb_cur_max
= MB_LEN_MAX
;
40 static const char *filename
;
41 static int instring
= 0;
42 static int escaped
= 0;
45 * Token space ... grows on demand.
47 static char *token
= NULL
;
50 static int hadtok
= 0;
53 * The last keyword seen. This is useful to trigger the special lexer rules
54 * for "copy" and also collating symbols and elements.
57 static int category
= T_END
;
63 { T_COM_CHAR
, "comment_char" },
64 { T_ESC_CHAR
, "escape_char" },
68 * These are keywords used in the charmap file. Note that
69 * Solaris originally used angle brackets to wrap some of them,
70 * but we removed that to simplify our parser. The first of these
71 * items are "global items."
73 { T_CHARMAP
, "CHARMAP" },
75 { T_WIDTH_DEFAULT
, "WIDTH_DEFAULT" },
81 * These special words are only used in a charmap file, enclosed in <>.
83 static struct token symwords
[] = {
84 { T_COM_CHAR
, "comment_char" },
85 { T_ESC_CHAR
, "escape_char" },
86 { T_CODE_SET
, "code_set_name" },
87 { T_MB_CUR_MAX
, "mb_cur_max" },
88 { T_MB_CUR_MIN
, "mb_cur_min" },
92 static int categories
[] = {
98 reset_scanner(const char *fname
)
101 filename
= "<stdin>";
105 (void) fclose(input
);
106 if ((input
= fopen(fname
, "r")) == NULL
) {
124 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
/*
 * isodigit(x): non-zero when x is an octal digit character ('0'..'7').
 *
 * The parameter is parenthesised at every use so that arguments containing
 * lower-precedence operators (e.g. "c & 0x7f" or a conditional expression)
 * are compared as a whole.  Note that x is still evaluated twice, so do not
 * pass an expression with side effects.
 */
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
146 if (ungetc(c
, input
) < 0) {
147 yyerror(_("ungetc failed"));
159 yyerror(_("malformed hex digit"));
164 yyerror(_("malformed hex digit"));
167 v
= ((hex(c1
) << 4) | hex(c2
));
179 yyerror(_("malformed decimal digit"));
185 yyerror(_("malformed decimal digit"));
210 yyerror(_("malformed octal digit"));
216 yyerror(_("malformed octal digit"));
234 if ((tokidx
+ 1) >= toksz
) {
236 if ((token
= realloc(token
, toksz
)) == NULL
) {
237 yyerror(_("out of memory"));
244 token
[tokidx
++] = (char)c
;
253 if ((c
= scanc()) != esc_char
) {
262 return (scan_dec_byte());
265 return (scan_hex_byte());
274 /* put the character back so we can get it */
276 return (scan_oct_byte());
310 /* NB: yylval.mbs[0] is the length */
311 char *mbs
= &yylval
.mbs
[1];
316 if (mb_cur_max
> MB_LEN_MAX
) {
317 yyerror(_("max multibyte character size too big"));
321 if ((c
= get_byte()) == EOF
)
323 if (mbi
== mb_cur_max
) {
325 yyerror(_("length > mb_cur_max"));
332 /* result in yylval.mbs */
342 while ((c
= scanc()) != EOF
) {
347 add_tok(get_escaped(c
));
354 if (c
== '\n') { /* well that's strange! */
355 yyerror(_("unterminated symbolic name"));
358 if (c
== '>') { /* end of symbol */
361 * This restarts the token from the beginning
362 * the next time we scan a character. (This
363 * token is complete.)
367 yyerror(_("missing symbolic name"));
373 * A few symbols are handled as keywords outside
374 * of the normal categories.
376 if (category
== T_END
) {
378 for (i
= 0; symwords
[i
].name
!= 0; i
++) {
379 if (strcmp(token
, symwords
[i
].name
) ==
381 last_kw
= symwords
[i
].id
;
386 /* it's an undefined symbol */
387 yylval
.token
= strdup(token
);
388 if (yylval
.token
== NULL
) {
400 yyerror(_("unterminated symbolic name"));
416 * this one is special, because we don't want it to alter the
419 if (strcmp(token
, "...") == 0) {
423 /* search for reserved words first */
424 for (i
= 0; keywords
[i
].name
; i
++) {
426 if (strcmp(keywords
[i
].name
, token
) != 0) {
430 last_kw
= keywords
[i
].id
;
432 /* clear the top level category if we're done with it */
433 if (last_kw
== T_END
) {
437 /* set the top level category if we're changing */
438 for (j
= 0; categories
[j
]; j
++) {
439 if (categories
[j
] != last_kw
)
444 return (keywords
[i
].id
);
447 /* maybe it's a numeric constant? */
448 if (isdigit(*token
) || (*token
== '-' && isdigit(token
[1]))) {
450 yylval
.num
= strtol(token
, &eptr
, 10);
452 yyerror(_("malformed number"));
457 * A single lone character is treated as a character literal.
458 * To avoid duplication of effort, we stick it in the charmap.
461 yylval
.mbs
[0] = 1; /* length */
462 yylval
.mbs
[1] = token
[0];
463 yylval
.mbs
[2] = '\0';
467 /* anything else is treated as a symbolic name */
468 yylval
.token
= strdup(token
);
479 while ((c
= scanc()) != '\n') {
481 /* end of file without newline! */
482 errf(_("missing newline"));
496 filename
= "<stdin>";
499 while ((c
= scanc()) != EOF
) {
501 /* special handling for quoted string */
506 /* if newline, just eat and forget it */
510 if (strchr("xXd01234567", c
)) {
515 yylval
.mbs
[0] = 1; /* length */
516 yylval
.mbs
[1] = get_escaped(c
);
517 yylval
.mbs
[2] = '\0';
526 return (get_symbol());
528 /* oops! should generate syntax error */
534 yylval
.mbs
[0] = 1; /* length */
536 yylval
.mbs
[2] = '\0';
541 /* escaped characters first */
545 /* eat the newline */
550 /* an escape mid-token is nonsense */
554 /* numeric escapes are treated as wide characters */
555 if (strchr("xXd01234567", c
)) {
561 add_tok(get_escaped(c
));
565 /* if it is the escape character itself, note it */
571 /* remove from the comment char to end of line */
574 if ((c
= scanc()) == EOF
) {
575 /* end of file without newline! */
582 * If there were no tokens on this line,
583 * then just pretend it didn't exist at all.
591 if (strchr(" \t\n;()<>,\"", c
) && (tokidx
!= 0)) {
593 * These are all token delimiters. If there
594 * is a token already in progress, we need to
598 return (consume_token());
605 * If the line was completely devoid of tokens,
606 * then just ignore it.
610 /* we're starting a new line, reset the token state */
631 return (get_symbol());
634 /* whitespace, just ignore it */
650 yyerror(const char *msg
)
652 (void) fprintf(stderr
, _("%s: %d: error: %s\n"),
653 filename
, lineno
, msg
);
658 errf(const char *fmt
, ...)
664 (void) vasprintf(&msg
, fmt
, va
);
667 (void) fprintf(stderr
, _("%s: %d: error: %s\n"),
668 filename
, lineno
, msg
);
674 warn(const char *fmt
, ...)
680 (void) vasprintf(&msg
, fmt
, va
);
683 (void) fprintf(stderr
, _("%s: %d: warning: %s\n"),
684 filename
, lineno
, msg
);