Update ooo320-m1
[ooovba.git] / idlc / source / preproc / lex.c
blob1d0d16ee76de821878a79d1a3dc8daeca1da8c16
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: lex.c,v $
10 * $Revision: 1.7 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
34 #include <io.h>
35 #else
36 #include <unistd.h>
37 #endif
38 #include "cpp.h"
/*
 * lexical FSM encoding
 *  when in state state, and one of the characters
 *  in ch arrives, enter nextstate.
 *  States >= S_SELF are either final, or at least require special action.
 *  In 'fsm' there is a line for each state X charset X nextstate.
 *  List chars that overwrite previous entries later (e.g. C_ALPH
 *  can be overridden by '_' by a later entry); C_XX is
 *  the universal set, and should always be first.
 *  States at or above S_SELF are represented in the big table as
 *  negative (bitwise-complemented) values.
 *  S_SELF and S_SELFB encode the resulting token type in the upper bits.
 *  These actions differ in that S_SELF doesn't have a lookahead char,
 *  S_SELFB does.
 *
 * The encoding is blown out into a big table for time-efficiency.
 * Entries have
 *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
 */
/* Number of columns in bigfsm; also the value of the first action state. */
#define MAXSTATE        32
/* Pack a token type and an action state into one table entry. */
#define ACT(tok, act)   (((tok) << 7) + (act))
/* Bit marking an entry that needs the '?', '\\', newline special handling. */
#define QBSBIT          0100
/* Extract the 9-bit token type packed by ACT(). */
#define GETACT(st)      (((st) >> 7) & 0x1ff)
/*
 * character classes
 * These small values are used in the 'ch' list of an fsm entry in place
 * of a literal character; expandlex() expands C_XX, C_ALPH and C_NUM
 * into the matching set of characters.
 */
#define C_WS    1
#define C_ALPH  2
#define C_NUM   3
#define C_EOF   4
#define C_XX    5
71 enum state
73 START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
74 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
75 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
76 S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
77 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
/* NOTE(review): neither object is referenced in the visible part of this
 * file; presumably token statistics -- confirm against other users. */
int tottok;
int tokkind[256];
82 struct fsm
84 int state; /* if in this state */
85 uchar ch[4]; /* and see one of these characters */
86 int nextstate; /* enter this state if +ve */
89 /*const*/ struct fsm fsm[] = {
90 /* start state */
91 { START, {C_XX}, ACT(UNCLASS, S_SELF) },
92 { START, {' ', '\t', '\v'}, WS1 },
93 { START, {C_NUM}, NUM1 },
94 { START, {'.'}, NUM3 },
95 { START, {C_ALPH}, ID1 },
96 { START, {'L'}, ST1 },
97 { START, {'"'}, ST2 },
98 { START, {'\''}, CC1 },
99 { START, {'/'}, COM1 },
100 { START, {EOFC}, S_EOF },
101 { START, {'\n'}, S_NL },
102 { START, {'-'}, MINUS1 },
103 { START, {'+'}, PLUS1 },
104 { START, {'<'}, LT1 },
105 { START, {'>'}, GT1 },
106 { START, {'='}, ASG1 },
107 { START, {'!'}, NOT1 },
108 { START, {'&'}, AND1 },
109 { START, {'|'}, OR1 },
110 { START, {'#'}, SHARP1 },
111 { START, {'%'}, PCT1 },
112 { START, {'['}, ACT(SBRA, S_SELF) },
113 { START, {']'}, ACT(SKET, S_SELF) },
114 { START, {'('}, ACT(LP, S_SELF) },
115 { START, {')'}, ACT(RP, S_SELF) },
116 { START, {'*'}, STAR1 },
117 { START, {','}, ACT(COMMA, S_SELF) },
118 { START, {'?'}, ACT(QUEST, S_SELF) },
119 { START, {':'}, ACT(COLON, S_SELF) },
120 { START, {';'}, ACT(SEMIC, S_SELF) },
121 { START, {'{'}, ACT(CBRA, S_SELF) },
122 { START, {'}'}, ACT(CKET, S_SELF) },
123 { START, {'~'}, ACT(TILDE, S_SELF) },
124 { START, {'^'}, CIRC1 },
126 /* saw a digit */
127 { NUM1, {C_XX}, ACT(NUMBER, S_SELFB) },
128 { NUM1, {C_NUM, C_ALPH, '.'}, NUM1 },
129 { NUM1, {'E', 'e'}, NUM2 },
130 { NUM1, {'_'}, ACT(NUMBER, S_SELFB) },
132 /* saw possible start of exponent, digits-e */
133 { NUM2, {C_XX}, ACT(NUMBER, S_SELFB) },
134 { NUM2, {'+', '-'}, NUM1 },
135 { NUM2, {C_NUM, C_ALPH}, NUM1 },
136 { NUM2, {'_'}, ACT(NUMBER, S_SELFB) },
138 /* saw a '.', which could be a number or an operator */
139 { NUM3, {C_XX}, ACT(DOT, S_SELFB) },
140 { NUM3, {'.'}, DOTS1 },
141 { NUM3, {C_NUM}, NUM1 },
143 { DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB) },
144 { DOTS1, {C_NUM}, NUM1 },
145 { DOTS1, {'.'}, ACT(ELLIPS, S_SELF) },
147 /* saw a letter or _ */
148 { ID1, {C_XX}, ACT(NAME, S_NAME) },
149 { ID1, {C_ALPH, C_NUM}, ID1 },
151 /* saw L (start of wide string?) */
152 { ST1, {C_XX}, ACT(NAME, S_NAME) },
153 { ST1, {C_ALPH, C_NUM}, ID1 },
154 { ST1, {'"'}, ST2 },
155 { ST1, {'\''}, CC1 },
157 /* saw " beginning string */
158 { ST2, {C_XX}, ST2 },
159 { ST2, {'"'}, ACT(STRING, S_SELF) },
160 { ST2, {'\\'}, ST3 },
161 { ST2, {'\n'}, S_STNL },
162 { ST2, {EOFC}, S_EOFSTR },
164 /* saw \ in string */
165 { ST3, {C_XX}, ST2 },
166 { ST3, {'\n'}, S_STNL },
167 { ST3, {EOFC}, S_EOFSTR },
169 /* saw ' beginning character const */
170 { CC1, {C_XX}, CC1 },
171 { CC1, {'\''}, ACT(CCON, S_SELF) },
172 { CC1, {'\\'}, CC2 },
173 { CC1, {'\n'}, S_STNL },
174 { CC1, {EOFC}, S_EOFSTR },
176 /* saw \ in ccon */
177 { CC2, {C_XX}, CC1 },
178 { CC2, {'\n'}, S_STNL },
179 { CC2, {EOFC}, S_EOFSTR },
181 /* saw /, perhaps start of comment */
182 { COM1, {C_XX}, ACT(SLASH, S_SELFB) },
183 { COM1, {'='}, ACT(ASSLASH, S_SELF) },
184 { COM1, {'*'}, COM2 },
185 { COM1, {'/'}, COM4 },
187 /* saw / followed by *, start of comment */
188 { COM2, {C_XX}, COM2 },
189 { COM2, {'\n'}, S_COMNL },
190 { COM2, {'*'}, COM3 },
191 { COM2, {EOFC}, S_EOFCOM },
193 /* saw the * possibly ending a comment */
194 { COM3, {C_XX}, COM2 },
195 { COM3, {'\n'}, S_COMNL },
196 { COM3, {'*'}, COM3 },
197 { COM3, {'/'}, S_COMMENT },
199 /* // comment */
200 { COM4, {C_XX}, COM4 },
201 { COM4, {'\n'}, S_COMMENT },
202 /* { COM4, {'\n'}, S_NL }, */
203 { COM4, {EOFC}, S_EOFCOM },
205 /* saw white space, eat it up */
206 { WS1, {C_XX}, S_WS },
207 { WS1, {'\t', '\v', ' '}, WS1 },
209 /* saw -, check --, -=, -> */
210 { MINUS1, {C_XX}, ACT(MINUS, S_SELFB) },
211 { MINUS1, {'-'}, ACT(MMINUS, S_SELF) },
212 { MINUS1, {'='}, ACT(ASMINUS, S_SELF) },
213 { MINUS1, {'>'}, ACT(ARROW, S_SELF) },
215 /* saw +, check ++, += */
216 { PLUS1, {C_XX}, ACT(PLUS, S_SELFB) },
217 { PLUS1, {'+'}, ACT(PPLUS, S_SELF) },
218 { PLUS1, {'='}, ACT(ASPLUS, S_SELF) },
220 /* saw <, check <<, <<=, <= */
221 { LT1, {C_XX}, ACT(LT, S_SELFB) },
222 { LT1, {'<'}, LT2 },
223 { LT1, {'='}, ACT(LEQ, S_SELF) },
224 { LT2, {C_XX}, ACT(LSH, S_SELFB) },
225 { LT2, {'='}, ACT(ASLSH, S_SELF) },
227 /* saw >, check >>, >>=, >= */
228 { GT1, {C_XX}, ACT(GT, S_SELFB) },
229 { GT1, {'>'}, GT2 },
230 { GT1, {'='}, ACT(GEQ, S_SELF) },
231 { GT2, {C_XX}, ACT(RSH, S_SELFB) },
232 { GT2, {'='}, ACT(ASRSH, S_SELF) },
234 /* = */
235 { ASG1, {C_XX}, ACT(ASGN, S_SELFB) },
236 { ASG1, {'='}, ACT(EQ, S_SELF) },
238 /* ! */
239 { NOT1, {C_XX}, ACT(NOT, S_SELFB) },
240 { NOT1, {'='}, ACT(NEQ, S_SELF) },
242 /* & */
243 { AND1, {C_XX}, ACT(AND, S_SELFB) },
244 { AND1, {'&'}, ACT(LAND, S_SELF) },
245 { AND1, {'='}, ACT(ASAND, S_SELF) },
247 /* | */
248 { OR1, {C_XX}, ACT(OR, S_SELFB) },
249 { OR1, {'|'}, ACT(LOR, S_SELF) },
250 { OR1, {'='}, ACT(ASOR, S_SELF) },
252 /* # */
253 { SHARP1, {C_XX}, ACT(SHARP, S_SELFB) },
254 { SHARP1, {'#'}, ACT(DSHARP, S_SELF) },
256 /* % */
257 { PCT1, {C_XX}, ACT(PCT, S_SELFB) },
258 { PCT1, {'='}, ACT(ASPCT, S_SELF) },
260 /* * */
261 { STAR1, {C_XX}, ACT(STAR, S_SELFB) },
262 { STAR1, {'='}, ACT(ASSTAR, S_SELF) },
264 /* ^ */
265 { CIRC1, {C_XX}, ACT(CIRC, S_SELFB) },
266 { CIRC1, {'='}, ACT(ASCIRC, S_SELF) },
268 { -1, {'\0'}, S_SELF }
271 /* first index is char, second is state */
272 /* increase #states to power of 2 to encourage use of shift */
273 short bigfsm[256][MAXSTATE];
275 void
276 expandlex(void)
278 /* const */ struct fsm *fp;
279 int i, j, nstate;
281 for (fp = fsm; fp->state >= 0; fp++)
283 for (i = 0; fp->ch[i]; i++)
285 nstate = fp->nextstate;
286 if (nstate >= S_SELF)
287 nstate = ~nstate;
288 switch (fp->ch[i])
291 case C_XX: /* random characters */
292 for (j = 0; j < 256; j++)
293 bigfsm[j][fp->state] = (short) nstate;
294 continue;
295 case C_ALPH:
296 for (j = 0; j <= 256; j++)
297 if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z')
298 || j == '_')
299 bigfsm[j][fp->state] = (short) nstate;
300 continue;
301 case C_NUM:
302 for (j = '0'; j <= '9'; j++)
303 bigfsm[j][fp->state] = (short) nstate;
304 continue;
305 default:
306 bigfsm[fp->ch[i]][fp->state] = (short) nstate;
312 * install special cases for ? (trigraphs), \ (splicing), runes, and
313 * EOB
315 for (i = 0; i < MAXSTATE; i++)
317 for (j = 0; j < 0xFF; j++)
318 if (j == '?' || j == '\\' || j == '\n' || j == '\r')
320 if (bigfsm[j][i] > 0)
321 bigfsm[j][i] = ~bigfsm[j][i];
322 bigfsm[j][i] &= ~QBSBIT;
324 bigfsm[EOB][i] = ~S_EOB;
325 if (bigfsm[EOFC][i] >= 0)
326 bigfsm[EOFC][i] = ~S_EOF;
331 * fill in a row of tokens from input, terminated by NL or END
332 * First token is put at trp->lp.
333 * Reset is non-zero when the input buffer can be "rewound."
334 * The value is a flag indicating that possible macros have
335 * been seen in the row.
338 gettokens(Tokenrow * trp, int reset)
340 register int c, state, oldstate;
341 register uchar *ip;
342 register Token *tp, *maxp;
343 int runelen;
344 Source *s = cursource;
345 int nmac = 0;
347 tp = trp->lp;
348 ip = s->inp;
349 if (reset)
351 s->lineinc = 0;
352 if (ip >= s->inl)
353 { /* nothing in buffer */
354 s->inl = s->inb;
355 fillbuf(s);
356 ip = s->inp = s->inb;
358 else
359 if (ip >= s->inb + (3 * INS / 4))
361 memmove(s->inb, ip, 4 + s->inl - ip);
362 s->inl = s->inb + (s->inl - ip);
363 ip = s->inp = s->inb;
366 maxp = &trp->bp[trp->max];
367 runelen = 1;
368 for (;;)
370 continue2:
371 if (tp >= maxp)
373 trp->lp = tp;
374 tp = growtokenrow(trp);
375 maxp = &trp->bp[trp->max];
377 tp->type = UNCLASS;
378 tp->t = ip;
379 tp->wslen = 0;
380 tp->flag = 0;
381 state = START;
382 for (;;)
384 oldstate = state;
386 c = *ip;
388 if ((state = bigfsm[c][state]) >= 0)
390 ip += runelen;
391 runelen = 1;
392 continue;
394 state = ~state;
395 reswitch:
396 switch (state & 0177)
398 case S_SELF:
399 ip += runelen;
400 runelen = 1;
401 case S_SELFB:
402 tp->type = (unsigned char) GETACT(state);
403 tp->len = ip - tp->t;
404 tp++;
405 goto continue2;
407 case S_NAME: /* like S_SELFB but with nmac check */
408 tp->type = NAME;
409 tp->len = ip - tp->t;
410 nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0);
411 tp++;
412 goto continue2;
414 case S_WS:
415 tp->wslen = ip - tp->t;
416 tp->t = ip;
417 state = START;
418 continue;
420 default:
421 if ((state & QBSBIT) == 0)
423 ip += runelen;
424 runelen = 1;
425 continue;
427 state &= ~QBSBIT;
428 s->inp = ip;
430 if (c == '\n')
432 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
434 if (s->inp[1] == '\r')
436 memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2);
437 s->inl -= 1;
440 goto reswitch;
443 if (c == '\r')
445 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
447 if (s->inp[1] == '\n')
449 memmove(s->inp, s->inp + 1, s->inl - s->inp + 1);
450 s->inl -= 1;
452 else
453 *s->inp = '\n';
455 state = oldstate;
456 continue;
459 if (c == '?')
460 { /* check trigraph */
461 if (trigraph(s))
463 state = oldstate;
464 continue;
466 goto reswitch;
468 if (c == '\\')
469 { /* line-folding */
470 if (foldline(s))
472 s->lineinc++;
473 state = oldstate;
474 continue;
476 goto reswitch;
478 error(WARNING, "Lexical botch in cpp");
479 ip += runelen;
480 runelen = 1;
481 continue;
483 case S_EOB:
484 s->inp = ip;
485 fillbuf(cursource);
486 state = oldstate;
487 continue;
489 case S_EOF:
490 tp->type = END;
491 tp->len = 0;
492 s->inp = ip;
493 if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1)
494 error(WARNING, "No newline at end of file");
495 trp->lp = tp + 1;
496 return nmac;
498 case S_STNL:
499 error(ERROR, "Unterminated string or char const");
500 case S_NL:
501 tp->t = ip;
502 tp->type = NL;
503 tp->len = 1;
504 tp->wslen = 0;
505 s->lineinc++;
506 s->inp = ip + 1;
507 trp->lp = tp + 1;
508 return nmac;
510 case S_EOFSTR:
511 error(FATAL, "EOF in string or char constant");
512 break;
514 case S_COMNL:
515 s->lineinc++;
516 state = COM2;
517 ip += runelen;
518 runelen = 1;
519 continue;
521 case S_EOFCOM:
522 error(WARNING, "EOF inside comment");
523 --ip;
524 case S_COMMENT:
525 if (!Cflag)
527 tp->t = ++ip;
528 tp->t[-1] = ' ';
529 tp->wslen = 1;
530 state = START;
531 continue;
533 else
535 runelen = 1;
536 /* s->lineinc = 0; */
537 tp->type = COMMENT;
538 tp->flag |= XTWS;
541 break;
543 ip += runelen;
544 runelen = 1;
545 tp->len = ip - tp->t;
546 tp++;
550 /* have seen ?; handle the trigraph it starts (if any) else 0 */
552 trigraph(Source * s)
554 uchar c;
556 while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
558 if (s->inp[1] != '?')
559 return 0;
560 c = 0;
561 switch (s->inp[2])
563 case '=':
564 c = '#';
565 break;
566 case '(':
567 c = '[';
568 break;
569 case '/':
570 c = '\\';
571 break;
572 case ')':
573 c = ']';
574 break;
575 case '\'':
576 c = '^';
577 break;
578 case '<':
579 c = '{';
580 break;
581 case '!':
582 c = '|';
583 break;
584 case '>':
585 c = '}';
586 break;
587 case '-':
588 c = '~';
589 break;
591 if (c)
593 *s->inp = c;
594 memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2);
595 s->inl -= 2;
597 return c;
601 foldline(Source * s)
603 int n = 1;
605 while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
607 /* skip DOS line ends */
608 if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) ||
609 ((s->inp[n] == '\n') && (s->inp[n+1] == '\r')))
610 n++;
612 if ((s->inp[n] == '\n') || (s->inp[n] == '\r'))
614 memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2);
615 s->inl -= n + 1;
616 return 1;
618 return 0;
622 fillbuf(Source * s)
624 int n;
626 if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0)
627 n = 0;
628 s->inl += n;
629 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB;
630 if (n == 0)
632 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC;
633 return EOF;
635 return 0;
639 * Push down to new source of characters.
640 * If fd>0 and str==NULL, then from a file `name';
641 * if fd==-1 and str, then from the string.
643 Source *
644 setsource(char *name, int path, int fd, char *str, int wrap)
646 Source *s = new(Source);
647 int len;
649 s->line = 1;
650 s->lineinc = 0;
651 s->fd = fd;
652 s->filename = name;
653 s->next = cursource;
654 s->ifdepth = 0;
655 s->pathdepth = path;
656 s->wrap = wrap;
658 cursource = s;
660 if (s->wrap)
661 genwrap(0);
663 /* slop at right for EOB */
664 if (str)
666 len = strlen(str);
667 s->inb = domalloc(len + 4);
668 s->inp = s->inb;
669 strncpy((char *) s->inp, str, len);
671 else
673 s->inb = domalloc(INS + 4);
674 s->inp = s->inb;
675 len = 0;
677 s->inl = s->inp + len;
678 s->inl[0] = s->inl[1] = EOB;
680 return s;
683 void
684 unsetsource(void)
686 Source *s = cursource;
688 if (s->wrap)
689 genwrap(1);
691 if (s->fd >= 0)
693 close(s->fd);
694 dofree(s->inb);
696 cursource = s->next;
697 dofree(s);