masterfix OOO330: #i10000# BrOffice removed
[LibreOffice.git] / soltools / cpp / _lex.c
blob2ff188ff2264c491b7a3bdd7d2e696add493f34e
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
5 #include <io.h>
6 #else
7 #include <unistd.h>
8 #endif
9 #include "cpp.h"
/*
 * lexical FSM encoding
 *  when in state state, and one of the characters
 *  in ch arrives, enter nextstate.
 *  States >= S_SELF are either final, or at least require special action.
 *  In 'fsm' there is a line for each state X charset X nextstate.
 *  List chars that overwrite previous entries later (e.g. C_ALPH
 *  can be overridden by '_' by a later entry); C_XX is
 *  the universal set, and should always be first.
 *  States above S_SELF are represented in the big table as negative values.
 *  S_SELF and S_SELFB encode the resulting token type in the upper bits.
 *  These actions differ in that S_SELF doesn't have a lookahead char,
 *  S_SELFB does.
 *
 * The encoding is blown out into a big table for time-efficiency.
 * Entries have
 *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
 */
/* Second dimension of bigfsm; a power of two so indexing can use a shift. */
#define MAXSTATE        32
/* Pack token type `tok` (upper bits) and action/state `act` (low 7 bits). */
#define ACT(tok, act)   (((tok) << 7) + (act))
/*
 * Bit 6 of a decoded action: set for the characters expandlex() marks
 * as needing special handling ('?', '\\', '\n', '\r').
 */
#define QBSBIT          0100
/* Recover the token type packed into an entry by ACT(). */
#define GETACT(st)      (((st) >> 7) & 0x1ff)
/*
 * character classes
 * Usable in the ch[] field of an fsm row.  expandlex() expands C_XX
 * (every character), C_ALPH (letters and '_') and C_NUM (digits);
 * C_WS and C_EOF are not referenced by the table in this file.
 */
#define C_WS    1
#define C_ALPH  2
#define C_NUM   3
#define C_EOF   4
#define C_XX    5
41 enum state
43 START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
44 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
45 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
46 S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
47 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
/* NOTE(review): neither object is referenced in the visible part of this file. */
int tottok;
int tokkind[256];
52 struct fsm
54 int state; /* if in this state */
55 uchar ch[4]; /* and see one of these characters */
56 int nextstate; /* enter this state if +ve */
59 /*const*/ struct fsm fsm[] = {
60 /* start state */
61 {START, {C_XX}, ACT(UNCLASS, S_SELF)},
62 {START, {' ', '\t', '\v'}, WS1},
63 {START, {C_NUM}, NUM1},
64 {START, {'.'}, NUM3},
65 {START, {C_ALPH}, ID1},
66 {START, {'L'}, ST1},
67 {START, {'"'}, ST2},
68 {START, {'\''}, CC1},
69 {START, {'/'}, COM1},
70 {START, {EOFC}, S_EOF},
71 {START, {'\n'}, S_NL},
72 {START, {'-'}, MINUS1},
73 {START, {'+'}, PLUS1},
74 {START, {'<'}, LT1},
75 {START, {'>'}, GT1},
76 {START, {'='}, ASG1},
77 {START, {'!'}, NOT1},
78 {START, {'&'}, AND1},
79 {START, {'|'}, OR1},
80 {START, {'#'}, SHARP1},
81 {START, {'%'}, PCT1},
82 {START, {'['}, ACT(SBRA, S_SELF)},
83 {START, {']'}, ACT(SKET, S_SELF)},
84 {START, {'('}, ACT(LP, S_SELF)},
85 {START, {')'}, ACT(RP, S_SELF)},
86 {START, {'*'}, STAR1},
87 {START, {','}, ACT(COMMA, S_SELF)},
88 {START, {'?'}, ACT(QUEST, S_SELF)},
89 {START, {':'}, ACT(COLON, S_SELF)},
90 {START, {';'}, ACT(SEMIC, S_SELF)},
91 {START, {'{'}, ACT(CBRA, S_SELF)},
92 {START, {'}'}, ACT(CKET, S_SELF)},
93 {START, {'~'}, ACT(TILDE, S_SELF)},
94 {START, {'^'}, CIRC1},
96 /* saw a digit */
97 {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)},
98 {NUM1, {C_NUM, C_ALPH, '.'}, NUM1},
99 {NUM1, {'E', 'e'}, NUM2},
100 {NUM1, {'_'}, ACT(NUMBER, S_SELFB)},
102 /* saw possible start of exponent, digits-e */
103 {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)},
104 {NUM2, {'+', '-'}, NUM1},
105 {NUM2, {C_NUM, C_ALPH}, NUM1},
106 {NUM2, {'_'}, ACT(NUMBER, S_SELFB)},
108 /* saw a '.', which could be a number or an operator */
109 {NUM3, {C_XX}, ACT(DOT, S_SELFB)},
110 {NUM3, {'.'}, DOTS1},
111 {NUM3, {C_NUM}, NUM1},
113 {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)},
114 {DOTS1, {C_NUM}, NUM1},
115 {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)},
117 /* saw a letter or _ */
118 {ID1, {C_XX}, ACT(NAME, S_NAME)},
119 {ID1, {C_ALPH, C_NUM}, ID1},
121 /* saw L (start of wide string?) */
122 {ST1, {C_XX}, ACT(NAME, S_NAME)},
123 {ST1, {C_ALPH, C_NUM}, ID1},
124 {ST1, {'"'}, ST2},
125 {ST1, {'\''}, CC1},
127 /* saw " beginning string */
128 {ST2, {C_XX}, ST2},
129 {ST2, {'"'}, ACT(STRING, S_SELF)},
130 {ST2, {'\\'}, ST3},
131 {ST2, {'\n'}, S_STNL},
132 {ST2, {EOFC}, S_EOFSTR},
134 /* saw \ in string */
135 {ST3, {C_XX}, ST2},
136 {ST3, {'\n'}, S_STNL},
137 {ST3, {EOFC}, S_EOFSTR},
139 /* saw ' beginning character const */
140 {CC1, {C_XX}, CC1},
141 {CC1, {'\''}, ACT(CCON, S_SELF)},
142 {CC1, {'\\'}, CC2},
143 {CC1, {'\n'}, S_STNL},
144 {CC1, {EOFC}, S_EOFSTR},
146 /* saw \ in ccon */
147 {CC2, {C_XX}, CC1},
148 {CC2, {'\n'}, S_STNL},
149 {CC2, {EOFC}, S_EOFSTR},
151 /* saw /, perhaps start of comment */
152 {COM1, {C_XX}, ACT(SLASH, S_SELFB)},
153 {COM1, {'='}, ACT(ASSLASH, S_SELF)},
154 {COM1, {'*'}, COM2},
155 {COM1, {'/'}, COM4},
157 /* saw / followed by *, start of comment */
158 {COM2, {C_XX}, COM2},
159 {COM2, {'\n'}, S_COMNL},
160 {COM2, {'*'}, COM3},
161 {COM2, {EOFC}, S_EOFCOM},
163 /* saw the * possibly ending a comment */
164 {COM3, {C_XX}, COM2},
165 {COM3, {'\n'}, S_COMNL},
166 {COM3, {'*'}, COM3},
167 {COM3, {'/'}, S_COMMENT},
169 /* // comment */
170 {COM4, {C_XX}, COM4},
171 {COM4, {'\n'}, S_NL},
172 {COM4, {EOFC}, S_EOFCOM},
174 /* saw white space, eat it up */
175 {WS1, {C_XX}, S_WS},
176 {WS1, {'\t', '\v', ' '}, WS1},
178 /* saw -, check --, -=, -> */
179 {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)},
180 {MINUS1, {'-'}, ACT(MMINUS, S_SELF)},
181 {MINUS1, {'='}, ACT(ASMINUS, S_SELF)},
182 {MINUS1, {'>'}, ACT(ARROW, S_SELF)},
184 /* saw +, check ++, += */
185 {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)},
186 {PLUS1, {'+'}, ACT(PPLUS, S_SELF)},
187 {PLUS1, {'='}, ACT(ASPLUS, S_SELF)},
189 /* saw <, check <<, <<=, <= */
190 {LT1, {C_XX}, ACT(LT, S_SELFB)},
191 {LT1, {'<'}, LT2},
192 {LT1, {'='}, ACT(LEQ, S_SELF)},
193 {LT2, {C_XX}, ACT(LSH, S_SELFB)},
194 {LT2, {'='}, ACT(ASLSH, S_SELF)},
196 /* saw >, check >>, >>=, >= */
197 {GT1, {C_XX}, ACT(GT, S_SELFB)},
198 {GT1, {'>'}, GT2},
199 {GT1, {'='}, ACT(GEQ, S_SELF)},
200 {GT2, {C_XX}, ACT(RSH, S_SELFB)},
201 {GT2, {'='}, ACT(ASRSH, S_SELF)},
203 /* = */
204 {ASG1, {C_XX}, ACT(ASGN, S_SELFB)},
205 {ASG1, {'='}, ACT(EQ, S_SELF)},
207 /* ! */
208 {NOT1, {C_XX}, ACT(NOT, S_SELFB)},
209 {NOT1, {'='}, ACT(NEQ, S_SELF)},
211 /* & */
212 {AND1, {C_XX}, ACT(AND, S_SELFB)},
213 {AND1, {'&'}, ACT(LAND, S_SELF)},
214 {AND1, {'='}, ACT(ASAND, S_SELF)},
216 /* | */
217 {OR1, {C_XX}, ACT(OR, S_SELFB)},
218 {OR1, {'|'}, ACT(LOR, S_SELF)},
219 {OR1, {'='}, ACT(ASOR, S_SELF)},
221 /* # */
222 {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)},
223 {SHARP1, {'#'}, ACT(DSHARP, S_SELF)},
225 /* % */
226 {PCT1, {C_XX}, ACT(PCT, S_SELFB)},
227 {PCT1, {'='}, ACT(ASPCT, S_SELF)},
229 /* * */
230 {STAR1, {C_XX}, ACT(STAR, S_SELFB)},
231 {STAR1, {'='}, ACT(ASSTAR, S_SELF)},
233 /* ^ */
234 {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)},
235 {CIRC1, {'='}, ACT(ASCIRC, S_SELF)},
237 {-1, "", 0}
240 /* first index is char, second is state */
241 /* increase #states to power of 2 to encourage use of shift */
242 short bigfsm[256][MAXSTATE];
244 void
245 expandlex(void)
247 /* const */ struct fsm *fp;
248 int i, j, nstate;
250 for (fp = fsm; fp->state >= 0; fp++)
252 for (i = 0; fp->ch[i]; i++)
254 nstate = fp->nextstate;
255 if (nstate >= S_SELF)
256 nstate = ~nstate;
257 switch (fp->ch[i])
260 case C_XX: /* random characters */
261 for (j = 0; j < 256; j++)
262 bigfsm[j][fp->state] = (short) nstate;
263 continue;
264 case C_ALPH:
265 for (j = 0; j <= 256; j++)
266 #ifdef S390
267 if( isalpha( j ) || (j == '_') )
268 #else
269 if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z')
270 || j == '_')
271 #endif
272 bigfsm[j][fp->state] = (short) nstate;
273 continue;
274 case C_NUM:
275 for (j = '0'; j <= '9'; j++)
276 bigfsm[j][fp->state] = (short) nstate;
277 continue;
278 default:
279 bigfsm[fp->ch[i]][fp->state] = (short) nstate;
285 * install special cases for ? (trigraphs), \ (splicing), runes, and
286 * EOB
288 for (i = 0; i < MAXSTATE; i++)
290 for (j = 0; j < 0xFF; j++)
291 if (j == '?' || j == '\\' || j == '\n' || j == '\r')
293 if (bigfsm[j][i] > 0)
294 bigfsm[j][i] = ~bigfsm[j][i];
295 bigfsm[j][i] &= ~QBSBIT;
297 bigfsm[EOB][i] = ~S_EOB;
298 if (bigfsm[EOFC][i] >= 0)
299 bigfsm[EOFC][i] = ~S_EOF;
303 void
304 fixlex(void)
306 /* do C++ comments? */
307 if ((Cplusplus == 0) || (Cflag != 0))
308 bigfsm['/'][COM1] = bigfsm['x'][COM1];
312 * fill in a row of tokens from input, terminated by NL or END
313 * First token is put at trp->lp.
314 * Reset is non-zero when the input buffer can be "rewound."
315 * The value is a flag indicating that possible macros have
316 * been seen in the row.
319 gettokens(Tokenrow * trp, int reset)
321 register int c, state, oldstate;
322 register uchar *ip;
323 register Token *tp, *maxp;
324 int runelen;
325 Source *s = cursource;
326 int nmac = 0;
328 tp = trp->lp;
329 ip = s->inp;
330 if (reset)
332 s->lineinc = 0;
333 if (ip >= s->inl)
334 { /* nothing in buffer */
335 s->inl = s->inb;
336 fillbuf(s);
337 ip = s->inp = s->inb;
339 else
340 if (ip >= s->inb + (3 * INS / 4))
342 memmove(s->inb, ip, 4 + s->inl - ip);
343 s->inl = s->inb + (s->inl - ip);
344 ip = s->inp = s->inb;
347 maxp = &trp->bp[trp->max];
348 runelen = 1;
349 for (;;)
351 continue2:
352 if (tp >= maxp)
354 trp->lp = tp;
355 tp = growtokenrow(trp);
356 maxp = &trp->bp[trp->max];
358 tp->type = UNCLASS;
359 tp->t = ip;
360 tp->wslen = 0;
361 tp->flag = 0;
362 state = START;
363 for (;;)
365 oldstate = state;
367 c = *ip;
369 if ((state = bigfsm[c][state]) >= 0)
371 ip += runelen;
372 runelen = 1;
373 continue;
375 state = ~state;
376 reswitch:
377 switch (state & 0177)
379 case S_SELF:
380 ip += runelen;
381 runelen = 1;
382 case S_SELFB:
383 tp->type = (unsigned char) GETACT(state);
384 tp->len = ip - tp->t;
385 tp++;
386 goto continue2;
388 case S_NAME: /* like S_SELFB but with nmac check */
389 tp->type = NAME;
390 tp->len = ip - tp->t;
391 nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0);
392 tp++;
393 goto continue2;
395 case S_WS:
396 tp->wslen = ip - tp->t;
397 tp->t = ip;
398 state = START;
399 continue;
401 default:
402 if ((state & QBSBIT) == 0)
404 ip += runelen;
405 runelen = 1;
406 continue;
408 state &= ~QBSBIT;
409 s->inp = ip;
411 if (c == '\n')
413 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
415 if (s->inp[1] == '\r')
417 memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2);
418 s->inl -= 1;
421 goto reswitch;
424 if (c == '\r')
426 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
428 if (s->inp[1] == '\n')
430 memmove(s->inp, s->inp + 1, s->inl - s->inp + 1);
431 s->inl -= 1;
433 else
434 *s->inp = '\n';
436 state = oldstate;
437 continue;
440 if (c == '?')
441 { /* check trigraph */
442 if (trigraph(s))
444 state = oldstate;
445 continue;
447 goto reswitch;
449 if (c == '\\')
450 { /* line-folding */
451 if (foldline(s))
453 s->lineinc++;
454 state = oldstate;
455 continue;
457 goto reswitch;
459 error(WARNING, "Lexical botch in cpp");
460 ip += runelen;
461 runelen = 1;
462 continue;
464 case S_EOB:
465 s->inp = ip;
466 fillbuf(cursource);
467 state = oldstate;
468 continue;
470 case S_EOF:
471 tp->type = END;
472 tp->len = 0;
473 s->inp = ip;
474 if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1)
475 error(WARNING, "No newline at end of file");
476 trp->lp = tp + 1;
477 return nmac;
479 case S_STNL:
480 error(ERROR, "Unterminated string or char const");
481 case S_NL:
482 tp->t = ip;
483 tp->type = NL;
484 tp->len = 1;
485 tp->wslen = 0;
486 s->lineinc++;
487 s->inp = ip + 1;
488 trp->lp = tp + 1;
489 return nmac;
491 case S_EOFSTR:
492 error(FATAL, "EOF in string or char constant");
493 break;
495 case S_COMNL:
496 s->lineinc++;
497 state = COM2;
498 ip += runelen;
499 runelen = 1;
500 continue;
502 case S_EOFCOM:
503 error(WARNING, "EOF inside comment");
504 --ip;
505 case S_COMMENT:
506 if (!Cflag)
508 tp->t = ++ip;
509 tp->t[-1] = ' ';
510 tp->wslen = 1;
511 state = START;
512 continue;
514 else
516 runelen = 1;
517 s->lineinc = 0;;
518 tp->type = COMMENT;
519 tp->flag |= XTWS;
522 break;
524 ip += runelen;
525 runelen = 1;
526 tp->len = ip - tp->t;
527 tp++;
531 /* have seen ?; handle the trigraph it starts (if any) else 0 */
533 trigraph(Source * s)
535 uchar c;
537 while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
539 if (s->inp[1] != '?')
540 return 0;
541 c = 0;
542 switch (s->inp[2])
544 case '=':
545 c = '#';
546 break;
547 case '(':
548 c = '[';
549 break;
550 case '/':
551 c = '\\';
552 break;
553 case ')':
554 c = ']';
555 break;
556 case '\'':
557 c = '^';
558 break;
559 case '<':
560 c = '{';
561 break;
562 case '!':
563 c = '|';
564 break;
565 case '>':
566 c = '}';
567 break;
568 case '-':
569 c = '~';
570 break;
572 if (c)
574 *s->inp = c;
575 memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2);
576 s->inl -= 2;
578 return c;
582 foldline(Source * s)
584 int n = 1;
586 /* skip pending wihite spaces */
587 while ((s->inp[n] == ' ') || (s->inp[n] == '\t'))
589 n++;
590 if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF))
591 break;
594 /* refill buffer */
595 while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF);
597 /* skip DOS line ends */
598 if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) ||
599 ((s->inp[n] == '\n') && (s->inp[n+1] == '\r')))
600 n++;
602 if ((s->inp[n] == '\n') || (s->inp[n] == '\r'))
604 memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2);
605 s->inl -= n + 1;
606 return 1;
608 return 0;
612 fillbuf(Source * s)
614 int n;
616 if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0)
617 n = 0;
618 s->inl += n;
619 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB;
620 if (n == 0)
622 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC;
623 return EOF;
625 return 0;
629 * Push down to new source of characters.
630 * If fd>0 and str==NULL, then from a file `name';
631 * if fd==-1 and str, then from the string.
633 Source *
634 setsource(char *name, int path, int fd, char *str, int wrap)
636 Source *s = new(Source);
637 int len;
639 s->line = 1;
640 s->lineinc = 0;
641 s->fd = fd;
642 s->filename = name;
643 s->next = cursource;
644 s->ifdepth = 0;
645 s->pathdepth = path;
646 s->wrap = wrap;
648 cursource = s;
650 if (s->wrap)
651 genwrap(0);
653 /* slop at right for EOB */
654 if (str)
656 len = strlen(str);
657 s->inb = domalloc(len + 4);
658 s->inp = s->inb;
659 strncpy((char *) s->inp, str, len);
661 else
663 s->inb = domalloc(INS + 4);
664 s->inp = s->inb;
665 len = 0;
667 s->inl = s->inp + len;
668 s->inl[0] = s->inl[1] = EOB;
670 return s;
673 void
674 unsetsource(void)
676 Source *s = cursource;
678 if (s->wrap)
679 genwrap(1);
681 if (s->fd >= 0)
683 close(s->fd);
684 dofree(s->inb);
686 cursource = s->next;
687 dofree(s);