4 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
11 * lexical FSM encoding
12 * When in state 'state', and one of the characters
13 * in 'ch' arrives, enter 'nextstate'.
14 * States >= S_SELF are either final, or at least require special action.
15 * In 'fsm' there is a line for each state X charset X nextstate.
16 * List chars that overwrite previous entries later (e.g. C_ALPH
17 * can be overridden by '_' by a later entry); and C_XX is
18 * the universal set, and should always be first.
19 * States above S_SELF are represented in the big table as negative values.
20 * S_SELF and S_SELFB encode the resulting token type in the upper bits.
21 * These actions differ in that S_SELF doesn't have a lookahead char,
24 * The encoding is blown out into a big table for time-efficiency.
26 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
30 #define ACT(tok,act) ((tok<<7)+act)
32 #define GETACT(st) ((st>>7)&0x1ff)
34 /* character classes */
43 START
= 0, NUM1
, NUM2
, NUM3
, ID1
, ST1
, ST2
, ST3
, COM1
, COM2
, COM3
, COM4
,
44 CC1
, CC2
, WS1
, PLUS1
, MINUS1
, STAR1
, SLASH1
, PCT1
, SHARP1
,
45 CIRC1
, GT1
, GT2
, LT1
, LT2
, OR1
, AND1
, ASG1
, NOT1
, DOTS1
,
46 S_SELF
= MAXSTATE
, S_SELFB
, S_EOF
, S_NL
, S_EOFSTR
,
47 S_STNL
, S_COMNL
, S_EOFCOM
, S_COMMENT
, S_EOB
, S_WS
, S_NAME
54 int state
; /* if in this state */
55 uchar ch
[4]; /* and see one of these characters */
56 int nextstate
; /* enter this state if +ve */
59 /*const*/ struct fsm fsm
[] = {
61 {START
, {C_XX
}, ACT(UNCLASS
, S_SELF
)},
62 {START
, {' ', '\t', '\v'}, WS1
},
63 {START
, {C_NUM
}, NUM1
},
65 {START
, {C_ALPH
}, ID1
},
70 {START
, {EOFC
}, S_EOF
},
71 {START
, {'\n'}, S_NL
},
72 {START
, {'-'}, MINUS1
},
73 {START
, {'+'}, PLUS1
},
80 {START
, {'#'}, SHARP1
},
82 {START
, {'['}, ACT(SBRA
, S_SELF
)},
83 {START
, {']'}, ACT(SKET
, S_SELF
)},
84 {START
, {'('}, ACT(LP
, S_SELF
)},
85 {START
, {')'}, ACT(RP
, S_SELF
)},
86 {START
, {'*'}, STAR1
},
87 {START
, {','}, ACT(COMMA
, S_SELF
)},
88 {START
, {'?'}, ACT(QUEST
, S_SELF
)},
89 {START
, {':'}, ACT(COLON
, S_SELF
)},
90 {START
, {';'}, ACT(SEMIC
, S_SELF
)},
91 {START
, {'{'}, ACT(CBRA
, S_SELF
)},
92 {START
, {'}'}, ACT(CKET
, S_SELF
)},
93 {START
, {'~'}, ACT(TILDE
, S_SELF
)},
94 {START
, {'^'}, CIRC1
},
97 {NUM1
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
98 {NUM1
, {C_NUM
, C_ALPH
, '.'}, NUM1
},
99 {NUM1
, {'E', 'e'}, NUM2
},
100 {NUM1
, {'_'}, ACT(NUMBER
, S_SELFB
)},
102 /* saw possible start of exponent, digits-e */
103 {NUM2
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
104 {NUM2
, {'+', '-'}, NUM1
},
105 {NUM2
, {C_NUM
, C_ALPH
}, NUM1
},
106 {NUM2
, {'_'}, ACT(NUMBER
, S_SELFB
)},
108 /* saw a '.', which could be a number or an operator */
109 {NUM3
, {C_XX
}, ACT(DOT
, S_SELFB
)},
110 {NUM3
, {'.'}, DOTS1
},
111 {NUM3
, {C_NUM
}, NUM1
},
113 {DOTS1
, {C_XX
}, ACT(UNCLASS
, S_SELFB
)},
114 {DOTS1
, {C_NUM
}, NUM1
},
115 {DOTS1
, {'.'}, ACT(ELLIPS
, S_SELF
)},
117 /* saw a letter or _ */
118 {ID1
, {C_XX
}, ACT(NAME
, S_NAME
)},
119 {ID1
, {C_ALPH
, C_NUM
}, ID1
},
121 /* saw L (start of wide string?) */
122 {ST1
, {C_XX
}, ACT(NAME
, S_NAME
)},
123 {ST1
, {C_ALPH
, C_NUM
}, ID1
},
127 /* saw " beginning string */
129 {ST2
, {'"'}, ACT(STRING
, S_SELF
)},
131 {ST2
, {'\n'}, S_STNL
},
132 {ST2
, {EOFC
}, S_EOFSTR
},
134 /* saw \ in string */
136 {ST3
, {'\n'}, S_STNL
},
137 {ST3
, {EOFC
}, S_EOFSTR
},
139 /* saw ' beginning character const */
141 {CC1
, {'\''}, ACT(CCON
, S_SELF
)},
143 {CC1
, {'\n'}, S_STNL
},
144 {CC1
, {EOFC
}, S_EOFSTR
},
148 {CC2
, {'\n'}, S_STNL
},
149 {CC2
, {EOFC
}, S_EOFSTR
},
151 /* saw /, perhaps start of comment */
152 {COM1
, {C_XX
}, ACT(SLASH
, S_SELFB
)},
153 {COM1
, {'='}, ACT(ASSLASH
, S_SELF
)},
157 /* saw / followed by *, start of comment */
158 {COM2
, {C_XX
}, COM2
},
159 {COM2
, {'\n'}, S_COMNL
},
161 {COM2
, {EOFC
}, S_EOFCOM
},
163 /* saw the * possibly ending a comment */
164 {COM3
, {C_XX
}, COM2
},
165 {COM3
, {'\n'}, S_COMNL
},
167 {COM3
, {'/'}, S_COMMENT
},
170 {COM4
, {C_XX
}, COM4
},
171 {COM4
, {'\n'}, S_NL
},
172 {COM4
, {EOFC
}, S_EOFCOM
},
174 /* saw white space, eat it up */
176 {WS1
, {'\t', '\v', ' '}, WS1
},
178 /* saw -, check --, -=, -> */
179 {MINUS1
, {C_XX
}, ACT(MINUS
, S_SELFB
)},
180 {MINUS1
, {'-'}, ACT(MMINUS
, S_SELF
)},
181 {MINUS1
, {'='}, ACT(ASMINUS
, S_SELF
)},
182 {MINUS1
, {'>'}, ACT(ARROW
, S_SELF
)},
184 /* saw +, check ++, += */
185 {PLUS1
, {C_XX
}, ACT(PLUS
, S_SELFB
)},
186 {PLUS1
, {'+'}, ACT(PPLUS
, S_SELF
)},
187 {PLUS1
, {'='}, ACT(ASPLUS
, S_SELF
)},
189 /* saw <, check <<, <<=, <= */
190 {LT1
, {C_XX
}, ACT(LT
, S_SELFB
)},
192 {LT1
, {'='}, ACT(LEQ
, S_SELF
)},
193 {LT2
, {C_XX
}, ACT(LSH
, S_SELFB
)},
194 {LT2
, {'='}, ACT(ASLSH
, S_SELF
)},
196 /* saw >, check >>, >>=, >= */
197 {GT1
, {C_XX
}, ACT(GT
, S_SELFB
)},
199 {GT1
, {'='}, ACT(GEQ
, S_SELF
)},
200 {GT2
, {C_XX
}, ACT(RSH
, S_SELFB
)},
201 {GT2
, {'='}, ACT(ASRSH
, S_SELF
)},
204 {ASG1
, {C_XX
}, ACT(ASGN
, S_SELFB
)},
205 {ASG1
, {'='}, ACT(EQ
, S_SELF
)},
208 {NOT1
, {C_XX
}, ACT(NOT
, S_SELFB
)},
209 {NOT1
, {'='}, ACT(NEQ
, S_SELF
)},
212 {AND1
, {C_XX
}, ACT(AND
, S_SELFB
)},
213 {AND1
, {'&'}, ACT(LAND
, S_SELF
)},
214 {AND1
, {'='}, ACT(ASAND
, S_SELF
)},
217 {OR1
, {C_XX
}, ACT(OR
, S_SELFB
)},
218 {OR1
, {'|'}, ACT(LOR
, S_SELF
)},
219 {OR1
, {'='}, ACT(ASOR
, S_SELF
)},
222 {SHARP1
, {C_XX
}, ACT(SHARP
, S_SELFB
)},
223 {SHARP1
, {'#'}, ACT(DSHARP
, S_SELF
)},
226 {PCT1
, {C_XX
}, ACT(PCT
, S_SELFB
)},
227 {PCT1
, {'='}, ACT(ASPCT
, S_SELF
)},
230 {STAR1
, {C_XX
}, ACT(STAR
, S_SELFB
)},
231 {STAR1
, {'='}, ACT(ASSTAR
, S_SELF
)},
234 {CIRC1
, {C_XX
}, ACT(CIRC
, S_SELFB
)},
235 {CIRC1
, {'='}, ACT(ASCIRC
, S_SELF
)},
240 /* first index is char, second is state */
241 /* increase #states to power of 2 to encourage use of shift */
242 short bigfsm
[256][MAXSTATE
];
247 /* const */ struct fsm
*fp
;
250 for (fp
= fsm
; fp
->state
>= 0; fp
++)
252 for (i
= 0; fp
->ch
[i
]; i
++)
254 nstate
= fp
->nextstate
;
255 if (nstate
>= S_SELF
)
260 case C_XX
: /* random characters */
261 for (j
= 0; j
< 256; j
++)
262 bigfsm
[j
][fp
->state
] = (short) nstate
;
265 for (j
= 0; j
<= 256; j
++)
267 if( isalpha( j
) || (j
== '_') )
269 if (('a' <= j
&& j
<= 'z') || ('A' <= j
&& j
<= 'Z')
272 bigfsm
[j
][fp
->state
] = (short) nstate
;
275 for (j
= '0'; j
<= '9'; j
++)
276 bigfsm
[j
][fp
->state
] = (short) nstate
;
279 bigfsm
[fp
->ch
[i
]][fp
->state
] = (short) nstate
;
285 * install special cases for ? (trigraphs), \ (splicing), runes, and
288 for (i
= 0; i
< MAXSTATE
; i
++)
290 for (j
= 0; j
< 0xFF; j
++)
291 if (j
== '?' || j
== '\\' || j
== '\n' || j
== '\r')
293 if (bigfsm
[j
][i
] > 0)
294 bigfsm
[j
][i
] = ~bigfsm
[j
][i
];
295 bigfsm
[j
][i
] &= ~QBSBIT
;
297 bigfsm
[EOB
][i
] = ~S_EOB
;
298 if (bigfsm
[EOFC
][i
] >= 0)
299 bigfsm
[EOFC
][i
] = ~S_EOF
;
306 /* do C++ comments? */
307 if ((Cplusplus
== 0) || (Cflag
!= 0))
308 bigfsm
['/'][COM1
] = bigfsm
['x'][COM1
];
312 * Fill in a row of tokens from input, terminated by NL or END.
313 * First token is put at trp->lp.
314 * Reset is non-zero when the input buffer can be "rewound."
315 * The return value is a flag indicating that possible macros have
316 * been seen in the row.
319 gettokens(Tokenrow
* trp
, int reset
)
321 register int c
, state
, oldstate
;
323 register Token
*tp
, *maxp
;
325 Source
*s
= cursource
;
334 { /* nothing in buffer */
337 ip
= s
->inp
= s
->inb
;
340 if (ip
>= s
->inb
+ (3 * INS
/ 4))
342 memmove(s
->inb
, ip
, 4 + s
->inl
- ip
);
343 s
->inl
= s
->inb
+ (s
->inl
- ip
);
344 ip
= s
->inp
= s
->inb
;
347 maxp
= &trp
->bp
[trp
->max
];
355 tp
= growtokenrow(trp
);
356 maxp
= &trp
->bp
[trp
->max
];
369 if ((state
= bigfsm
[c
][state
]) >= 0)
377 switch (state
& 0177)
383 tp
->type
= (unsigned char) GETACT(state
);
384 tp
->len
= ip
- tp
->t
;
388 case S_NAME
: /* like S_SELFB but with nmac check */
390 tp
->len
= ip
- tp
->t
;
391 nmac
|= quicklook(tp
->t
[0], tp
->len
> 1 ? tp
->t
[1] : 0);
396 tp
->wslen
= ip
- tp
->t
;
402 if ((state
& QBSBIT
) == 0)
413 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
415 if (s
->inp
[1] == '\r')
417 memmove(s
->inp
+ 1, s
->inp
+ 2, s
->inl
- s
->inp
+ 2);
426 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
428 if (s
->inp
[1] == '\n')
430 memmove(s
->inp
, s
->inp
+ 1, s
->inl
- s
->inp
+ 1);
441 { /* check trigraph */
459 error(WARNING
, "Lexical botch in cpp");
474 if (tp
!= trp
->bp
&& (tp
- 1)->type
!= NL
&& cursource
->fd
!= -1)
475 error(WARNING
, "No newline at end of file");
480 error(ERROR
, "Unterminated string or char const");
492 error(FATAL
, "EOF in string or char constant");
503 error(WARNING
, "EOF inside comment");
526 tp
->len
= ip
- tp
->t
;
531 /* have seen ?; handle the trigraph it starts (if any) else 0 */
537 while (s
->inp
+ 2 >= s
->inl
&& fillbuf(s
) != EOF
);
539 if (s
->inp
[1] != '?')
575 memmove(s
->inp
+ 1, s
->inp
+ 3, s
->inl
- s
->inp
+ 2);
586 /* skip pending white space */
587 while ((s
->inp
[n
] == ' ') || (s
->inp
[n
] == '\t'))
590 if ((s
->inp
+ n
>= s
->inl
) && (fillbuf(s
) == EOF
))
595 while (s
->inp
+ (n
+ 1) >= s
->inl
&& fillbuf(s
) != EOF
);
597 /* skip DOS line ends */
598 if (((s
->inp
[n
] == '\r') && (s
->inp
[n
+1] == '\n')) ||
599 ((s
->inp
[n
] == '\n') && (s
->inp
[n
+1] == '\r')))
602 if ((s
->inp
[n
] == '\n') || (s
->inp
[n
] == '\r'))
604 memmove(s
->inp
, s
->inp
+ n
+ 1, s
->inl
- s
->inp
+ n
+ 2);
616 if (s
->fd
< 0 || (n
= read(s
->fd
, (char *) s
->inl
, INS
/ 8)) <= 0)
619 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOB
;
622 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOFC
;
629 * Push down to new source of characters.
630 * If fd>0 and str==NULL, then from a file `name';
631 * if fd==-1 and str, then from the string.
634 setsource(char *name
, int path
, int fd
, char *str
, int wrap
)
636 Source
*s
= new(Source
);
653 /* slop at right for EOB */
657 s
->inb
= domalloc(len
+ 4);
659 strncpy((char *) s
->inp
, str
, len
);
663 s
->inb
= domalloc(INS
+ 4);
667 s
->inl
= s
->inp
+ len
;
668 s
->inl
[0] = s
->inl
[1] = EOB
;
676 Source
*s
= cursource
;