1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
23 #if (defined(_WIN32) || defined(__IBMC__))
30 * lexical FSM encoding
31 * when in state state, and one of the characters
32 * in ch arrives, enter nextstate.
33 * States >= S_SELF are either final, or at least require special action.
34 * In 'fsm' there is a line for each state X charset X nextstate.
 * List chars that override previous entries later (e.g. C_ALPH
 * can be overridden by '_' in a later entry); C_XX is the
 * universal set and should always be listed first.
38 * States above S_SELF are represented in the big table as negative values.
39 * S_SELF and S_SELFB encode the resulting token type in the upper bits.
40 * These actions differ in that S_SELF doesn't have a lookahead char,
43 * The encoding is blown out into a big table for time-efficiency.
45 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
/*
 * Pack a token type and an action/state code into one table value:
 * the token type goes in the bits above bit 6, the action in the low 7 bits.
 * Both arguments are parenthesized so that expression arguments
 * (e.g. `A | B') are combined before the shift and the addition.
 */
#define ACT(tok,act)    (((tok) << 7) + (act))
/*
 * Extract the 9-bit token type stored above bit 6 of a packed table value.
 * The argument is parenthesized so that an expression argument is fully
 * evaluated before it is shifted.
 */
#define GETACT(st)      (((st) >> 7) & 0x1ff)
53 /* character classes */
60 START
= 0, NUM1
, NUM2
, NUM3
, ID1
, ST1
, ST2
, ST3
, COM1
, COM2
, COM3
, COM4
,
61 CC1
, CC2
, WS1
, PLUS1
, MINUS1
, STAR1
, SLASH1
, PCT1
, SHARP1
,
62 CIRC1
, GT1
, GT2
, LT1
, LT2
, OR1
, AND1
, ASG1
, NOT1
, DOTS1
,
63 S_SELF
= MAXSTATE
, S_SELFB
, S_EOF
, S_NL
, S_EOFSTR
,
64 S_STNL
, S_COMNL
, S_EOFCOM
, S_COMMENT
, S_EOB
, S_WS
, S_NAME
71 int state
; /* if in this state */
72 uchar ch
[4]; /* and see one of these characters */
73 int nextstate
; /* enter this state if +ve */
76 static /*const*/ struct fsm fsm
[] = {
78 {START
, {C_XX
}, ACT(UNCLASS
, S_SELF
)},
79 {START
, {' ', '\t', '\v'}, WS1
},
80 {START
, {C_NUM
}, NUM1
},
82 {START
, {C_ALPH
}, ID1
},
87 {START
, {EOFC
}, S_EOF
},
88 {START
, {'\n'}, S_NL
},
89 {START
, {'-'}, MINUS1
},
90 {START
, {'+'}, PLUS1
},
97 {START
, {'#'}, SHARP1
},
99 {START
, {'['}, ACT(SBRA
, S_SELF
)},
100 {START
, {']'}, ACT(SKET
, S_SELF
)},
101 {START
, {'('}, ACT(LP
, S_SELF
)},
102 {START
, {')'}, ACT(RP
, S_SELF
)},
103 {START
, {'*'}, STAR1
},
104 {START
, {','}, ACT(COMMA
, S_SELF
)},
105 {START
, {'?'}, ACT(QUEST
, S_SELF
)},
106 {START
, {':'}, ACT(COLON
, S_SELF
)},
107 {START
, {';'}, ACT(SEMIC
, S_SELF
)},
108 {START
, {'{'}, ACT(CBRA
, S_SELF
)},
109 {START
, {'}'}, ACT(CKET
, S_SELF
)},
110 {START
, {'~'}, ACT(TILDE
, S_SELF
)},
111 {START
, {'^'}, CIRC1
},
114 {NUM1
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
115 {NUM1
, {C_NUM
, C_ALPH
, '.'}, NUM1
},
116 {NUM1
, {'E', 'e'}, NUM2
},
117 {NUM1
, {'_'}, ACT(NUMBER
, S_SELFB
)},
119 /* saw possible start of exponent, digits-e */
120 {NUM2
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
121 {NUM2
, {'+', '-'}, NUM1
},
122 {NUM2
, {C_NUM
, C_ALPH
}, NUM1
},
123 {NUM2
, {'_'}, ACT(NUMBER
, S_SELFB
)},
125 /* saw a '.', which could be a number or an operator */
126 {NUM3
, {C_XX
}, ACT(DOT
, S_SELFB
)},
127 {NUM3
, {'.'}, DOTS1
},
128 {NUM3
, {C_NUM
}, NUM1
},
130 {DOTS1
, {C_XX
}, ACT(UNCLASS
, S_SELFB
)},
131 {DOTS1
, {C_NUM
}, NUM1
},
132 {DOTS1
, {'.'}, ACT(ELLIPS
, S_SELF
)},
134 /* saw a letter or _ */
135 {ID1
, {C_XX
}, ACT(NAME
, S_NAME
)},
136 {ID1
, {C_ALPH
, C_NUM
}, ID1
},
138 /* saw L (start of wide string?) */
139 {ST1
, {C_XX
}, ACT(NAME
, S_NAME
)},
140 {ST1
, {C_ALPH
, C_NUM
}, ID1
},
144 /* saw " beginning string */
146 {ST2
, {'"'}, ACT(STRING
, S_SELF
)},
148 {ST2
, {'\n'}, S_STNL
},
149 {ST2
, {EOFC
}, S_EOFSTR
},
151 /* saw \ in string */
153 {ST3
, {'\n'}, S_STNL
},
154 {ST3
, {EOFC
}, S_EOFSTR
},
156 /* saw ' beginning character const */
158 {CC1
, {'\''}, ACT(CCON
, S_SELF
)},
160 {CC1
, {'\n'}, S_STNL
},
161 {CC1
, {EOFC
}, S_EOFSTR
},
165 {CC2
, {'\n'}, S_STNL
},
166 {CC2
, {EOFC
}, S_EOFSTR
},
168 /* saw /, perhaps start of comment */
169 {COM1
, {C_XX
}, ACT(SLASH
, S_SELFB
)},
170 {COM1
, {'='}, ACT(ASSLASH
, S_SELF
)},
174 /* saw / followed by *, start of comment */
175 {COM2
, {C_XX
}, COM2
},
176 {COM2
, {'\n'}, S_COMNL
},
178 {COM2
, {EOFC
}, S_EOFCOM
},
180 /* saw the * possibly ending a comment */
181 {COM3
, {C_XX
}, COM2
},
182 {COM3
, {'\n'}, S_COMNL
},
184 {COM3
, {'/'}, S_COMMENT
},
187 {COM4
, {C_XX
}, COM4
},
188 {COM4
, {'\n'}, S_NL
},
189 {COM4
, {EOFC
}, S_EOFCOM
},
191 /* saw white space, eat it up */
193 {WS1
, {'\t', '\v', ' '}, WS1
},
195 /* saw -, check --, -=, -> */
196 {MINUS1
, {C_XX
}, ACT(MINUS
, S_SELFB
)},
197 {MINUS1
, {'-'}, ACT(MMINUS
, S_SELF
)},
198 {MINUS1
, {'='}, ACT(ASMINUS
, S_SELF
)},
199 {MINUS1
, {'>'}, ACT(ARROW
, S_SELF
)},
201 /* saw +, check ++, += */
202 {PLUS1
, {C_XX
}, ACT(PLUS
, S_SELFB
)},
203 {PLUS1
, {'+'}, ACT(PPLUS
, S_SELF
)},
204 {PLUS1
, {'='}, ACT(ASPLUS
, S_SELF
)},
206 /* saw <, check <<, <<=, <= */
207 {LT1
, {C_XX
}, ACT(LT
, S_SELFB
)},
209 {LT1
, {'='}, ACT(LEQ
, S_SELF
)},
210 {LT2
, {C_XX
}, ACT(LSH
, S_SELFB
)},
211 {LT2
, {'='}, ACT(ASLSH
, S_SELF
)},
213 /* saw >, check >>, >>=, >= */
214 {GT1
, {C_XX
}, ACT(GT
, S_SELFB
)},
216 {GT1
, {'='}, ACT(GEQ
, S_SELF
)},
217 {GT2
, {C_XX
}, ACT(RSH
, S_SELFB
)},
218 {GT2
, {'='}, ACT(ASRSH
, S_SELF
)},
221 {ASG1
, {C_XX
}, ACT(ASGN
, S_SELFB
)},
222 {ASG1
, {'='}, ACT(EQ
, S_SELF
)},
225 {NOT1
, {C_XX
}, ACT(NOT
, S_SELFB
)},
226 {NOT1
, {'='}, ACT(NEQ
, S_SELF
)},
229 {AND1
, {C_XX
}, ACT(AND
, S_SELFB
)},
230 {AND1
, {'&'}, ACT(LAND
, S_SELF
)},
231 {AND1
, {'='}, ACT(ASAND
, S_SELF
)},
234 {OR1
, {C_XX
}, ACT(OR
, S_SELFB
)},
235 {OR1
, {'|'}, ACT(LOR
, S_SELF
)},
236 {OR1
, {'='}, ACT(ASOR
, S_SELF
)},
239 {SHARP1
, {C_XX
}, ACT(SHARP
, S_SELFB
)},
240 {SHARP1
, {'#'}, ACT(DSHARP
, S_SELF
)},
243 {PCT1
, {C_XX
}, ACT(PCT
, S_SELFB
)},
244 {PCT1
, {'='}, ACT(ASPCT
, S_SELF
)},
247 {STAR1
, {C_XX
}, ACT(STAR
, S_SELFB
)},
248 {STAR1
, {'='}, ACT(ASSTAR
, S_SELF
)},
251 {CIRC1
, {C_XX
}, ACT(CIRC
, S_SELFB
)},
252 {CIRC1
, {'='}, ACT(ASCIRC
, S_SELF
)},
257 /* first index is char, second is state */
258 /* increase #states to power of 2 to encourage use of shift */
259 short bigfsm
[256][MAXSTATE
];
264 /* const */ struct fsm
*fp
;
267 for (fp
= fsm
; fp
->state
>= 0; fp
++)
269 for (i
= 0; fp
->ch
[i
]; i
++)
271 nstate
= fp
->nextstate
;
272 if (nstate
>= S_SELF
)
277 case C_XX
: /* random characters */
278 for (j
= 0; j
< 256; j
++)
279 bigfsm
[j
][fp
->state
] = (short) nstate
;
282 for (j
= 0; j
< 256; j
++)
283 if (('a' <= j
&& j
<= 'z') || ('A' <= j
&& j
<= 'Z')
285 bigfsm
[j
][fp
->state
] = (short) nstate
;
288 for (j
= '0'; j
<= '9'; j
++)
289 bigfsm
[j
][fp
->state
] = (short) nstate
;
292 bigfsm
[fp
->ch
[i
]][fp
->state
] = (short) nstate
;
298 * install special cases for ? (trigraphs), \ (splicing), runes, and
301 for (i
= 0; i
< MAXSTATE
; i
++)
303 for (j
= 0; j
< 0xFF; j
++)
304 if (j
== '?' || j
== '\\' || j
== '\n' || j
== '\r')
306 if (bigfsm
[j
][i
] > 0)
307 bigfsm
[j
][i
] = ~bigfsm
[j
][i
];
308 bigfsm
[j
][i
] &= ~QBSBIT
;
310 bigfsm
[EOB
][i
] = ~S_EOB
;
311 if (bigfsm
[EOFC
][i
] >= 0)
312 bigfsm
[EOFC
][i
] = ~S_EOF
;
319 /* do C++ comments? */
320 if ((Cplusplus
== 0) || (Cflag
!= 0))
321 bigfsm
['/'][COM1
] = bigfsm
['x'][COM1
];
 * Fill in a row of tokens from input, terminated by NL or END.
 * The first token is put at trp->lp.
 * `reset' is non-zero when the input buffer can be "rewound."
 * The return value is a flag indicating that possible macros have
 * been seen in the row.
332 gettokens(Tokenrow
* trp
, int reset
)
334 int c
, state
, oldstate
;
338 Source
*s
= cursource
;
347 { /* nothing in buffer */
350 ip
= s
->inp
= s
->inb
;
353 if (ip
>= s
->inb
+ (3 * INS
/ 4))
355 memmove(s
->inb
, ip
, 4 + s
->inl
- ip
);
356 s
->inl
= s
->inb
+ (s
->inl
- ip
);
357 ip
= s
->inp
= s
->inb
;
360 maxp
= &trp
->bp
[trp
->max
];
368 tp
= growtokenrow(trp
);
369 maxp
= &trp
->bp
[trp
->max
];
382 if ((state
= bigfsm
[c
][state
]) >= 0)
390 switch (state
& 0177)
397 tp
->type
= (unsigned char) GETACT(state
);
398 tp
->len
= ip
- tp
->t
;
402 case S_NAME
: /* like S_SELFB but with nmac check */
404 tp
->len
= ip
- tp
->t
;
405 nmac
|= quicklook(tp
->t
[0], tp
->len
> 1 ? tp
->t
[1] : 0);
410 tp
->wslen
= ip
- tp
->t
;
416 if ((state
& QBSBIT
) == 0)
427 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
429 if (s
->inp
[1] == '\r')
431 memmove(s
->inp
+ 1, s
->inp
+ 2, s
->inl
- s
->inp
+ 2);
440 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
442 if (s
->inp
[1] == '\n')
444 memmove(s
->inp
, s
->inp
+ 1, s
->inl
- s
->inp
+ 1);
455 { /* check trigraph */
473 error(WARNING
, "Lexical botch in cpp");
488 if (tp
!= trp
->bp
&& (tp
- 1)->type
!= NL
&& cursource
->fd
!= -1)
489 error(WARNING
, "No newline at end of file");
494 error(ERROR
, "Unterminated string or char const");
507 error(FATAL
, "EOF in string or char constant");
518 error(WARNING
, "EOF inside comment");
542 tp
->len
= ip
- tp
->t
;
547 /* have seen ?; handle the trigraph it starts (if any) else 0 */
553 while (s
->inp
+ 2 >= s
->inl
&& fillbuf(s
) != EOF
);
555 if (s
->inp
[1] != '?')
591 memmove(s
->inp
+ 1, s
->inp
+ 3, s
->inl
- s
->inp
+ 2);
602 /* skip pending white spaces */
603 while ((s
->inp
[n
] == ' ') || (s
->inp
[n
] == '\t'))
606 if ((s
->inp
+ n
>= s
->inl
) && (fillbuf(s
) == EOF
))
611 while (s
->inp
+ (n
+ 1) >= s
->inl
&& fillbuf(s
) != EOF
);
613 /* skip DOS line ends */
614 if (((s
->inp
[n
] == '\r') && (s
->inp
[n
+1] == '\n')) ||
615 ((s
->inp
[n
] == '\n') && (s
->inp
[n
+1] == '\r')))
618 if ((s
->inp
[n
] == '\n') || (s
->inp
[n
] == '\r'))
620 memmove(s
->inp
, s
->inp
+ n
+ 1, s
->inl
- s
->inp
+ n
+ 2);
632 if (s
->fd
< 0 || (n
= read(s
->fd
, (char *) s
->inl
, INS
/ 8)) <= 0)
635 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOB
;
638 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOFC
;
645 * Push down to new source of characters.
 * If fd>=0 and str==NULL, then read from the file `name';
 * if fd==-1 and str!=NULL, then read from the string.
650 setsource(char *name
, int path
, int fd
, char *str
, int wrap
)
652 Source
*s
= new(Source
);
669 /* slop at right for EOB */
673 s
->inb
= domalloc(len
+ 4);
675 strncpy((char *) s
->inp
, str
, len
);
679 s
->inb
= domalloc(INS
+ 4);
683 s
->inl
= s
->inp
+ len
;
684 s
->inl
[0] = s
->inl
[1] = EOB
;
692 Source
*s
= cursource
;
706 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */