1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
23 #if (defined(_WIN32) || defined(__IBMC__))
30 * lexical FSM encoding
 * When the machine is in state 'state' and one of the characters
 * listed in 'ch' arrives, it enters 'nextstate'.
33 * States >= S_SELF are either final, or at least require special action.
34 * In 'fsm' there is a line for each state X charset X nextstate.
 * List chars that overwrite previous entries later (e.g. C_ALPH
 * can be overridden by '_' by a later entry); C_XX is the
 * universal set, and should always be first.
38 * States above S_SELF are represented in the big table as negative values.
39 * S_SELF and S_SELFB encode the resulting token type in the upper bits.
40 * These actions differ in that S_SELF doesn't have a lookahead char,
43 * The encoding is blown out into a big table for time-efficiency.
45 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
/*
 * Pack a token type and an action/state code into a single table value:
 * the action occupies the low 7 bits and the token type sits above them.
 * Both arguments are fully parenthesized so that callers may pass
 * arbitrary expressions (e.g. A | B) without precedence surprises.
 */
#define ACT(tok,act)    (((tok) << 7) + (act))
/*
 * Extract the 9-bit token type stored above the low 7 bits of a packed
 * table value (the token part written by ACT).
 * The argument is parenthesized so that an expression may be passed safely.
 */
#define GETACT(st)      (((st) >> 7) & 0x1ff)
53 /* character classes */
60 START
= 0, NUM1
, NUM2
, NUM3
, ID1
, ST1
, ST2
, ST3
, COM1
, COM2
, COM3
, COM4
,
61 CC1
, CC2
, WS1
, PLUS1
, MINUS1
, STAR1
, PCT1
, SHARP1
,
62 CIRC1
, GT1
, GT2
, LT1
, LT2
, OR1
, AND1
, ASG1
, NOT1
, DOTS1
,
63 S_SELF
= MAXSTATE
, S_SELFB
, S_EOF
, S_NL
, S_EOFSTR
,
64 S_STNL
, S_COMNL
, S_EOFCOM
, S_COMMENT
, S_EOB
, S_WS
, S_NAME
69 int state
; /* if in this state */
70 uchar ch
[4]; /* and see one of these characters */
71 int const nextstate
; /* enter this state if +ve */
74 static const struct fsm fsm
[] = {
76 {START
, {C_XX
}, ACT(UNCLASS
, S_SELF
)},
77 {START
, {' ', '\t', '\v'}, WS1
},
78 {START
, {C_NUM
}, NUM1
},
80 {START
, {C_ALPH
}, ID1
},
85 {START
, {EOFC
}, S_EOF
},
86 {START
, {'\n'}, S_NL
},
87 {START
, {'-'}, MINUS1
},
88 {START
, {'+'}, PLUS1
},
95 {START
, {'#'}, SHARP1
},
97 {START
, {'['}, ACT(SBRA
, S_SELF
)},
98 {START
, {']'}, ACT(SKET
, S_SELF
)},
99 {START
, {'('}, ACT(LP
, S_SELF
)},
100 {START
, {')'}, ACT(RP
, S_SELF
)},
101 {START
, {'*'}, STAR1
},
102 {START
, {','}, ACT(COMMA
, S_SELF
)},
103 {START
, {'?'}, ACT(QUEST
, S_SELF
)},
104 {START
, {':'}, ACT(COLON
, S_SELF
)},
105 {START
, {';'}, ACT(SEMIC
, S_SELF
)},
106 {START
, {'{'}, ACT(CBRA
, S_SELF
)},
107 {START
, {'}'}, ACT(CKET
, S_SELF
)},
108 {START
, {'~'}, ACT(TILDE
, S_SELF
)},
109 {START
, {'^'}, CIRC1
},
112 {NUM1
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
113 {NUM1
, {C_NUM
, C_ALPH
, '.'}, NUM1
},
114 {NUM1
, {'E', 'e'}, NUM2
},
115 {NUM1
, {'_'}, ACT(NUMBER
, S_SELFB
)},
117 /* saw possible start of exponent, digits-e */
118 {NUM2
, {C_XX
}, ACT(NUMBER
, S_SELFB
)},
119 {NUM2
, {'+', '-'}, NUM1
},
120 {NUM2
, {C_NUM
, C_ALPH
}, NUM1
},
121 {NUM2
, {'_'}, ACT(NUMBER
, S_SELFB
)},
123 /* saw a '.', which could be a number or an operator */
124 {NUM3
, {C_XX
}, ACT(DOT
, S_SELFB
)},
125 {NUM3
, {'.'}, DOTS1
},
126 {NUM3
, {C_NUM
}, NUM1
},
128 {DOTS1
, {C_XX
}, ACT(UNCLASS
, S_SELFB
)},
129 {DOTS1
, {C_NUM
}, NUM1
},
130 {DOTS1
, {'.'}, ACT(ELLIPS
, S_SELF
)},
132 /* saw a letter or _ */
133 {ID1
, {C_XX
}, ACT(NAME
, S_NAME
)},
134 {ID1
, {C_ALPH
, C_NUM
}, ID1
},
136 /* saw L (start of wide string?) */
137 {ST1
, {C_XX
}, ACT(NAME
, S_NAME
)},
138 {ST1
, {C_ALPH
, C_NUM
}, ID1
},
142 /* saw " beginning string */
144 {ST2
, {'"'}, ACT(STRING
, S_SELF
)},
146 {ST2
, {'\n'}, S_STNL
},
147 {ST2
, {EOFC
}, S_EOFSTR
},
149 /* saw \ in string */
151 {ST3
, {'\n'}, S_STNL
},
152 {ST3
, {EOFC
}, S_EOFSTR
},
154 /* saw ' beginning character const */
156 {CC1
, {'\''}, ACT(CCON
, S_SELF
)},
158 {CC1
, {'\n'}, S_STNL
},
159 {CC1
, {EOFC
}, S_EOFSTR
},
163 {CC2
, {'\n'}, S_STNL
},
164 {CC2
, {EOFC
}, S_EOFSTR
},
166 /* saw /, perhaps start of comment */
167 {COM1
, {C_XX
}, ACT(SLASH
, S_SELFB
)},
168 {COM1
, {'='}, ACT(ASSLASH
, S_SELF
)},
172 /* saw / followed by *, start of comment */
173 {COM2
, {C_XX
}, COM2
},
174 {COM2
, {'\n'}, S_COMNL
},
176 {COM2
, {EOFC
}, S_EOFCOM
},
178 /* saw the * possibly ending a comment */
179 {COM3
, {C_XX
}, COM2
},
180 {COM3
, {'\n'}, S_COMNL
},
182 {COM3
, {'/'}, S_COMMENT
},
185 {COM4
, {C_XX
}, COM4
},
186 {COM4
, {'\n'}, S_NL
},
187 {COM4
, {EOFC
}, S_EOFCOM
},
189 /* saw white space, eat it up */
191 {WS1
, {'\t', '\v', ' '}, WS1
},
193 /* saw -, check --, -=, -> */
194 {MINUS1
, {C_XX
}, ACT(MINUS
, S_SELFB
)},
195 {MINUS1
, {'-'}, ACT(MMINUS
, S_SELF
)},
196 {MINUS1
, {'='}, ACT(ASMINUS
, S_SELF
)},
197 {MINUS1
, {'>'}, ACT(ARROW
, S_SELF
)},
199 /* saw +, check ++, += */
200 {PLUS1
, {C_XX
}, ACT(PLUS
, S_SELFB
)},
201 {PLUS1
, {'+'}, ACT(PPLUS
, S_SELF
)},
202 {PLUS1
, {'='}, ACT(ASPLUS
, S_SELF
)},
204 /* saw <, check <<, <<=, <= */
205 {LT1
, {C_XX
}, ACT(LT
, S_SELFB
)},
207 {LT1
, {'='}, ACT(LEQ
, S_SELF
)},
208 {LT2
, {C_XX
}, ACT(LSH
, S_SELFB
)},
209 {LT2
, {'='}, ACT(ASLSH
, S_SELF
)},
211 /* saw >, check >>, >>=, >= */
212 {GT1
, {C_XX
}, ACT(GT
, S_SELFB
)},
214 {GT1
, {'='}, ACT(GEQ
, S_SELF
)},
215 {GT2
, {C_XX
}, ACT(RSH
, S_SELFB
)},
216 {GT2
, {'='}, ACT(ASRSH
, S_SELF
)},
219 {ASG1
, {C_XX
}, ACT(ASGN
, S_SELFB
)},
220 {ASG1
, {'='}, ACT(EQ
, S_SELF
)},
223 {NOT1
, {C_XX
}, ACT(NOT
, S_SELFB
)},
224 {NOT1
, {'='}, ACT(NEQ
, S_SELF
)},
227 {AND1
, {C_XX
}, ACT(AND
, S_SELFB
)},
228 {AND1
, {'&'}, ACT(LAND
, S_SELF
)},
229 {AND1
, {'='}, ACT(ASAND
, S_SELF
)},
232 {OR1
, {C_XX
}, ACT(OR
, S_SELFB
)},
233 {OR1
, {'|'}, ACT(LOR
, S_SELF
)},
234 {OR1
, {'='}, ACT(ASOR
, S_SELF
)},
237 {SHARP1
, {C_XX
}, ACT(SHARP
, S_SELFB
)},
238 {SHARP1
, {'#'}, ACT(DSHARP
, S_SELF
)},
241 {PCT1
, {C_XX
}, ACT(PCT
, S_SELFB
)},
242 {PCT1
, {'='}, ACT(ASPCT
, S_SELF
)},
245 {STAR1
, {C_XX
}, ACT(STAR
, S_SELFB
)},
246 {STAR1
, {'='}, ACT(ASSTAR
, S_SELF
)},
249 {CIRC1
, {C_XX
}, ACT(CIRC
, S_SELFB
)},
250 {CIRC1
, {'='}, ACT(ASCIRC
, S_SELF
)},
/*
 * Expanded transition table, indexed as bigfsm[character][state]:
 * the first index is the input character (0..255), the second is the
 * current state.
 * The number of states is increased to a power of 2 to encourage the
 * compiler to use a shift when indexing.
 */
static short bigfsm[256][MAXSTATE];
262 const struct fsm
*fp
;
265 for (fp
= fsm
; fp
->state
>= 0; fp
++)
267 for (i
= 0; fp
->ch
[i
]; i
++)
269 nstate
= fp
->nextstate
;
270 if (nstate
>= S_SELF
)
275 case C_XX
: /* random characters */
276 for (j
= 0; j
< 256; j
++)
277 bigfsm
[j
][fp
->state
] = (short) nstate
;
280 for (j
= 0; j
< 256; j
++)
281 if (('a' <= j
&& j
<= 'z') || ('A' <= j
&& j
<= 'Z')
283 bigfsm
[j
][fp
->state
] = (short) nstate
;
286 for (j
= '0'; j
<= '9'; j
++)
287 bigfsm
[j
][fp
->state
] = (short) nstate
;
290 bigfsm
[fp
->ch
[i
]][fp
->state
] = (short) nstate
;
296 * install special cases for ? (trigraphs), \ (splicing), runes, and
299 for (i
= 0; i
< MAXSTATE
; i
++)
301 for (j
= 0; j
< 0xFF; j
++)
302 if (j
== '?' || j
== '\\' || j
== '\n' || j
== '\r')
304 if (bigfsm
[j
][i
] > 0)
305 bigfsm
[j
][i
] = ~bigfsm
[j
][i
];
306 bigfsm
[j
][i
] &= ~QBSBIT
;
308 bigfsm
[EOB
][i
] = ~S_EOB
;
309 if (bigfsm
[EOFC
][i
] >= 0)
310 bigfsm
[EOFC
][i
] = ~S_EOF
;
317 /* do C++ comments? */
318 if ((Cplusplus
== 0) || (Cflag
!= 0))
319 bigfsm
['/'][COM1
] = bigfsm
['x'][COM1
];
 * Fill in a row of tokens from input, terminated by NL or END.
 * The first token is put at trp->lp.
 * Reset is non-zero when the input buffer can be "rewound."
 * The return value is a flag indicating that possible macros have
 * been seen in the row.
330 gettokens(Tokenrow
* trp
, int reset
)
332 int c
, state
, oldstate
;
336 Source
*s
= cursource
;
345 { /* nothing in buffer */
348 ip
= s
->inp
= s
->inb
;
351 if (ip
>= s
->inb
+ (3 * INS
/ 4))
353 memmove(s
->inb
, ip
, 4 + s
->inl
- ip
);
354 s
->inl
= s
->inb
+ (s
->inl
- ip
);
355 ip
= s
->inp
= s
->inb
;
358 maxp
= &trp
->bp
[trp
->max
];
366 tp
= growtokenrow(trp
);
367 // coverity[overrun-local : FALSE] - a multiple of trp->max is allocated, not trp->max itself
368 maxp
= &trp
->bp
[trp
->max
];
380 if ((state
= bigfsm
[c
][state
]) >= 0)
388 switch (state
& 0177)
395 tp
->type
= (unsigned char) GETACT(state
);
396 tp
->len
= ip
- tp
->t
;
400 case S_NAME
: /* like S_SELFB but with nmac check */
402 tp
->len
= ip
- tp
->t
;
403 nmac
|= quicklook(tp
->t
[0], tp
->len
> 1 ? tp
->t
[1] : 0);
408 tp
->wslen
= ip
- tp
->t
;
414 if ((state
& QBSBIT
) == 0)
425 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
427 if (s
->inp
[1] == '\r')
429 memmove(s
->inp
+ 1, s
->inp
+ 2, s
->inl
- s
->inp
+ 2);
438 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
440 if (s
->inp
[1] == '\n')
442 memmove(s
->inp
, s
->inp
+ 1, s
->inl
- s
->inp
+ 1);
453 { /* check trigraph */
471 error(WARNING
, "Lexical botch in cpp");
486 if (tp
!= trp
->bp
&& (tp
- 1)->type
!= NL
&& cursource
->fd
!= -1)
487 error(WARNING
, "No newline at end of file");
492 error(ERROR
, "Unterminated string or char const");
505 error(FATAL
, "EOF in string or char constant");
516 error(WARNING
, "EOF inside comment");
539 tp
->len
= ip
- tp
->t
;
544 /* have seen ?; handle the trigraph it starts (if any) else 0 */
550 while (s
->inp
+ 2 >= s
->inl
&& fillbuf(s
) != EOF
);
552 if (s
->inp
[1] != '?')
588 memmove(s
->inp
+ 1, s
->inp
+ 3, s
->inl
- s
->inp
+ 2);
599 /* skip pending white spaces */
600 while ((s
->inp
[n
] == ' ') || (s
->inp
[n
] == '\t'))
603 if ((s
->inp
+ n
>= s
->inl
) && (fillbuf(s
) == EOF
))
608 while (s
->inp
+ (n
+ 1) >= s
->inl
&& fillbuf(s
) != EOF
);
610 /* skip DOS line ends */
611 if (((s
->inp
[n
] == '\r') && (s
->inp
[n
+1] == '\n')) ||
612 ((s
->inp
[n
] == '\n') && (s
->inp
[n
+1] == '\r')))
615 if ((s
->inp
[n
] == '\n') || (s
->inp
[n
] == '\r'))
617 memmove(s
->inp
, s
->inp
+ n
+ 1, s
->inl
- s
->inp
+ n
+ 2);
629 if (s
->fd
< 0 || (n
= read(s
->fd
, (char *) s
->inl
, INS
/ 8)) <= 0)
632 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOB
;
635 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOFC
;
642 * Push down to new source of characters.
643 * If fd>0 and str==NULL, then from a file `name';
644 * if fd==-1 and str, then from the string.
647 setsource(char *name
, int path
, int fd
, char const *str
, int wrap
)
649 Source
*s
= new(Source
);
666 /* slop at right for EOB */
670 s
->inb
= domalloc(len
+ 4);
672 memcpy((char *) s
->inp
, str
, len
);
676 s
->inb
= domalloc(INS
+ 4);
680 s
->inl
= s
->inp
+ len
;
681 s
->inl
[0] = s
->inl
[1] = EOB
;
689 Source
*s
= cursource
;
703 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */