1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
33 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
41 * lexical FSM encoding
42 * when in state state, and one of the characters
43 * in ch arrives, enter nextstate.
44 * States >= S_SELF are either final, or at least require special action.
45 * In 'fsm' there is a line for each state X charset X nextstate.
46 * List chars that overwrite previous entries later (e.g. C_ALPH
47 * can be overridden by '_' by a later entry); C_XX is the
48 * universal set, and should always be first.
49 * States above S_SELF are represented in the big table as negative values.
50 * S_SELF and S_SELFB encode the resulting token type in the upper bits.
51 * These actions differ in that S_SELF doesn't have a lookahead char,
54 * The encoding is blown out into a big table for time-efficiency.
56 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
/*
 * Pack a token type and an action/state code into one table entry:
 * the token type is shifted left by 7 bits and the action is added
 * into the low bits.  Both arguments are parenthesized so that
 * expression arguments (e.g. `a | b' or `c ? x : y') are packed as a
 * unit instead of being re-associated by operator precedence.
 */
#define ACT(tok,act) (((tok) << 7) + (act))
/*
 * Extract the 9-bit token type from a packed entry: discard the low
 * 7 bits and mask the next 9.  The argument is parenthesized so that
 * an expression argument is shifted as a whole.
 */
#define GETACT(st) (((st) >> 7) & 0x1ff)
64 /* character classes */
73 START
= 0, NUM1
, NUM2
, NUM3
, ID1
, ST1
, ST2
, ST3
, COM1
, COM2
, COM3
, COM4
,
74 CC1
, CC2
, WS1
, PLUS1
, MINUS1
, STAR1
, SLASH1
, PCT1
, SHARP1
,
75 CIRC1
, GT1
, GT2
, LT1
, LT2
, OR1
, AND1
, ASG1
, NOT1
, DOTS1
,
76 S_SELF
= MAXSTATE
, S_SELFB
, S_EOF
, S_NL
, S_EOFSTR
,
77 S_STNL
, S_COMNL
, S_EOFCOM
, S_COMMENT
, S_EOB
, S_WS
, S_NAME
84 int state
; /* if in this state */
85 uchar ch
[4]; /* and see one of these characters */
86 int nextstate
; /* enter this state if +ve */
89 /*const*/ struct fsm fsm
[] = {
91 { START
, {C_XX
}, ACT(UNCLASS
, S_SELF
) },
92 { START
, {' ', '\t', '\v'}, WS1
},
93 { START
, {C_NUM
}, NUM1
},
94 { START
, {'.'}, NUM3
},
95 { START
, {C_ALPH
}, ID1
},
96 { START
, {'L'}, ST1
},
97 { START
, {'"'}, ST2
},
98 { START
, {'\''}, CC1
},
99 { START
, {'/'}, COM1
},
100 { START
, {EOFC
}, S_EOF
},
101 { START
, {'\n'}, S_NL
},
102 { START
, {'-'}, MINUS1
},
103 { START
, {'+'}, PLUS1
},
104 { START
, {'<'}, LT1
},
105 { START
, {'>'}, GT1
},
106 { START
, {'='}, ASG1
},
107 { START
, {'!'}, NOT1
},
108 { START
, {'&'}, AND1
},
109 { START
, {'|'}, OR1
},
110 { START
, {'#'}, SHARP1
},
111 { START
, {'%'}, PCT1
},
112 { START
, {'['}, ACT(SBRA
, S_SELF
) },
113 { START
, {']'}, ACT(SKET
, S_SELF
) },
114 { START
, {'('}, ACT(LP
, S_SELF
) },
115 { START
, {')'}, ACT(RP
, S_SELF
) },
116 { START
, {'*'}, STAR1
},
117 { START
, {','}, ACT(COMMA
, S_SELF
) },
118 { START
, {'?'}, ACT(QUEST
, S_SELF
) },
119 { START
, {':'}, ACT(COLON
, S_SELF
) },
120 { START
, {';'}, ACT(SEMIC
, S_SELF
) },
121 { START
, {'{'}, ACT(CBRA
, S_SELF
) },
122 { START
, {'}'}, ACT(CKET
, S_SELF
) },
123 { START
, {'~'}, ACT(TILDE
, S_SELF
) },
124 { START
, {'^'}, CIRC1
},
127 { NUM1
, {C_XX
}, ACT(NUMBER
, S_SELFB
) },
128 { NUM1
, {C_NUM
, C_ALPH
, '.'}, NUM1
},
129 { NUM1
, {'E', 'e'}, NUM2
},
130 { NUM1
, {'_'}, ACT(NUMBER
, S_SELFB
) },
132 /* saw possible start of exponent, digits-e */
133 { NUM2
, {C_XX
}, ACT(NUMBER
, S_SELFB
) },
134 { NUM2
, {'+', '-'}, NUM1
},
135 { NUM2
, {C_NUM
, C_ALPH
}, NUM1
},
136 { NUM2
, {'_'}, ACT(NUMBER
, S_SELFB
) },
138 /* saw a '.', which could be a number or an operator */
139 { NUM3
, {C_XX
}, ACT(DOT
, S_SELFB
) },
140 { NUM3
, {'.'}, DOTS1
},
141 { NUM3
, {C_NUM
}, NUM1
},
143 { DOTS1
, {C_XX
}, ACT(UNCLASS
, S_SELFB
) },
144 { DOTS1
, {C_NUM
}, NUM1
},
145 { DOTS1
, {'.'}, ACT(ELLIPS
, S_SELF
) },
147 /* saw a letter or _ */
148 { ID1
, {C_XX
}, ACT(NAME
, S_NAME
) },
149 { ID1
, {C_ALPH
, C_NUM
}, ID1
},
151 /* saw L (start of wide string?) */
152 { ST1
, {C_XX
}, ACT(NAME
, S_NAME
) },
153 { ST1
, {C_ALPH
, C_NUM
}, ID1
},
155 { ST1
, {'\''}, CC1
},
157 /* saw " beginning string */
158 { ST2
, {C_XX
}, ST2
},
159 { ST2
, {'"'}, ACT(STRING
, S_SELF
) },
160 { ST2
, {'\\'}, ST3
},
161 { ST2
, {'\n'}, S_STNL
},
162 { ST2
, {EOFC
}, S_EOFSTR
},
164 /* saw \ in string */
165 { ST3
, {C_XX
}, ST2
},
166 { ST3
, {'\n'}, S_STNL
},
167 { ST3
, {EOFC
}, S_EOFSTR
},
169 /* saw ' beginning character const */
170 { CC1
, {C_XX
}, CC1
},
171 { CC1
, {'\''}, ACT(CCON
, S_SELF
) },
172 { CC1
, {'\\'}, CC2
},
173 { CC1
, {'\n'}, S_STNL
},
174 { CC1
, {EOFC
}, S_EOFSTR
},
177 { CC2
, {C_XX
}, CC1
},
178 { CC2
, {'\n'}, S_STNL
},
179 { CC2
, {EOFC
}, S_EOFSTR
},
181 /* saw /, perhaps start of comment */
182 { COM1
, {C_XX
}, ACT(SLASH
, S_SELFB
) },
183 { COM1
, {'='}, ACT(ASSLASH
, S_SELF
) },
184 { COM1
, {'*'}, COM2
},
185 { COM1
, {'/'}, COM4
},
187 /* saw / followed by *, start of comment */
188 { COM2
, {C_XX
}, COM2
},
189 { COM2
, {'\n'}, S_COMNL
},
190 { COM2
, {'*'}, COM3
},
191 { COM2
, {EOFC
}, S_EOFCOM
},
193 /* saw the * possibly ending a comment */
194 { COM3
, {C_XX
}, COM2
},
195 { COM3
, {'\n'}, S_COMNL
},
196 { COM3
, {'*'}, COM3
},
197 { COM3
, {'/'}, S_COMMENT
},
200 { COM4
, {C_XX
}, COM4
},
201 { COM4
, {'\n'}, S_COMMENT
},
202 /* { COM4, {'\n'}, S_NL }, */
203 { COM4
, {EOFC
}, S_EOFCOM
},
205 /* saw white space, eat it up */
206 { WS1
, {C_XX
}, S_WS
},
207 { WS1
, {'\t', '\v', ' '}, WS1
},
209 /* saw -, check --, -=, -> */
210 { MINUS1
, {C_XX
}, ACT(MINUS
, S_SELFB
) },
211 { MINUS1
, {'-'}, ACT(MMINUS
, S_SELF
) },
212 { MINUS1
, {'='}, ACT(ASMINUS
, S_SELF
) },
213 { MINUS1
, {'>'}, ACT(ARROW
, S_SELF
) },
215 /* saw +, check ++, += */
216 { PLUS1
, {C_XX
}, ACT(PLUS
, S_SELFB
) },
217 { PLUS1
, {'+'}, ACT(PPLUS
, S_SELF
) },
218 { PLUS1
, {'='}, ACT(ASPLUS
, S_SELF
) },
220 /* saw <, check <<, <<=, <= */
221 { LT1
, {C_XX
}, ACT(LT
, S_SELFB
) },
223 { LT1
, {'='}, ACT(LEQ
, S_SELF
) },
224 { LT2
, {C_XX
}, ACT(LSH
, S_SELFB
) },
225 { LT2
, {'='}, ACT(ASLSH
, S_SELF
) },
227 /* saw >, check >>, >>=, >= */
228 { GT1
, {C_XX
}, ACT(GT
, S_SELFB
) },
230 { GT1
, {'='}, ACT(GEQ
, S_SELF
) },
231 { GT2
, {C_XX
}, ACT(RSH
, S_SELFB
) },
232 { GT2
, {'='}, ACT(ASRSH
, S_SELF
) },
235 { ASG1
, {C_XX
}, ACT(ASGN
, S_SELFB
) },
236 { ASG1
, {'='}, ACT(EQ
, S_SELF
) },
239 { NOT1
, {C_XX
}, ACT(NOT
, S_SELFB
) },
240 { NOT1
, {'='}, ACT(NEQ
, S_SELF
) },
243 { AND1
, {C_XX
}, ACT(AND
, S_SELFB
) },
244 { AND1
, {'&'}, ACT(LAND
, S_SELF
) },
245 { AND1
, {'='}, ACT(ASAND
, S_SELF
) },
248 { OR1
, {C_XX
}, ACT(OR
, S_SELFB
) },
249 { OR1
, {'|'}, ACT(LOR
, S_SELF
) },
250 { OR1
, {'='}, ACT(ASOR
, S_SELF
) },
253 { SHARP1
, {C_XX
}, ACT(SHARP
, S_SELFB
) },
254 { SHARP1
, {'#'}, ACT(DSHARP
, S_SELF
) },
257 { PCT1
, {C_XX
}, ACT(PCT
, S_SELFB
) },
258 { PCT1
, {'='}, ACT(ASPCT
, S_SELF
) },
261 { STAR1
, {C_XX
}, ACT(STAR
, S_SELFB
) },
262 { STAR1
, {'='}, ACT(ASSTAR
, S_SELF
) },
265 { CIRC1
, {C_XX
}, ACT(CIRC
, S_SELFB
) },
266 { CIRC1
, {'='}, ACT(ASCIRC
, S_SELF
) },
268 { -1, {'\0'}, S_SELF
}
271 /* first index is char, second is state */
272 /* increase #states to power of 2 to encourage use of shift */
/* Expanded transition table: 256 rows (one per byte value) by MAXSTATE
 * columns; entries are stored as short and are filled in from fsm[]. */
273 short bigfsm
[256][MAXSTATE
];
278 /* const */ struct fsm
*fp
;
281 for (fp
= fsm
; fp
->state
>= 0; fp
++)
283 for (i
= 0; fp
->ch
[i
]; i
++)
285 nstate
= fp
->nextstate
;
286 if (nstate
>= S_SELF
)
291 case C_XX
: /* random characters */
292 for (j
= 0; j
< 256; j
++)
293 bigfsm
[j
][fp
->state
] = (short) nstate
;
296 for (j
= 0; j
<= 256; j
++)
297 if (('a' <= j
&& j
<= 'z') || ('A' <= j
&& j
<= 'Z')
299 bigfsm
[j
][fp
->state
] = (short) nstate
;
302 for (j
= '0'; j
<= '9'; j
++)
303 bigfsm
[j
][fp
->state
] = (short) nstate
;
306 bigfsm
[fp
->ch
[i
]][fp
->state
] = (short) nstate
;
312 * install special cases for ? (trigraphs), \ (splicing), runes, and
315 for (i
= 0; i
< MAXSTATE
; i
++)
317 for (j
= 0; j
< 0xFF; j
++)
318 if (j
== '?' || j
== '\\' || j
== '\n' || j
== '\r')
320 if (bigfsm
[j
][i
] > 0)
321 bigfsm
[j
][i
] = ~bigfsm
[j
][i
];
322 bigfsm
[j
][i
] &= ~QBSBIT
;
324 bigfsm
[EOB
][i
] = ~S_EOB
;
325 if (bigfsm
[EOFC
][i
] >= 0)
326 bigfsm
[EOFC
][i
] = ~S_EOF
;
331 * fill in a row of tokens from input, terminated by NL or END
332 * First token is put at trp->lp.
333 * Reset is non-zero when the input buffer can be "rewound."
334 * The value is a flag indicating that possible macros have
335 * been seen in the row.
338 gettokens(Tokenrow
* trp
, int reset
)
340 register int c
, state
, oldstate
;
342 register Token
*tp
, *maxp
;
344 Source
*s
= cursource
;
353 { /* nothing in buffer */
356 ip
= s
->inp
= s
->inb
;
359 if (ip
>= s
->inb
+ (3 * INS
/ 4))
361 memmove(s
->inb
, ip
, 4 + s
->inl
- ip
);
362 s
->inl
= s
->inb
+ (s
->inl
- ip
);
363 ip
= s
->inp
= s
->inb
;
366 maxp
= &trp
->bp
[trp
->max
];
374 tp
= growtokenrow(trp
);
375 maxp
= &trp
->bp
[trp
->max
];
388 if ((state
= bigfsm
[c
][state
]) >= 0)
396 switch (state
& 0177)
402 tp
->type
= (unsigned char) GETACT(state
);
403 tp
->len
= ip
- tp
->t
;
407 case S_NAME
: /* like S_SELFB but with nmac check */
409 tp
->len
= ip
- tp
->t
;
410 nmac
|= quicklook(tp
->t
[0], tp
->len
> 1 ? tp
->t
[1] : 0);
415 tp
->wslen
= ip
- tp
->t
;
421 if ((state
& QBSBIT
) == 0)
432 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
434 if (s
->inp
[1] == '\r')
436 memmove(s
->inp
+ 1, s
->inp
+ 2, s
->inl
- s
->inp
+ 2);
445 while (s
->inp
+ 1 >= s
->inl
&& fillbuf(s
) != EOF
);
447 if (s
->inp
[1] == '\n')
449 memmove(s
->inp
, s
->inp
+ 1, s
->inl
- s
->inp
+ 1);
460 { /* check trigraph */
478 error(WARNING
, "Lexical botch in cpp");
493 if (tp
!= trp
->bp
&& (tp
- 1)->type
!= NL
&& cursource
->fd
!= -1)
494 error(WARNING
, "No newline at end of file");
499 error(ERROR
, "Unterminated string or char const");
511 error(FATAL
, "EOF in string or char constant");
522 error(WARNING
, "EOF inside comment");
536 /* s->lineinc = 0; */
545 tp
->len
= ip
- tp
->t
;
550 /* have seen ?; handle the trigraph it starts (if any) else 0 */
556 while (s
->inp
+ 2 >= s
->inl
&& fillbuf(s
) != EOF
);
558 if (s
->inp
[1] != '?')
594 memmove(s
->inp
+ 1, s
->inp
+ 3, s
->inl
- s
->inp
+ 2);
605 while (s
->inp
+ 2 >= s
->inl
&& fillbuf(s
) != EOF
);
607 /* skip DOS line ends */
608 if (((s
->inp
[n
] == '\r') && (s
->inp
[n
+1] == '\n')) ||
609 ((s
->inp
[n
] == '\n') && (s
->inp
[n
+1] == '\r')))
612 if ((s
->inp
[n
] == '\n') || (s
->inp
[n
] == '\r'))
614 memmove(s
->inp
, s
->inp
+ n
+ 1, s
->inl
- s
->inp
+ n
+ 2);
626 if (s
->fd
< 0 || (n
= read(s
->fd
, (char *) s
->inl
, INS
/ 8)) <= 0)
629 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOB
;
632 s
->inl
[0] = s
->inl
[1] = s
->inl
[2] = s
->inl
[3] = EOFC
;
639 * Push down to new source of characters.
640 * If fd>0 and str==NULL, then from a file `name';
641 * if fd==-1 and str, then from the string.
644 setsource(char *name
, int path
, int fd
, char *str
, int wrap
)
646 Source
*s
= new(Source
);
663 /* slop at right for EOB */
667 s
->inb
= domalloc(len
+ 4);
669 strncpy((char *) s
->inp
, str
, len
);
673 s
->inb
= domalloc(INS
+ 4);
677 s
->inl
= s
->inp
+ len
;
678 s
->inl
[0] = s
->inl
[1] = EOB
;
686 Source
*s
= cursource
;