Fix xslt_process() to ensure that it inserts a NULL terminator after the
[PostgreSQL.git] / src / backend / parser / scan.l
blob6448cf2bcfc90fffe2a5ac019baebd402887e090
1 %{
2 /*-------------------------------------------------------------------------
3  *
4  * scan.l
5  *        lexical scanner for PostgreSQL
6  *
7  * NOTE NOTE NOTE:
8  *
9  * The rules in this file must be kept in sync with psql's lexer!!!
10  *
11  * The rules are designed so that the scanner never has to backtrack,
12  * in the sense that there is always a rule that can match the input
13  * consumed so far (the rule action may internally throw back some input
14  * with yyless(), however).  As explained in the flex manual, this makes
15  * for a useful speed increase --- about a third faster than a plain -CF
16  * lexer, in simple testing.  The extra complexity is mostly in the rules
17  * for handling float numbers and continued string literals.  If you change
18  * the lexical rules, verify that you haven't broken the no-backtrack
19  * property by running flex with the "-b" option and checking that the
20  * resulting "lex.backup" file says that no backing up is needed.
21  *
22  *
23  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
24  * Portions Copyright (c) 1994, Regents of the University of California
25  *
26  * IDENTIFICATION
27  *        $PostgreSQL$
28  *
29  *-------------------------------------------------------------------------
30  */
31 #include "postgres.h"
33 #include <ctype.h>
34 #include <unistd.h>
36 #include "parser/gramparse.h"
37 #include "parser/keywords.h"
38 /* Not needed now that this file is compiled as part of gram.y */
39 /* #include "parser/gram.h" */
40 #include "parser/scansup.h"
41 #include "mb/pg_wchar.h"
44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
45 #undef fprintf
46 #define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))
48 static int              xcdepth = 0;    /* depth of nesting in slash-star comments */
49 static char    *dolqstart;      /* current $foo$ quote start string */
51 /*
52  * GUC variables.  This is a DIRECT violation of the warning given at the
53  * head of gram.y, i.e., flex/bison code must not depend on any GUC variables;
54  * as such, changing their values can induce very unintuitive behavior.
55  * But we shall have to live with it as a short-term thing until the switch
56  * to SQL-standard string syntax is complete.
57  */
58 int                             backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
59 bool                    escape_string_warning = true;
60 bool                    standard_conforming_strings = false;
62 static bool             warn_on_first_escape;
63 static bool             saw_non_ascii = false;
65 /*
66  * literalbuf is used to accumulate literal values when multiple rules
67  * are needed to parse a single literal.  Call startlit to reset buffer
68  * to empty, addlit to add text.  Note that the buffer is palloc'd and
69  * starts life afresh on every parse cycle.
70  */
71 static char        *literalbuf;         /* expandable buffer */
72 static int              literallen;             /* actual current length */
73 static int              literalalloc;   /* current allocated buffer size */
75 #define startlit()  (literalbuf[0] = '\0', literallen = 0)
76 static void addlit(char *ytext, int yleng);
77 static void addlitchar(unsigned char ychar);
78 static char *litbufdup(void);
79 static char *litbuf_udeescape(unsigned char escape);
81 #define lexer_errposition()  scanner_errposition(yylloc)
83 static void check_escape_warning(void);
84 static void check_string_escape_warning(unsigned char ychar);
86 /*
87  * Each call to yylex must set yylloc to the location of the found token
88  * (expressed as a byte offset from the start of the input text).
89  * When we parse a token that requires multiple lexer rules to process,
90  * this should be done in the first such rule, else yylloc will point
91  * into the middle of the token.
92  */
93 #define SET_YYLLOC()  (yylloc = yytext - scanbuf)
95 /* Handles to the buffer that the lexer uses internally */
96 static YY_BUFFER_STATE scanbufhandle;
97 static char *scanbuf;
99 static unsigned char unescape_single_char(unsigned char c);
103 %option 8bit
104 %option never-interactive
105 %option nodefault
106 %option noinput
107 %option nounput
108 %option noyywrap
109 %option prefix="base_yy"
112  * OK, here is a short description of lex/flex rules behavior.
113  * The longest pattern which matches an input string is always chosen.
114  * For equal-length patterns, the first occurring in the rules list is chosen.
115  * INITIAL is the starting state, to which all non-conditional rules apply.
116  * Exclusive states change parsing rules while the state is active.  When in
117  * an exclusive state, only those rules defined for that state apply.
119  * We use exclusive states for quoted strings, extended comments,
120  * and to eliminate parsing troubles for numeric strings.
121  * Exclusive states:
122  *  <xb> bit string literal
123  *  <xc> extended C-style comments
124  *  <xd> delimited identifiers (double-quoted identifiers)
125  *  <xh> hexadecimal numeric string
126  *  <xq> standard quoted strings
127  *  <xe> extended quoted strings (support backslash escape sequences)
128  *  <xdolq> $foo$ quoted strings
129  *  <xui> quoted identifier with Unicode escapes
130  *  <xus> quoted string with Unicode escapes
131  */
133 %x xb
134 %x xc
135 %x xd
136 %x xh
137 %x xe
138 %x xq
139 %x xdolq
140 %x xui
141 %x xus
144  * In order to make the world safe for Windows and Mac clients as well as
145  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
146  * sequence will be seen as two successive newlines, but that doesn't cause
147  * any problems.  Comments that start with -- and extend to the next
148  * newline are treated as equivalent to a single whitespace character.
150  * NOTE a fine point: if there is no newline following --, we will absorb
151  * everything to the end of the input as a comment.  This is correct.  Older
152  * versions of Postgres failed to recognize -- as a comment if the input
153  * did not end with a newline.
155  * XXX perhaps \f (formfeed) should be treated as a newline as well?
157  * XXX if you change the set of whitespace characters, fix scanner_isspace()
158  * to agree, and see also the plpgsql lexer.
159  */
161 space                   [ \t\n\r\f]
162 horiz_space             [ \t\f]
163 newline                 [\n\r]
164 non_newline             [^\n\r]
166 comment                 ("--"{non_newline}*)
168 whitespace              ({space}+|{comment})
171  * SQL requires at least one newline in the whitespace separating
172  * string literals that are to be concatenated.  Silly, but who are we
173  * to argue?  Note that {whitespace_with_newline} should not have * after
174  * it, whereas {whitespace} should generally have a * after it...
175  */
177 special_whitespace              ({space}+|{comment}{newline})
178 horiz_whitespace                ({horiz_space}|{comment})
179 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
182  * To ensure that {quotecontinue} can be scanned without having to back up
183  * if the full pattern isn't matched, we include trailing whitespace in
184  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
185  * except for {quote} followed by whitespace and just one "-" (not two,
186  * which would start a {comment}).  To cover that we have {quotefail}.
187  * The actions for {quotestop} and {quotefail} must throw back characters
188  * beyond the quote proper.
189  */
190 quote                   '
191 quotestop               {quote}{whitespace}*
192 quotecontinue   {quote}{whitespace_with_newline}{quote}
193 quotefail               {quote}{whitespace}*"-"
195 /* Bit string
196  * It is tempting to scan the string for only those characters
197  * which are allowed. However, this leads to silently swallowed
198  * characters if illegal characters are included in the string.
199  * For example, if xbinside is [01] then B'ABCD' is interpreted
200  * as a zero-length string, and the ABCD' is lost!
201  * Better to pass the string forward and let the input routines
202  * validate the contents.
203  */
204 xbstart                 [bB]{quote}
205 xbinside                [^']*
207 /* Hexadecimal number */
208 xhstart                 [xX]{quote}
209 xhinside                [^']*
211 /* National character */
212 xnstart                 [nN]{quote}
214 /* Quoted string that allows backslash escapes */
215 xestart                 [eE]{quote}
216 xeinside                [^\\']+
217 xeescape                [\\][^0-7]
218 xeoctesc                [\\][0-7]{1,3}
219 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
221 /* Extended quote
222  * xqdouble implements embedded quote, ''''
223  */
224 xqstart                 {quote}
225 xqdouble                {quote}{quote}
226 xqinside                [^']+
228 /* $foo$ style quotes ("dollar quoting")
229  * The quoted string starts with $foo$ where "foo" is an optional string
230  * in the form of an identifier, except that it may not contain "$", 
231  * and extends to the first occurrence of an identical string.  
232  * There is *no* processing of the quoted text.
234  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
235  * fails to match its trailing "$".
236  */
237 dolq_start              [A-Za-z\200-\377_]
238 dolq_cont               [A-Za-z\200-\377_0-9]
239 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
240 dolqfailed              \${dolq_start}{dolq_cont}*
241 dolqinside              [^$]+
243 /* Double quote
244  * Allows embedded spaces and other special characters into identifiers.
245  */
246 dquote                  \"
247 xdstart                 {dquote}
248 xdstop                  {dquote}
249 xddouble                {dquote}{dquote}
250 xdinside                [^"]+
252 /* Unicode escapes */
253 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
254 /* error rule to avoid backup */
255 uescapefail             ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
257 /* Quoted identifier with Unicode escapes */
258 xuistart                [uU]&{dquote}
259 xuistop1                {dquote}{whitespace}*{uescapefail}?
260 xuistop2                {dquote}{whitespace}*{uescape}
262 /* Quoted string with Unicode escapes */
263 xusstart                [uU]&{quote}
264 xusstop1                {quote}{whitespace}*{uescapefail}?
265 xusstop2                {quote}{whitespace}*{uescape}
267 /* error rule to avoid backup */
268 xufailed                [uU]&
271 /* C-style comments
273  * The "extended comment" syntax closely resembles allowable operator syntax.
274  * The tricky part here is to get lex to recognize a string starting with
275  * slash-star as a comment, when interpreting it as an operator would produce
276  * a longer match --- remember lex will prefer a longer match!  Also, if we
277  * have something like plus-slash-star, lex will think this is a 3-character
278  * operator whereas we want to see it as a + operator and a comment start.
279  * The solution is two-fold:
280  * 1. append {op_chars}* to xcstart so that it matches as much text as
281  *    {operator} would. Then the tie-breaker (first matching rule of same
282  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
283  *    in case it contains a star-slash that should terminate the comment.
284  * 2. In the operator rule, check for slash-star within the operator, and
285  *    if found throw it back with yyless().  This handles the plus-slash-star
286  *    problem.
287  * Dash-dash comments have similar interactions with the operator rule.
288  */
289 xcstart                 \/\*{op_chars}*
290 xcstop                  \*+\/
291 xcinside                [^*/]+
293 digit                   [0-9]
294 ident_start             [A-Za-z\200-\377_]
295 ident_cont              [A-Za-z\200-\377_0-9\$]
297 identifier              {ident_start}{ident_cont}*
299 typecast                "::"
302  * "self" is the set of chars that should be returned as single-character
303  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
304  * which can be one or more characters long (but if a single-char token
305  * appears in the "self" set, it is not to be returned as an Op).  Note
306  * that the sets overlap, but each has some chars that are not in the other.
308  * If you change either set, adjust the character lists appearing in the
309  * rule for "operator"!
310  */
311 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
312 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
313 operator                {op_chars}+
315 /* We no longer allow unary minus in numbers.
316  * Instead we pass it separately to the parser, where it gets
317  * coerced via doNegate() -- Leon aug 20 1999
318  *
319  * {realfail1} and {realfail2} are added to prevent the need for scanner
320  * backup when the {real} rule fails to match completely.
321  */
323 integer                 {digit}+
324 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
325 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
326 realfail1               ({integer}|{decimal})[Ee]
327 realfail2               ({integer}|{decimal})[Ee][-+]
329 param                   \${integer}
331 other                   .
334  * Dollar quoted strings are totally opaque, and no escaping is done on them.
335  * Other quoted strings must allow some special characters such as single-quote
336  *  and newline.
337  * Embedded single-quotes are implemented both in the SQL standard
338  *  style of two adjacent single quotes "''" and in the Postgres/Java style
339  *  of escaped-quote "\'".
340  * Other embedded escaped characters are matched explicitly and the leading
341  *  backslash is dropped from the string.
342  * Note that xcstart must appear before operator, as explained above!
343  *  Also whitespace (comment) must appear before operator.
344  */
348 {whitespace}    {
349                                         /* ignore */
350                                 }
352 {xcstart}               {
353                                         /* Set location in case of syntax error in comment */
354                                         SET_YYLLOC();
355                                         xcdepth = 0;
356                                         BEGIN(xc);
357                                         /* Put back any characters past slash-star; see above */
358                                         yyless(2);
359                                 }
361 <xc>{xcstart}   {
362                                         xcdepth++;
363                                         /* Put back any characters past slash-star; see above */
364                                         yyless(2);
365                                 }
367 <xc>{xcstop}    {
368                                         /* Close one nesting level; closing the outermost comment resumes normal scanning */
369                                         if (xcdepth > 0)
370                                                 xcdepth--;
371                                         else
372                                                 BEGIN(INITIAL);
373                                 }
374 <xc>{xcinside}  {
375                                         /* ignore */
376                                 }
378 <xc>{op_chars}  {
379                                         /* ignore */
380                                 }
382 <xc>\*+                 {
383                                         /* ignore */
384                                 }
386 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
388 {xbstart}               {
389                                         /* Binary bit type.
390                                          * At some point we should simply pass the string
391                                          * forward to the parser and label it there.
392                                          * In the meantime, place a leading "b" on the string
393                                          * to mark it for the input routine as a binary string.
394                                          */
395                                         SET_YYLLOC();
396                                         BEGIN(xb);
397                                         startlit();
398                                         addlitchar('b');
399                                 }
400 <xb>{quotestop} |
401 <xb>{quotefail} {
402                                         yyless(1);
403                                         BEGIN(INITIAL);
404                                         yylval.str = litbufdup();
405                                         return BCONST;
406                                 }
407 <xh>{xhinside}  |
408 <xb>{xbinside}  {
409                                         addlit(yytext, yyleng);
410                                 }
411 <xh>{quotecontinue}     |
412 <xb>{quotecontinue}     {
413                                         /* ignore */
414                                 }
415 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
417 {xhstart}               {
418                                         /* Hexadecimal bit type.
419                                          * At some point we should simply pass the string
420                                          * forward to the parser and label it there.
421                                          * In the meantime, place a leading "x" on the string
422                                          * to mark it for the input routine as a hex string.
423                                          */
424                                         SET_YYLLOC();
425                                         BEGIN(xh);
426                                         startlit();
427                                         addlitchar('x');
428                                 }
429 <xh>{quotestop} |
430 <xh>{quotefail} {
431                                         yyless(1);
432                                         BEGIN(INITIAL);
433                                         yylval.str = litbufdup();
434                                         return XCONST;
435                                 }
436 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
438 {xnstart}               {
439                                         /* National character.
440                                          * We will pass this along as a normal character string,
441                                          * but preceded with an internally-generated "NCHAR".
442                                          */
443                                         const ScanKeyword *keyword;
445                                         SET_YYLLOC();
446                                         yyless(1);                              /* eat only 'n' this time */
447                                         /* nchar had better be a keyword! */
448                                         keyword = ScanKeywordLookup("nchar");
449                                         Assert(keyword != NULL);
450                                         yylval.keyword = keyword->name;
451                                         return keyword->value;
452                                 }
454 {xqstart}               {
455                                         /* Start of a plain quoted literal: note the token position and reset per-literal state */
456                                         SET_YYLLOC();
457                                         startlit();
458                                         saw_non_ascii = false;
459                                         warn_on_first_escape = true;
460                                         /* Backslash escapes are processed (state xe) unless standard_conforming_strings is on (state xq) */
461                                         BEGIN(standard_conforming_strings ? xq : xe);
463                                 }
464 {xestart}               {
465                                         warn_on_first_escape = false;
466                                         saw_non_ascii = false;
467                                         SET_YYLLOC();
468                                         BEGIN(xe);
469                                         startlit();
470                                 }
471 {xusstart}              {
472                                         SET_YYLLOC();
473                                         if (!standard_conforming_strings)
474                                                 ereport(ERROR,
475                                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
476                                                                  errmsg("unsafe use of string constant with Unicode escapes"),
477                                                                  errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
478                                                                  lexer_errposition()));
479                                         BEGIN(xus);
480                                         startlit();
481                                 }
482 <xq,xe>{quotestop}      |
483 <xq,xe>{quotefail} {
484                                         yyless(1);      /* the match may extend past the quote (whitespace, comments, a dash); keep only the quote */
485                                         BEGIN(INITIAL);
486                                         /*
487                                          * The octal and hex escape rules set saw_non_ascii when they add a
488                                          * NUL or high-bit byte; re-check the literal's validity in that case.
489                                          */
490                                         if (saw_non_ascii)
491                                                 pg_verifymbstr(literalbuf, literallen, false);
492                                         yylval.str = litbufdup();       /* hand the accumulated literal text to the parser */
493                                         return SCONST;
494                                 }
495 <xus>{xusstop1} {
496                                         /* throw back all but the quote */
497                                         yyless(1);
498                                         BEGIN(INITIAL);
499                                         yylval.str = litbuf_udeescape('\\');
500                                         return SCONST;
501                                 }
502 <xus>{xusstop2} {       /* closing quote followed by a complete UESCAPE clause */
503                                         BEGIN(INITIAL);
504                                         yylval.str = litbuf_udeescape(yytext[yyleng-2]);        /* the match ends in quote, escape character, quote, so yytext[yyleng-2] is the escape character */
505                                         return SCONST;
506                                 }
507 <xq,xe,xus>{xqdouble} {
508                                         addlitchar('\'');
509                                 }
510 <xq,xus>{xqinside}  {
511                                         addlit(yytext, yyleng);
512                                 }
513 <xe>{xeinside}  {
514                                         addlit(yytext, yyleng);
515                                 }
516 <xe>{xeescape}  {       /* backslash followed by one character that is not an octal digit */
517                                         if (yytext[1] == '\'')  /* backslash-quote: allowed only if the backslash_quote setting permits it */
518                                         {
519                                                 if (backslash_quote == BACKSLASH_QUOTE_OFF ||
520                                                         (backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
521                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
522                                                         ereport(ERROR,
523                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
524                                                                          errmsg("unsafe use of \\' in a string literal"),
525                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
526                                                                          lexer_errposition()));
527                                         }
528                                         check_string_escape_warning(yytext[1]);
529                                         addlitchar(unescape_single_char(yytext[1]));    /* append the translated character; the backslash itself is dropped */
530                                 }
531 <xe>{xeoctesc}  {       /* backslash followed by one to three octal digits */
532                                         unsigned char c = strtoul(yytext+1, NULL, 8);   /* yytext+1 skips the backslash; NOTE(review): three digits can exceed 0377, and the conversion to unsigned char keeps only the low byte */
534                                         check_escape_warning();
535                                         addlitchar(c);
536                                         if (c == '\0' || IS_HIGHBIT_SET(c))     /* such a byte may need re-validation; the closing-quote rule checks this flag */
537                                                 saw_non_ascii = true;
538                                 }
539 <xe>{xehexesc}  {       /* backslash, x, then one or two hex digits */
540                                         unsigned char c = strtoul(yytext+2, NULL, 16);  /* yytext+2 skips the backslash and the x; two hex digits always fit in one byte */
542                                         check_escape_warning();
543                                         addlitchar(c);
544                                         if (c == '\0' || IS_HIGHBIT_SET(c))     /* such a byte may need re-validation; the closing-quote rule checks this flag */
545                                                 saw_non_ascii = true;
546                                 }
547 <xq,xe,xus>{quotecontinue} {
548                                         /* ignore */
549                                 }
550 <xe>.                   {
551                                         /* This is only needed for \ just before EOF */
552                                         addlitchar(yytext[0]);
553                                 }
554 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
556 {dolqdelim}             {
557                                         SET_YYLLOC();
558                                         dolqstart = pstrdup(yytext);
559                                         BEGIN(xdolq);
560                                         startlit();
561                                 }
562 {dolqfailed}    {
563                                         SET_YYLLOC();
564                                         /* throw back all but the initial "$" */
565                                         yyless(1);
566                                         /* and treat it as {other} */
567                                         return yytext[0];
568                                 }
569 <xdolq>{dolqdelim} {
570                                         if (strcmp(yytext, dolqstart) != 0)
571                                         {
572                                                 /*
573                                                  * Not our terminator.  Copy all but the trailing $ into the literal
574                                                  * and rescan from that $; consider $delim$...$junk$delim$
575                                                  */
576                                                 addlit(yytext, yyleng-1);
577                                                 yyless(yyleng-1);
578                                         }
579                                         else
580                                         {
581                                                 /* Same delimiter that opened the string: the literal is complete */
582                                                 pfree(dolqstart);
583                                                 BEGIN(INITIAL);
584                                                 yylval.str = litbufdup();
585                                                 return SCONST;
586                                         }
587                                 }
588 <xdolq>{dolqinside} {
589                                         addlit(yytext, yyleng);
590                                 }
591 <xdolq>{dolqfailed} {
592                                         addlit(yytext, yyleng);
593                                 }
594 <xdolq>.                {
595                                         /* This is only needed for $ inside the quoted text */
596                                         addlitchar(yytext[0]);
597                                 }
598 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
600 {xdstart}               {
601                                         SET_YYLLOC();
602                                         BEGIN(xd);
603                                         startlit();
604                                 }
605 {xuistart}              {
606                                         SET_YYLLOC();
607                                         BEGIN(xui);
608                                         startlit();
609                                 }
610 <xd>{xdstop}    {
611                                         /* Closing double quote: return the accumulated text as an identifier */
612                                         BEGIN(INITIAL);
613                                         if (literallen == 0)
614                                                 yyerror("zero-length delimited identifier");
615                                         yylval.str = litbufdup();
616                                         /* Over-long names are cut down, with the warning flag set */
617                                         if (literallen >= NAMEDATALEN)
618                                                 truncate_identifier(yylval.str, literallen, true);
619                                         return IDENT;
621                                 }
622 <xui>{xuistop1} {
623                                         int                     identlen;
624                                         /* End of a U& quoted identifier with no complete UESCAPE clause: de-escape with the default escape character, backslash */
625                                         BEGIN(INITIAL);
626                                         if (literallen == 0)
627                                                 yyerror("zero-length delimited identifier");
628                                         yylval.str = litbuf_udeescape('\\');
629                                         identlen = strlen(yylval.str);  /* measure the de-escaped result; literallen still counts the raw escape sequences */
630                                         if (identlen >= NAMEDATALEN)
631                                                 truncate_identifier(yylval.str, identlen, true);
632                                         /* throw back all but the quote */
633                                         yyless(1);
634                                         return IDENT;
635                                 }
636 <xui>{xuistop2} {
637                                         int                     identlen;
638                                         /* End of a U& quoted identifier with a complete UESCAPE clause; the match ends in quote, escape character, quote */
639                                         BEGIN(INITIAL);
640                                         if (literallen == 0)
641                                                 yyerror("zero-length delimited identifier");
642                                         yylval.str = litbuf_udeescape(yytext[yyleng - 2]);
643                                         identlen = strlen(yylval.str);  /* measure the de-escaped result; literallen still counts the raw escape sequences */
644                                         if (identlen >= NAMEDATALEN)
645                                                 truncate_identifier(yylval.str, identlen, true);
646                                         return IDENT;
647                                 }
648 <xd,xui>{xddouble}      {
649                                         addlitchar('"');
650                                 }
651 <xd,xui>{xdinside}      {
652                                         addlit(yytext, yyleng);
653                                 }
654 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
656 {xufailed}      {
657                                         /* U& with no quote after it: only the leading u/U is returned as an identifier */
658                                         SET_YYLLOC();
659                                         /* keep just the first character so that the & is rescanned */
660                                         yyless(1);
661                                         /* yytext and yyleng now describe the one-letter identifier */
662                                         yylval.str = downcase_truncate_identifier(yytext, yyleng, true);
663                                         return IDENT;
666                                 }
668 {typecast}              {
                                            /* record the token location, then hand TYPECAST to the parser */
669                                         SET_YYLLOC();
670                                         return TYPECAST;
671                                 }
673 {self}                  {
                                            /* the token code is the first matched character itself */
674                                         SET_YYLLOC();
675                                         return yytext[0];
676                                 }
678 {operator}              {
                                            /*
                                             * Operator token.  The match is first shortened (nchars) so
                                             * that it stops before an embedded comment start and does not
                                             * end in a disallowed '+'/'-'; any excess is pushed back with
                                             * yyless().  The result is returned either as a single
                                             * character token or as an Op carrying a palloc'd copy of the
                                             * operator text.
                                             */
679                                         /*
680                                          * Check for embedded slash-star or dash-dash; those
681                                          * are comment starts, so operator must stop there.
682                                          * Note that slash-star or dash-dash at the first
683                                          * character will match a prior rule, not this one.
684                                          */
685                                         int             nchars = yyleng;
686                                         char   *slashstar = strstr(yytext, "/*");
687                                         char   *dashdash = strstr(yytext, "--");
                                            /* from here on, slashstar holds the earliest comment start, if any */
689                                         if (slashstar && dashdash)
690                                         {
691                                                 /* if both appear, take the first one */
692                                                 if (slashstar > dashdash)
693                                                         slashstar = dashdash;
694                                         }
695                                         else if (!slashstar)
696                                                 slashstar = dashdash;
697                                         if (slashstar)
698                                                 nchars = slashstar - yytext;
700                                         /*
701                                          * For SQL compatibility, '+' and '-' cannot be the
702                                          * last char of a multi-char operator unless the operator
703                                          * contains chars that are not in SQL operators.
704                                          * The idea is to lex '=-' as two operators, but not
705                                          * to forbid operator names like '?-' that could not be
706                                          * sequences of SQL operators.
707                                          */
708                                         while (nchars > 1 &&
709                                                    (yytext[nchars-1] == '+' ||
710                                                         yytext[nchars-1] == '-'))
711                                         {
712                                                 int             ic;
                                                    /* scan the preceding characters for one of the non-SQL operator chars */
714                                                 for (ic = nchars-2; ic >= 0; ic--)
715                                                 {
716                                                         if (strchr("~!@#^&|`?%", yytext[ic]))
717                                                                 break;
718                                                 }
719                                                 if (ic >= 0)
720                                                         break; /* found a char that makes it OK */
721                                                 nchars--; /* else remove the +/-, and check again */
722                                         }
724                                         SET_YYLLOC();
726                                         if (nchars < yyleng)
727                                         {
728                                                 /* Strip the unwanted chars from the token */
729                                                 yyless(nchars);
730                                                 /*
731                                                  * If what we have left is only one char, and it's
732                                                  * one of the characters matching "self", then
733                                                  * return it as a character token the same way
734                                                  * that the "self" rule would have.
735                                                  */
736                                                 if (nchars == 1 &&
737                                                         strchr(",()[].;:+-*/%^<>=", yytext[0]))
738                                                         return yytext[0];
739                                         }
741                                         /*
742                                          * Complain if operator is too long.  Unlike the case
743                                          * for identifiers, we make this an error not a notice-
744                                          * and-truncate, because the odds are we are looking at
745                                          * a syntactic mistake anyway.
746                                          */
747                                         if (nchars >= NAMEDATALEN)
748                                                 yyerror("operator too long");
750                                         /* Convert "!=" operator to "<>" for compatibility */
751                                         if (strcmp(yytext, "!=") == 0)
752                                                 yylval.str = pstrdup("<>");
753                                         else
754                                                 yylval.str = pstrdup(yytext);
755                                         return Op;
756                                 }
{param}			{
					/*
					 * Positional parameter reference: the digits following the
					 * first matched character are converted to the parameter
					 * number stored in yylval.ival.
					 *
					 * atol() offers no way to detect overflow, and its long
					 * result was silently narrowed to int.  Use strtol() and
					 * reject numbers that do not fit, in the same way the
					 * {integer} rule detects out-of-range values.
					 */
					long		paramno;

					/* set the location first so yyerror() points at this token */
					SET_YYLLOC();
					errno = 0;
					paramno = strtol(yytext + 1, NULL, 10);
					if (errno == ERANGE ||
						paramno != (long) ((int32) paramno))
						yyerror("parameter number too large");
					yylval.ival = (int32) paramno;
					return PARAM;
				}
764 {integer}               {
                                            /*
                                             * Integer literal: returned as ICONST when strtol() converts
                                             * the whole token to a value that fits in int32; otherwise the
                                             * token text is passed on as FCONST.
                                             */
765                                         long val;
766                                         char* endptr;
768                                         SET_YYLLOC();
                                            /* clear errno so an ERANGE set by strtol() can be recognized */
769                                         errno = 0;
770                                         val = strtol(yytext, &endptr, 10);
771                                         if (*endptr != '\0' || errno == ERANGE
772 #ifdef HAVE_LONG_INT_64
773                                                 /* if long > 32 bits, check for overflow of int4 */
774                                                 || val != (long) ((int32) val)
775 #endif
776                                                 )
777                                         {
778                                                 /* integer too large, treat it as a float */
779                                                 yylval.str = pstrdup(yytext);
780                                                 return FCONST;
781                                         }
782                                         yylval.ival = val;
783                                         return ICONST;
784                                 }
785 {decimal}               {
                                            /* returned as FCONST with a copy of the token text; no conversion is done here */
786                                         SET_YYLLOC();
787                                         yylval.str = pstrdup(yytext);
788                                         return FCONST;
789                                 }
790 {real}                  {
                                            /* returned as FCONST with a copy of the token text; no conversion is done here */
791                                         SET_YYLLOC();
792                                         yylval.str = pstrdup(yytext);
793                                         return FCONST;
794                                 }
795 {realfail1}             {
796                                         /*
797                                          * throw back the [Ee], and treat as {decimal}.  Note
798                                          * that it is possible the input is actually {integer},
799                                          * but since this case will almost certainly lead to a
800                                          * syntax error anyway, we don't bother to distinguish.
801                                          */
802                                         yyless(yyleng-1);
                                            /* yytext now excludes the pushed-back character, so the copy below is the shortened token */
803                                         SET_YYLLOC();
804                                         yylval.str = pstrdup(yytext);
805                                         return FCONST;
806                                 }
807 {realfail2}             {
808                                         /* throw back the [Ee][+-], and proceed as above */
809                                         yyless(yyleng-2);
                                            /* yytext now excludes the two pushed-back characters */
810                                         SET_YYLLOC();
811                                         yylval.str = pstrdup(yytext);
812                                         return FCONST;
813                                 }
816 {identifier}    {
                                            /*
                                             * Unquoted identifier: returned as the keyword's own token
                                             * code when ScanKeywordLookup() recognizes it, otherwise as
                                             * an IDENT holding the downcased (and possibly truncated) name.
                                             */
817                                         const ScanKeyword *keyword;
818                                         char               *ident;
820                                         SET_YYLLOC();
822                                         /* Is it a keyword? */
823                                         keyword = ScanKeywordLookup(yytext);
824                                         if (keyword != NULL)
825                                         {
                                                    /* pass along the name stored in the keyword entry, not yytext */
826                                                 yylval.keyword = keyword->name;
827                                                 return keyword->value;
828                                         }
830                                         /*
831                                          * No.  Convert the identifier to lower case, and truncate
832                                          * if necessary.
833                                          */
834                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
835                                         yylval.str = ident;
836                                         return IDENT;
837                                 }
839 {other}                 {
                                            /* input matched by the "other" pattern is returned as its first character's code */
840                                         SET_YYLLOC();
841                                         return yytext[0];
842                                 }
844 <<EOF>>                 {
                                            /* record the location of end-of-input before terminating the scan */
845                                         SET_YYLLOC();
846                                         yyterminate();
847                                 }
852  * scanner_errposition
853  *              Report a lexer or grammar error cursor position, if possible.
855  * This is expected to be used within an ereport() call.  The return value
856  * is a dummy (always 0, in fact).
858  * Note that this can only be used for messages emitted during raw parsing
859  * (essentially, scan.l and gram.y), since it requires scanbuf to still be
860  * valid.
861  */
863 scanner_errposition(int location)
865         int             pos;
867         Assert(scanbuf != NULL);        /* else called from wrong place */
868         if (location < 0)
869                 return 0;                               /* no-op if location is unknown */
871         /* Convert byte offset to character number */
872         pos = pg_mbstrlen_with_len(scanbuf, location) + 1;
873         /* And pass it to the ereport mechanism */
874         return errposition(pos);
878  * yyerror
879  *              Report a lexer or grammar error.
881  * The message's cursor position identifies the most recently lexed token.
882  * This is OK for syntax error messages from the Bison parser, because Bison
883  * parsers report error as soon as the first unparsable token is reached.
884  * Beware of using yyerror for other purposes, as the cursor position might
885  * be misleading!
886  */
887 void
888 yyerror(const char *message)
890         const char *loc = scanbuf + yylloc;
892         if (*loc == YY_END_OF_BUFFER_CHAR)
893         {
894                 ereport(ERROR,
895                                 (errcode(ERRCODE_SYNTAX_ERROR),
896                                  /* translator: %s is typically the translation of "syntax error" */
897                                  errmsg("%s at end of input", _(message)),
898                                  lexer_errposition()));
899         }
900         else
901         {
902                 ereport(ERROR,
903                                 (errcode(ERRCODE_SYNTAX_ERROR),
904                                  /* translator: first %s is typically the translation of "syntax error" */
905                                  errmsg("%s at or near \"%s\"", _(message), loc),
906                                  lexer_errposition()));
907         }
912  * Called before any actual parsing is done
913  */
914 void
915 scanner_init(const char *str)
917         Size    slen = strlen(str);
919         /*
920          * Might be left over after ereport()
921          */
922         if (YY_CURRENT_BUFFER)
923                 yy_delete_buffer(YY_CURRENT_BUFFER);
925         /*
926          * Make a scan buffer with special termination needed by flex.
927          */
928         scanbuf = palloc(slen + 2);
929         memcpy(scanbuf, str, slen);
930         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
931         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
933         /* initialize literal buffer to a reasonable but expansible size */
934         literalalloc = 1024;
935         literalbuf = (char *) palloc(literalalloc);
936         startlit();
938         BEGIN(INITIAL);
943  * Called after parsing is done to clean up after scanner_init()
944  */
945 void
946 scanner_finish(void)
948         yy_delete_buffer(scanbufhandle);
949         pfree(scanbuf);
950         scanbuf = NULL;
954 static void
955 addlit(char *ytext, int yleng)
957         /* enlarge buffer if needed */
958         if ((literallen+yleng) >= literalalloc)
959         {
960                 do {
961                         literalalloc *= 2;
962                 } while ((literallen+yleng) >= literalalloc);
963                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
964         }
965         /* append new data, add trailing null */
966         memcpy(literalbuf+literallen, ytext, yleng);
967         literallen += yleng;
968         literalbuf[literallen] = '\0';
972 static void
973 addlitchar(unsigned char ychar)
975         /* enlarge buffer if needed */
976         if ((literallen+1) >= literalalloc)
977         {
978                 literalalloc *= 2;
979                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
980         }
981         /* append new data, add trailing null */
982         literalbuf[literallen] = ychar;
983         literallen += 1;
984         literalbuf[literallen] = '\0';
989  * One might be tempted to write pstrdup(literalbuf) instead of this,
990  * but for long literals this is much faster because the length is
991  * already known.
992  */
993 static char *
994 litbufdup(void)
996         char *new;
998         new = palloc(literallen + 1);
999         memcpy(new, literalbuf, literallen+1);
1000         return new;
1003 static int
1004 hexval(unsigned char c)
1006         if (c >= '0' && c <= '9')
1007                 return c - '0';
1008         if (c >= 'a' && c <= 'f')
1009                 return c - 'a' + 0xA;
1010         if (c >= 'A' && c <= 'F')
1011                 return c - 'A' + 0xA;
1012         elog(ERROR, "invalid hexadecimal digit");
1013         return 0; /* not reached */
1016 static void
1017 check_unicode_value(pg_wchar c, char * loc)
1019         if (GetDatabaseEncoding() == PG_UTF8)
1020                 return;
1022         if (c > 0x7F)
1023         {
1024                 yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
1025                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1026         }
1029 static char *
1030 litbuf_udeescape(unsigned char escape)
1032         char *new;
1033         char *in, *out;
1035         if (isxdigit(escape)
1036                 || escape == '+'
1037                 || escape == '\''
1038                 || escape == '"'
1039                 || scanner_isspace(escape))
1040         {
1041                 yylloc += literallen + yyleng + 1;
1042                 yyerror("invalid Unicode escape character");
1043         }
1045         /*
1046          * This relies on the subtle assumption that a UTF-8 expansion
1047          * cannot be longer than its escaped representation.
1048          */
1049         new = palloc(literallen + 1);
1051         in = literalbuf;
1052         out = new;
1053         while (*in)
1054         {
1055                 if (in[0] == escape)
1056                 {
1057                         if (in[1] == escape)
1058                         {
1059                                 *out++ = escape;
1060                                 in += 2;
1061                         }
1062                         else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
1063                         {
1064                                 pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
1065                                 check_unicode_value(unicode, in);
1066                                 unicode_to_utf8(unicode, (unsigned char *) out);
1067                                 in += 5;
1068                                 out += pg_mblen(out);
1069                         }
1070                         else if (in[1] == '+'
1071                                          && isxdigit(in[2]) && isxdigit(in[3])
1072                                          && isxdigit(in[4]) && isxdigit(in[5])
1073                                          && isxdigit(in[6]) && isxdigit(in[7]))
1074                         {
1075                                 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
1076                                                                         + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
1077                                 check_unicode_value(unicode, in);
1078                                 unicode_to_utf8(unicode, (unsigned char *) out);
1079                                 in += 8;
1080                                 out += pg_mblen(out);
1081                         }
1082                         else
1083                         {
1084                                 yylloc += in - literalbuf + 3;   /* 3 for U&" */
1085                                 yyerror("invalid Unicode escape value");
1086                         }
1087                 }
1088                 else
1089                         *out++ = *in++;
1090         }
1092         *out = '\0';
1093         /*
1094          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1095          * codes; but it's probably not worth the trouble, since this isn't
1096          * likely to be a performance-critical path.
1097          */
1098         pg_verifymbstr(new, out - new, false);
1099         return new;
1102 static unsigned char
1103 unescape_single_char(unsigned char c)
1105         switch (c)
1106         {
1107                 case 'b':
1108                         return '\b';
1109                 case 'f':
1110                         return '\f';
1111                 case 'n':
1112                         return '\n';
1113                 case 'r':
1114                         return '\r';
1115                 case 't':
1116                         return '\t';
1117                 default:
1118                         /* check for backslash followed by non-7-bit-ASCII */
1119                         if (c == '\0' || IS_HIGHBIT_SET(c))
1120                                 saw_non_ascii = true;
1122                         return c;
1123         }
1126 static void
1127 check_string_escape_warning(unsigned char ychar)
1129         if (ychar == '\'')
1130         {
1131                 if (warn_on_first_escape && escape_string_warning)
1132                         ereport(WARNING,
1133                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1134                                          errmsg("nonstandard use of \\' in a string literal"),
1135                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1136                                          lexer_errposition()));
1137                 warn_on_first_escape = false;   /* warn only once per string */
1138         }
1139         else if (ychar == '\\')
1140         {
1141                 if (warn_on_first_escape && escape_string_warning)
1142                         ereport(WARNING,
1143                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1144                                          errmsg("nonstandard use of \\\\ in a string literal"),
1145                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1146                                          lexer_errposition()));
1147                 warn_on_first_escape = false;   /* warn only once per string */
1148         }
1149         else
1150                 check_escape_warning();
1153 static void
1154 check_escape_warning(void)
1156         if (warn_on_first_escape && escape_string_warning)
1157                 ereport(WARNING,
1158                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1159                                  errmsg("nonstandard use of escape in a string literal"),
1160                                  errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1161                                  lexer_errposition()));
1162         warn_on_first_escape = false;   /* warn only once per string */