src/backend/parser/scan.l

   1 %{
   2 /*-------------------------------------------------------------------------
   3  *
   4  * scan.l
   5  *        lexical scanner for PostgreSQL
   6  *
   7  * NOTE NOTE NOTE:
   8  *
   9  * The rules in this file must be kept in sync with psql's lexer!!!
  10  *
  11  * The rules are designed so that the scanner never has to backtrack,
  12  * in the sense that there is always a rule that can match the input
  13  * consumed so far (the rule action may internally throw back some input
  14  * with yyless(), however).  As explained in the flex manual, this makes
  15  * for a useful speed increase --- about a third faster than a plain -CF
  16  * lexer, in simple testing.  The extra complexity is mostly in the rules
  17  * for handling float numbers and continued string literals.  If you change
  18  * the lexical rules, verify that you haven't broken the no-backtrack
  19  * property by running flex with the "-b" option and checking that the
  20  * resulting "lex.backup" file says that no backing up is needed.
  21  *
  22  *
  23  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  24  * Portions Copyright (c) 1994, Regents of the University of California
  25  *
  26  * IDENTIFICATION
  27  *        $PostgreSQL$
  28  *
  29  *-------------------------------------------------------------------------
  30  */
  31 #include "postgres.h"
  32
  33 #include <ctype.h>
  34 #include <unistd.h>
  35
  36 #include "parser/gramparse.h"
  37 #include "parser/keywords.h"
  38 /* Not needed now that this file is compiled as part of gram.y */
  39 /* #include "parser/gram.h" */
  40 #include "parser/scansup.h"
  41 #include "mb/pg_wchar.h"
  42
  43
  44 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
  45 #undef fprintf
  46 #define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))
  47
  48 static int              xcdepth = 0;    /* depth of nesting in slash-star comments */
  49 static char    *dolqstart;      /* current $foo$ quote start string */
  50
  51 /*
  52  * GUC variables.  This is a DIRECT violation of the warning given at the
  53  * head of gram.y, ie flex/bison code must not depend on any GUC variables;
  54  * as such, changing their values can induce very unintuitive behavior.
  55  * But we shall have to live with it as a short-term thing until the switch
  56  * to SQL-standard string syntax is complete.
  57  */
  58 int                             backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
  59 bool                    escape_string_warning = true;
  60 bool                    standard_conforming_strings = false;
  61
   62 static bool             warn_on_first_escape;   /* set true for plain '...' literals, false for E'...' */
   63 static bool             saw_non_ascii = false;  /* an escape yielded \0 or a high-bit byte; literal is re-verified at its end */
  64
  65 /*
  66  * literalbuf is used to accumulate literal values when multiple rules
  67  * are needed to parse a single literal.  Call startlit to reset buffer
  68  * to empty, addlit to add text.  Note that the buffer is palloc'd and
  69  * starts life afresh on every parse cycle.
  70  */
  71 static char        *literalbuf;         /* expandable buffer */
  72 static int              literallen;             /* actual current length */
  73 static int              literalalloc;   /* current allocated buffer size */
  74
  75 #define startlit()  (literalbuf[0] = '\0', literallen = 0)
  76 static void addlit(char *ytext, int yleng);
  77 static void addlitchar(unsigned char ychar);
  78 static char *litbufdup(void);
  79 static char *litbuf_udeescape(unsigned char escape);
  80
  81 #define lexer_errposition()  scanner_errposition(yylloc)
  82
  83 static void check_escape_warning(void);
  84 static void check_string_escape_warning(unsigned char ychar);
  85
  86 /*
  87  * Each call to yylex must set yylloc to the location of the found token
  88  * (expressed as a byte offset from the start of the input text).
  89  * When we parse a token that requires multiple lexer rules to process,
  90  * this should be done in the first such rule, else yylloc will point
  91  * into the middle of the token.
  92  */
  93 #define SET_YYLLOC()  (yylloc = yytext - scanbuf)
  94
  95 /* Handles to the buffer that the lexer uses internally */
  96 static YY_BUFFER_STATE scanbufhandle;
  97 static char *scanbuf;
  98
  99 static unsigned char unescape_single_char(unsigned char c);
 100
 101 %}
 102
 103 %option 8bit
 104 %option never-interactive
 105 %option nodefault
 106 %option noinput
 107 %option nounput
 108 %option noyywrap
 109 %option prefix="base_yy"
 110
 111 /*
 112  * OK, here is a short description of lex/flex rules behavior.
 113  * The longest pattern which matches an input string is always chosen.
 114  * For equal-length patterns, the first occurring in the rules list is chosen.
 115  * INITIAL is the starting state, to which all non-conditional rules apply.
 116  * Exclusive states change parsing rules while the state is active.  When in
 117  * an exclusive state, only those rules defined for that state apply.
 118  *
 119  * We use exclusive states for quoted strings, extended comments,
 120  * and to eliminate parsing troubles for numeric strings.
 121  * Exclusive states:
 122  *  <xb> bit string literal
 123  *  <xc> extended C-style comments
 124  *  <xd> delimited identifiers (double-quoted identifiers)
 125  *  <xh> hexadecimal numeric string
 126  *  <xq> standard quoted strings
 127  *  <xe> extended quoted strings (support backslash escape sequences)
 128  *  <xdolq> $foo$ quoted strings
 129  *  <xui> quoted identifier with Unicode escapes
 130  *  <xus> quoted string with Unicode escapes
 131  */
 132
 133 %x xb
 134 %x xc
 135 %x xd
 136 %x xh
 137 %x xe
 138 %x xq
 139 %x xdolq
 140 %x xui
 141 %x xus
 142
 143 /*
 144  * In order to make the world safe for Windows and Mac clients as well as
 145  * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 146  * sequence will be seen as two successive newlines, but that doesn't cause
 147  * any problems.  Comments that start with -- and extend to the next
 148  * newline are treated as equivalent to a single whitespace character.
 149  *
 150  * NOTE a fine point: if there is no newline following --, we will absorb
 151  * everything to the end of the input as a comment.  This is correct.  Older
 152  * versions of Postgres failed to recognize -- as a comment if the input
 153  * did not end with a newline.
 154  *
 155  * XXX perhaps \f (formfeed) should be treated as a newline as well?
 156  *
 157  * XXX if you change the set of whitespace characters, fix scanner_isspace()
 158  * to agree, and see also the plpgsql lexer.
 159  */
 160
 161 space                   [ \t\n\r\f]
 162 horiz_space             [ \t\f]
 163 newline                 [\n\r]
 164 non_newline             [^\n\r]
 165
 166 comment                 ("--"{non_newline}*)
 167
 168 whitespace              ({space}+|{comment})
 169
 170 /*
 171  * SQL requires at least one newline in the whitespace separating
 172  * string literals that are to be concatenated.  Silly, but who are we
 173  * to argue?  Note that {whitespace_with_newline} should not have * after
 174  * it, whereas {whitespace} should generally have a * after it...
 175  */
 176
 177 special_whitespace              ({space}+|{comment}{newline})
 178 horiz_whitespace                ({horiz_space}|{comment})
 179 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
 180
 181 /*
 182  * To ensure that {quotecontinue} can be scanned without having to back up
 183  * if the full pattern isn't matched, we include trailing whitespace in
 184  * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 185  * except for {quote} followed by whitespace and just one "-" (not two,
 186  * which would start a {comment}).  To cover that we have {quotefail}.
 187  * The actions for {quotestop} and {quotefail} must throw back characters
 188  * beyond the quote proper.
 189  */
 190 quote                   '
 191 quotestop               {quote}{whitespace}*
 192 quotecontinue   {quote}{whitespace_with_newline}{quote}
 193 quotefail               {quote}{whitespace}*"-"
 194
 195 /* Bit string
 196  * It is tempting to scan the string for only those characters
 197  * which are allowed. However, this leads to silently swallowed
 198  * characters if illegal characters are included in the string.
 199  * For example, if xbinside is [01] then B'ABCD' is interpreted
 200  * as a zero-length string, and the ABCD' is lost!
 201  * Better to pass the string forward and let the input routines
 202  * validate the contents.
 203  */
 204 xbstart                 [bB]{quote}
 205 xbinside                [^']*
 206
 207 /* Hexadecimal number */
 208 xhstart                 [xX]{quote}
 209 xhinside                [^']*
 210
 211 /* National character */
 212 xnstart                 [nN]{quote}
 213
 214 /* Quoted string that allows backslash escapes */
 215 xestart                 [eE]{quote}
 216 xeinside                [^\\']+
 217 xeescape                [\\][^0-7]
 218 xeoctesc                [\\][0-7]{1,3}
 219 xehexesc                [\\]x[0-9A-Fa-f]{1,2}
 220
  221 /* Standard quoted string
  222  * xqdouble implements an embedded quote, written as two adjacent quotes ('')
 223  */
 224 xqstart                 {quote}
 225 xqdouble                {quote}{quote}
 226 xqinside                [^']+
 227
 228 /* $foo$ style quotes ("dollar quoting")
 229  * The quoted string starts with $foo$ where "foo" is an optional string
 230  * in the form of an identifier, except that it may not contain "$",
 231  * and extends to the first occurrence of an identical string.
 232  * There is *no* processing of the quoted text.
 233  *
 234  * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 235  * fails to match its trailing "$".
 236  */
 237 dolq_start              [A-Za-z\200-\377_]
 238 dolq_cont               [A-Za-z\200-\377_0-9]
 239 dolqdelim               \$({dolq_start}{dolq_cont}*)?\$
 240 dolqfailed              \${dolq_start}{dolq_cont}*
 241 dolqinside              [^$]+
 242
 243 /* Double quote
 244  * Allows embedded spaces and other special characters into identifiers.
 245  */
 246 dquote                  \"
 247 xdstart                 {dquote}
 248 xdstop                  {dquote}
 249 xddouble                {dquote}{dquote}
 250 xdinside                [^"]+
 251
 252 /* Unicode escapes */
 253 uescape                 [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
 254 /* error rule to avoid backup */
 255 uescapefail             ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
 256
 257 /* Quoted identifier with Unicode escapes */
 258 xuistart                [uU]&{dquote}
 259 xuistop1                {dquote}{whitespace}*{uescapefail}?
 260 xuistop2                {dquote}{whitespace}*{uescape}
 261
 262 /* Quoted string with Unicode escapes */
 263 xusstart                [uU]&{quote}
 264 xusstop1                {quote}{whitespace}*{uescapefail}?
 265 xusstop2                {quote}{whitespace}*{uescape}
 266
 267 /* error rule to avoid backup */
 268 xufailed                [uU]&
 269
 270
 271 /* C-style comments
 272  *
 273  * The "extended comment" syntax closely resembles allowable operator syntax.
 274  * The tricky part here is to get lex to recognize a string starting with
 275  * slash-star as a comment, when interpreting it as an operator would produce
 276  * a longer match --- remember lex will prefer a longer match!  Also, if we
 277  * have something like plus-slash-star, lex will think this is a 3-character
 278  * operator whereas we want to see it as a + operator and a comment start.
 279  * The solution is two-fold:
 280  * 1. append {op_chars}* to xcstart so that it matches as much text as
 281  *    {operator} would. Then the tie-breaker (first matching rule of same
 282  *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 283  *    in case it contains a star-slash that should terminate the comment.
 284  * 2. In the operator rule, check for slash-star within the operator, and
 285  *    if found throw it back with yyless().  This handles the plus-slash-star
 286  *    problem.
 287  * Dash-dash comments have similar interactions with the operator rule.
 288  */
 289 xcstart                 \/\*{op_chars}*
 290 xcstop                  \*+\/
 291 xcinside                [^*/]+
 292
 293 digit                   [0-9]
 294 ident_start             [A-Za-z\200-\377_]
 295 ident_cont              [A-Za-z\200-\377_0-9\$]
 296
 297 identifier              {ident_start}{ident_cont}*
 298
 299 typecast                "::"
 300
 301 /*
 302  * "self" is the set of chars that should be returned as single-character
 303  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 304  * which can be one or more characters long (but if a single-char token
 305  * appears in the "self" set, it is not to be returned as an Op).  Note
 306  * that the sets overlap, but each has some chars that are not in the other.
 307  *
 308  * If you change either set, adjust the character lists appearing in the
 309  * rule for "operator"!
 310  */
 311 self                    [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
 312 op_chars                [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
 313 operator                {op_chars}+
 314
  315 /* We no longer allow unary minus in numbers.
  316  * Instead we pass it separately to the parser, where it gets
  317  * coerced via doNegate() -- Leon aug 20 1999
 318  *
 319  * {realfail1} and {realfail2} are added to prevent the need for scanner
 320  * backup when the {real} rule fails to match completely.
 321  */
 322
 323 integer                 {digit}+
 324 decimal                 (({digit}*\.{digit}+)|({digit}+\.{digit}*))
 325 real                    ({integer}|{decimal})[Ee][-+]?{digit}+
 326 realfail1               ({integer}|{decimal})[Ee]
 327 realfail2               ({integer}|{decimal})[Ee][-+]
 328
 329 param                   \${integer}
 330
 331 other                   .
 332
 333 /*
 334  * Dollar quoted strings are totally opaque, and no escaping is done on them.
 335  * Other quoted strings must allow some special characters such as single-quote
 336  *  and newline.
 337  * Embedded single-quotes are implemented both in the SQL standard
 338  *  style of two adjacent single quotes "''" and in the Postgres/Java style
 339  *  of escaped-quote "\'".
 340  * Other embedded escaped characters are matched explicitly and the leading
 341  *  backslash is dropped from the string.
 342  * Note that xcstart must appear before operator, as explained above!
 343  *  Also whitespace (comment) must appear before operator.
 344  */
 345
 346 %%
 347
  348 {whitespace}    {
  349                                         /* ignore: whitespace and "--" comments produce no token */
  350                                 }
 351
  352 {xcstart}               {
  353                                         /* Set location in case of syntax error in comment */
  354                                         SET_YYLLOC();
  355                                         xcdepth = 0;    /* this is the outermost comment */
  356                                         BEGIN(xc);
  357                                         /* Put back any characters past slash-star; see above */
  358                                         yyless(2);
  359                                 }
 360
  361 <xc>{xcstart}   {
  362                                         xcdepth++;      /* a nested comment opens inside a comment */
  363                                         /* Put back any characters past slash-star; see above */
  364                                         yyless(2);
  365                                 }
 366
 367 <xc>{xcstop}    {
 368                                         if (xcdepth <= 0)
 369                                                 BEGIN(INITIAL);
 370                                         else
 371                                                 xcdepth--;
 372                                 }
 373
  374 <xc>{xcinside}  {
  375                                         /* ignore: comment text containing no '*' or '/' */
  376                                 }
  377
  378 <xc>{op_chars}  {
  379                                         /* ignore: one operator character that is not part of a comment delimiter */
  380                                 }
  381
  382 <xc>\*+                 {
  383                                         /* ignore: a run of '*' not followed by '/' */
  384                                 }
  385
  386 <xc><<EOF>>             { yyerror("unterminated /* comment"); }
 387
 388 {xbstart}               {
 389                                         /* Binary bit type.
 390                                          * At some point we should simply pass the string
 391                                          * forward to the parser and label it there.
 392                                          * In the meantime, place a leading "b" on the string
 393                                          * to mark it for the input routine as a binary string.
 394                                          */
 395                                         SET_YYLLOC();
 396                                         BEGIN(xb);
 397                                         startlit();
 398                                         addlitchar('b');
 399                                 }
  400 <xb>{quotestop} |
  401 <xb>{quotefail} {
  402                                         yyless(1);      /* keep only the quote; rescan what follows it */
  403                                         BEGIN(INITIAL);
  404                                         yylval.str = litbufdup();
  405                                         return BCONST;
  406                                 }
  407 <xh>{xhinside}  |
  408 <xb>{xbinside}  {
  409                                         addlit(yytext, yyleng); /* contents are not validated here */
  410                                 }
  411 <xh>{quotecontinue}     |
  412 <xb>{quotecontinue}     {
  413                                         /* ignore: the literal continues after whitespace containing a newline */
  414                                 }
 415 <xb><<EOF>>             { yyerror("unterminated bit string literal"); }
 416
 417 {xhstart}               {
 418                                         /* Hexadecimal bit type.
 419                                          * At some point we should simply pass the string
 420                                          * forward to the parser and label it there.
 421                                          * In the meantime, place a leading "x" on the string
 422                                          * to mark it for the input routine as a hex string.
 423                                          */
 424                                         SET_YYLLOC();
 425                                         BEGIN(xh);
 426                                         startlit();
 427                                         addlitchar('x');
 428                                 }
  429 <xh>{quotestop} |
  430 <xh>{quotefail} {
  431                                         yyless(1);      /* keep only the quote; rescan what follows it */
  432                                         BEGIN(INITIAL);
  433                                         yylval.str = litbufdup();
  434                                         return XCONST;
  435                                 }
 436 <xh><<EOF>>             { yyerror("unterminated hexadecimal string literal"); }
 437
 438 {xnstart}               {
 439                                         /* National character.
 440                                          * We will pass this along as a normal character string,
 441                                          * but preceded with an internally-generated "NCHAR".
 442                                          */
 443                                         const ScanKeyword *keyword;
 444
 445                                         SET_YYLLOC();
 446                                         yyless(1);                              /* eat only 'n' this time */
 447                                         /* nchar had better be a keyword! */
 448                                         keyword = ScanKeywordLookup("nchar");
 449                                         Assert(keyword != NULL);
 450                                         yylval.keyword = keyword->name;
 451                                         return keyword->value;
 452                                 }
 453
  454 {xqstart}               {
  455                                         warn_on_first_escape = true;    /* plain '...' literal, not E'...' */
  456                                         saw_non_ascii = false;
  457                                         SET_YYLLOC();
  458                                         if (standard_conforming_strings)
  459                                                 BEGIN(xq);      /* backslash is an ordinary character */
  460                                         else
  461                                                 BEGIN(xe);      /* backslash escapes are processed */
  462                                         startlit();
  463                                 }
  464 {xestart}               {
  465                                         warn_on_first_escape = false;   /* explicit E'...' syntax */
  466                                         saw_non_ascii = false;
  467                                         SET_YYLLOC();
  468                                         BEGIN(xe);
  469                                         startlit();
  470                                 }
  471 {xusstart}              {
  472                                         SET_YYLLOC();
  473                                         if (!standard_conforming_strings)
  474                                                 ereport(ERROR,
  475                                                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
  476                                                                  errmsg("unsafe use of string constant with Unicode escapes"),
  477                                                                  errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
  478                                                                  lexer_errposition()));
  479                                         BEGIN(xus);     /* U&'...' literal; escapes are decoded when the literal ends */
  480                                         startlit();
  481                                 }
  482 <xq,xe>{quotestop}      |
  483 <xq,xe>{quotefail} {
  484                                         yyless(1);      /* keep only the quote; rescan what follows it */
  485                                         BEGIN(INITIAL);
  486                                         /*
  487                                          * check that the data remains valid if it might have been
  488                                          * made invalid by unescaping any chars.
  489                                          */
  490                                         if (saw_non_ascii)
  491                                                 pg_verifymbstr(literalbuf, literallen, false);
  492                                         yylval.str = litbufdup();
  493                                         return SCONST;
  494                                 }
  495 <xus>{xusstop1} {
  496                                         /* throw back all but the quote */
  497                                         yyless(1);
  498                                         BEGIN(INITIAL);
  499                                         yylval.str = litbuf_udeescape('\\');    /* no complete UESCAPE clause: backslash is the escape */
  500                                         return SCONST;
  501                                 }
  502 <xus>{xusstop2} {
  503                                         BEGIN(INITIAL);
  504                                         yylval.str = litbuf_udeescape(yytext[yyleng-2]);        /* match ends in 'c'; c is the escape char */
  505                                         return SCONST;
  506                                 }
  507 <xq,xe,xus>{xqdouble} {
  508                                         addlitchar('\'');       /* '' stands for one quote */
  509                                 }
  510 <xq,xus>{xqinside}  {
  511                                         addlit(yytext, yyleng); /* copied verbatim; xus escapes are decoded when the literal ends */
  512                                 }
  513 <xe>{xeinside}  {
  514                                         addlit(yytext, yyleng); /* text free of backslashes and quotes */
  515                                 }
  516 <xe>{xeescape}  {
  517                                         if (yytext[1] == '\'')  /* \' : rejected or accepted depending on backslash_quote */
  518                                         {
  519                                                 if (backslash_quote == BACKSLASH_QUOTE_OFF ||
  520                                                         (backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
  521                                                          PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
  522                                                         ereport(ERROR,
  523                                                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
  524                                                                          errmsg("unsafe use of \\' in a string literal"),
  525                                                                          errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
  526                                                                          lexer_errposition()));
  527                                         }
  528                                         check_string_escape_warning(yytext[1]);
  529                                         addlitchar(unescape_single_char(yytext[1]));    /* yytext[1] is the char after the backslash */
  530                                 }
  531 <xe>{xeoctesc}  {
  532                                         unsigned char c = strtoul(yytext+1, NULL, 8);   /* skip the backslash */
  533
  534                                         check_escape_warning();
  535                                         addlitchar(c);  /* NOTE(review): \400..\777 exceed 8 bits and were truncated by the unsigned char conversion */
  536                                         if (c == '\0' || IS_HIGHBIT_SET(c))
  537                                                 saw_non_ascii = true;   /* literal gets re-verified when it ends */
  538                                 }
  539 <xe>{xehexesc}  {
  540                                         unsigned char c = strtoul(yytext+2, NULL, 16);  /* skip the backslash and the 'x' */
  541
  542                                         check_escape_warning();
  543                                         addlitchar(c);
  544                                         if (c == '\0' || IS_HIGHBIT_SET(c))
  545                                                 saw_non_ascii = true;   /* literal gets re-verified when it ends */
  546                                 }
  547 <xq,xe,xus>{quotecontinue} {
  548                                         /* ignore: the literal continues after whitespace containing a newline */
  549                                 }
  550 <xe>.                   {
  551                                         /* This is only needed for \ just before EOF */
  552                                         addlitchar(yytext[0]);
  553                                 }
  554 <xq,xe,xus><<EOF>>              { yyerror("unterminated quoted string"); }
 555
  556 {dolqdelim}             {
  557                                         SET_YYLLOC();
  558                                         dolqstart = pstrdup(yytext);    /* remember the opening delimiter to match the closing one against */
  559                                         BEGIN(xdolq);
  560                                         startlit();
  561                                 }
 562 {dolqfailed}    {
 563                                         SET_YYLLOC();
 564                                         /* throw back all but the initial "$" */
 565                                         yyless(1);
 566                                         /* and treat it as {other} */
 567                                         return yytext[0];
 568                                 }
  569 <xdolq>{dolqdelim} {
  570                                         if (strcmp(yytext, dolqstart) == 0)     /* same delimiter that opened the string */
  571                                         {
  572                                                 pfree(dolqstart);
  573                                                 BEGIN(INITIAL);
  574                                                 yylval.str = litbufdup();
  575                                                 return SCONST;
  576                                         }
  577                                         else
  578                                         {
  579                                                 /*
  580                                                  * When we fail to match $...$ to dolqstart, transfer
  581                                                  * the $... part to the output, but put back the final
  582                                                  * $ for rescanning.  Consider $delim$...$junk$delim$
  583                                                  */
  584                                                 addlit(yytext, yyleng-1);
  585                                                 yyless(yyleng-1);
  586                                         }
  587                                 }
  588 <xdolq>{dolqinside} {
  589                                         addlit(yytext, yyleng); /* text without any "$" */
  590                                 }
  591 <xdolq>{dolqfailed} {
  592                                         addlit(yytext, yyleng); /* "$tag" with no closing "$" is ordinary text */
  593                                 }
  594 <xdolq>.                {
  595                                         /* This is only needed for $ inside the quoted text */
  596                                         addlitchar(yytext[0]);
  597                                 }
  598 <xdolq><<EOF>>  { yyerror("unterminated dollar-quoted string"); }
 599
  600 {xdstart}               {
  601                                         SET_YYLLOC();
  602                                         BEGIN(xd);      /* "..." delimited identifier */
  603                                         startlit();
  604                                 }
  605 {xuistart}              {
  606                                         SET_YYLLOC();
  607                                         BEGIN(xui);     /* U&"..." identifier; escapes are decoded when it ends */
  608                                         startlit();
  609                                 }
 610 <xd>{xdstop}    {
 611                                         char               *ident;
 612
 613                                         BEGIN(INITIAL);
 614                                         if (literallen == 0)
 615                                                 yyerror("zero-length delimited identifier");
 616                                         ident = litbufdup();
 617                                         if (literallen >= NAMEDATALEN)
 618                                                 truncate_identifier(ident, literallen, true);
 619                                         yylval.str = ident;
 620                                         return IDENT;
 621                                 }
 622 <xui>{xuistop1} {
 623                                         char               *ident;
 624
 625                                         BEGIN(INITIAL);
 626                                         if (literallen == 0)
 627                                                 yyerror("zero-length delimited identifier");
 628                                         ident = litbuf_udeescape('\\');
 629                                         if (literallen >= NAMEDATALEN)
 630                                                 truncate_identifier(ident, literallen, true);
 631                                         yylval.str = ident;
 632                                         /* throw back all but the quote */
 633                                         yyless(1);
 634                                         return IDENT;
 635                                 }
 636 <xui>{xuistop2} {
 637                                         char               *ident;
 638
 639                                         BEGIN(INITIAL);
 640                                         if (literallen == 0)
 641                                                 yyerror("zero-length delimited identifier");
 642                                         ident = litbuf_udeescape(yytext[yyleng - 2]);
 643                                         if (literallen >= NAMEDATALEN)
 644                                                 truncate_identifier(ident, literallen, true);
 645                                         yylval.str = ident;
 646                                         return IDENT;
 647                                 }
 648 <xd,xui>{xddouble}      {
 649                                         addlitchar('"');
 650                                 }
 651 <xd,xui>{xdinside}      {
 652                                         addlit(yytext, yyleng);
 653                                 }
 654 <xd,xui><<EOF>>         { yyerror("unterminated quoted identifier"); }
 655
 656 {xufailed}      {
 657                                         char               *ident;
 658
 659                                         SET_YYLLOC();
 660                                         /* throw back all but the initial u/U */
 661                                         yyless(1);
 662                                         /* and treat it as {identifier} */
 663                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
 664                                         yylval.str = ident;
 665                                         return IDENT;
 666                                 }
 667
 668 {typecast}              {
 669                                         SET_YYLLOC();
 670                                         return TYPECAST;
 671                                 }
 672
 673 {self}                  {
 674                                         SET_YYLLOC();
 675                                         return yytext[0];
 676                                 }
 677
 678 {operator}              {
 679                                         /*
 680                                          * Check for embedded slash-star or dash-dash; those
 681                                          * are comment starts, so operator must stop there.
 682                                          * Note that slash-star or dash-dash at the first
 683                                          * character will match a prior rule, not this one.
 684                                          */
 685                                         int             nchars = yyleng;
 686                                         char   *slashstar = strstr(yytext, "/*");
 687                                         char   *dashdash = strstr(yytext, "--");
 688
 689                                         if (slashstar && dashdash)
 690                                         {
 691                                                 /* if both appear, take the first one */
 692                                                 if (slashstar > dashdash)
 693                                                         slashstar = dashdash;
 694                                         }
 695                                         else if (!slashstar)
 696                                                 slashstar = dashdash;
 697                                         if (slashstar)
 698                                                 nchars = slashstar - yytext;
 699
 700                                         /*
 701                                          * For SQL compatibility, '+' and '-' cannot be the
 702                                          * last char of a multi-char operator unless the operator
 703                                          * contains chars that are not in SQL operators.
 704                                          * The idea is to lex '=-' as two operators, but not
 705                                          * to forbid operator names like '?-' that could not be
 706                                          * sequences of SQL operators.
 707                                          */
 708                                         while (nchars > 1 &&
 709                                                    (yytext[nchars-1] == '+' ||
 710                                                         yytext[nchars-1] == '-'))
 711                                         {
 712                                                 int             ic;
 713
 714                                                 for (ic = nchars-2; ic >= 0; ic--)
 715                                                 {
 716                                                         if (strchr("~!@#^&|`?%", yytext[ic]))
 717                                                                 break;
 718                                                 }
 719                                                 if (ic >= 0)
 720                                                         break; /* found a char that makes it OK */
 721                                                 nchars--; /* else remove the +/-, and check again */
 722                                         }
 723
 724                                         SET_YYLLOC();
 725
 726                                         if (nchars < yyleng)
 727                                         {
 728                                                 /* Strip the unwanted chars from the token */
 729                                                 yyless(nchars);
 730                                                 /*
 731                                                  * If what we have left is only one char, and it's
 732                                                  * one of the characters matching "self", then
 733                                                  * return it as a character token the same way
 734                                                  * that the "self" rule would have.
 735                                                  */
 736                                                 if (nchars == 1 &&
 737                                                         strchr(",()[].;:+-*/%^<>=", yytext[0]))
 738                                                         return yytext[0];
 739                                         }
 740
 741                                         /*
 742                                          * Complain if operator is too long.  Unlike the case
 743                                          * for identifiers, we make this an error not a notice-
 744                                          * and-truncate, because the odds are we are looking at
 745                                          * a syntactic mistake anyway.
 746                                          */
 747                                         if (nchars >= NAMEDATALEN)
 748                                                 yyerror("operator too long");
 749
 750                                         /* Convert "!=" operator to "<>" for compatibility */
 751                                         if (strcmp(yytext, "!=") == 0)
 752                                                 yylval.str = pstrdup("<>");
 753                                         else
 754                                                 yylval.str = pstrdup(yytext);
 755                                         return Op;
 756                                 }
 757
 758 {param}                 {
 759                                         SET_YYLLOC();
 760                                         yylval.ival = atol(yytext + 1);
 761                                         return PARAM;
 762                                 }
 763
 764 {integer}               {
 765                                         long val;
 766                                         char* endptr;
 767
 768                                         SET_YYLLOC();
 769                                         errno = 0;
 770                                         val = strtol(yytext, &endptr, 10);
 771                                         if (*endptr != '\0' || errno == ERANGE
 772 #ifdef HAVE_LONG_INT_64
 773                                                 /* if long > 32 bits, check for overflow of int4 */
 774                                                 || val != (long) ((int32) val)
 775 #endif
 776                                                 )
 777                                         {
 778                                                 /* integer too large, treat it as a float */
 779                                                 yylval.str = pstrdup(yytext);
 780                                                 return FCONST;
 781                                         }
 782                                         yylval.ival = val;
 783                                         return ICONST;
 784                                 }
 785 {decimal}               {
 786                                         SET_YYLLOC();
 787                                         yylval.str = pstrdup(yytext);
 788                                         return FCONST;
 789                                 }
 790 {real}                  {
 791                                         SET_YYLLOC();
 792                                         yylval.str = pstrdup(yytext);
 793                                         return FCONST;
 794                                 }
 795 {realfail1}             {
 796                                         /*
 797                                          * throw back the [Ee], and treat as {decimal}.  Note
 798                                          * that it is possible the input is actually {integer},
 799                                          * but since this case will almost certainly lead to a
 800                                          * syntax error anyway, we don't bother to distinguish.
 801                                          */
 802                                         yyless(yyleng-1);
 803                                         SET_YYLLOC();
 804                                         yylval.str = pstrdup(yytext);
 805                                         return FCONST;
 806                                 }
 807 {realfail2}             {
 808                                         /* throw back the [Ee][+-], and proceed as above */
 809                                         yyless(yyleng-2);
 810                                         SET_YYLLOC();
 811                                         yylval.str = pstrdup(yytext);
 812                                         return FCONST;
 813                                 }
 814
 815
 816 {identifier}    {
 817                                         const ScanKeyword *keyword;
 818                                         char               *ident;
 819
 820                                         SET_YYLLOC();
 821
 822                                         /* Is it a keyword? */
 823                                         keyword = ScanKeywordLookup(yytext);
 824                                         if (keyword != NULL)
 825                                         {
 826                                                 yylval.keyword = keyword->name;
 827                                                 return keyword->value;
 828                                         }
 829
 830                                         /*
 831                                          * No.  Convert the identifier to lower case, and truncate
 832                                          * if necessary.
 833                                          */
 834                                         ident = downcase_truncate_identifier(yytext, yyleng, true);
 835                                         yylval.str = ident;
 836                                         return IDENT;
 837                                 }
 838
 839 {other}                 {
 840                                         SET_YYLLOC();
 841                                         return yytext[0];
 842                                 }
 843
 844 <<EOF>>                 {
 845                                         SET_YYLLOC();
 846                                         yyterminate();
 847                                 }
 848
 849 %%
 850
 851 /*
 852  * scanner_errposition
 853  *              Report a lexer or grammar error cursor position, if possible.
 854  *
 855  * This is expected to be used within an ereport() call.  The return value
 856  * is a dummy (always 0, in fact).
 857  *
 858  * Note that this can only be used for messages emitted during raw parsing
 859  * (essentially, scan.l and gram.y), since it requires scanbuf to still be
 860  * valid.
 861  */
 862 int
 863 scanner_errposition(int location)
 864 {
 865         int             pos;
 866
 867         Assert(scanbuf != NULL);        /* else called from wrong place */
 868         if (location < 0)
 869                 return 0;                               /* no-op if location is unknown */
 870
 871         /* Convert byte offset to character number */
 872         pos = pg_mbstrlen_with_len(scanbuf, location) + 1;
 873         /* And pass it to the ereport mechanism */
 874         return errposition(pos);
 875 }
 876
 877 /*
 878  * yyerror
 879  *              Report a lexer or grammar error.
 880  *
 881  * The message's cursor position identifies the most recently lexed token.
 882  * This is OK for syntax error messages from the Bison parser, because Bison
 883  * parsers report error as soon as the first unparsable token is reached.
 884  * Beware of using yyerror for other purposes, as the cursor position might
 885  * be misleading!
 886  */
 887 void
 888 yyerror(const char *message)
 889 {
 890         const char *loc = scanbuf + yylloc;
 891
 892         if (*loc == YY_END_OF_BUFFER_CHAR)
 893         {
 894                 ereport(ERROR,
 895                                 (errcode(ERRCODE_SYNTAX_ERROR),
 896                                  /* translator: %s is typically the translation of "syntax error" */
 897                                  errmsg("%s at end of input", _(message)),
 898                                  lexer_errposition()));
 899         }
 900         else
 901         {
 902                 ereport(ERROR,
 903                                 (errcode(ERRCODE_SYNTAX_ERROR),
 904                                  /* translator: first %s is typically the translation of "syntax error" */
 905                                  errmsg("%s at or near \"%s\"", _(message), loc),
 906                                  lexer_errposition()));
 907         }
 908 }
 909
 910
 911 /*
 912  * Called before any actual parsing is done
 913  */
 914 void
 915 scanner_init(const char *str)
 916 {
 917         Size    slen = strlen(str);
 918
 919         /*
 920          * Might be left over after ereport()
 921          */
 922         if (YY_CURRENT_BUFFER)
 923                 yy_delete_buffer(YY_CURRENT_BUFFER);
 924
 925         /*
 926          * Make a scan buffer with special termination needed by flex.
 927          */
 928         scanbuf = palloc(slen + 2);
 929         memcpy(scanbuf, str, slen);
 930         scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
 931         scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
 932
 933         /* initialize literal buffer to a reasonable but expansible size */
 934         literalalloc = 1024;
 935         literalbuf = (char *) palloc(literalalloc);
 936         startlit();
 937
 938         BEGIN(INITIAL);
 939 }
 940
 941
 942 /*
 943  * Called after parsing is done to clean up after scanner_init()
 944  */
 945 void
 946 scanner_finish(void)
 947 {
 948         yy_delete_buffer(scanbufhandle);
 949         pfree(scanbuf);
 950         scanbuf = NULL;
 951 }
 952
 953
 954 static void
 955 addlit(char *ytext, int yleng)
 956 {
 957         /* enlarge buffer if needed */
 958         if ((literallen+yleng) >= literalalloc)
 959         {
 960                 do {
 961                         literalalloc *= 2;
 962                 } while ((literallen+yleng) >= literalalloc);
 963                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
 964         }
 965         /* append new data, add trailing null */
 966         memcpy(literalbuf+literallen, ytext, yleng);
 967         literallen += yleng;
 968         literalbuf[literallen] = '\0';
 969 }
 970
 971
 972 static void
 973 addlitchar(unsigned char ychar)
 974 {
 975         /* enlarge buffer if needed */
 976         if ((literallen+1) >= literalalloc)
 977         {
 978                 literalalloc *= 2;
 979                 literalbuf = (char *) repalloc(literalbuf, literalalloc);
 980         }
 981         /* append new data, add trailing null */
 982         literalbuf[literallen] = ychar;
 983         literallen += 1;
 984         literalbuf[literallen] = '\0';
 985 }
 986
 987
 988 /*
 989  * One might be tempted to write pstrdup(literalbuf) instead of this,
 990  * but for long literals this is much faster because the length is
 991  * already known.
 992  */
 993 static char *
 994 litbufdup(void)
 995 {
 996         char *new;
 997
 998         new = palloc(literallen + 1);
 999         memcpy(new, literalbuf, literallen+1);
1000         return new;
1001 }
1002
1003 static int
1004 hexval(unsigned char c)
1005 {
1006         if (c >= '0' && c <= '9')
1007                 return c - '0';
1008         if (c >= 'a' && c <= 'f')
1009                 return c - 'a' + 0xA;
1010         if (c >= 'A' && c <= 'F')
1011                 return c - 'A' + 0xA;
1012         elog(ERROR, "invalid hexadecimal digit");
1013         return 0; /* not reached */
1014 }
1015
1016 static void
1017 check_unicode_value(pg_wchar c, char * loc)
1018 {
1019         if (GetDatabaseEncoding() == PG_UTF8)
1020                 return;
1021
1022         if (c > 0x7F)
1023         {
1024                 yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
1025                 yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1026         }
1027 }
1028
1029 static char *
1030 litbuf_udeescape(unsigned char escape)
1031 {
1032         char *new;
1033         char *in, *out;
1034
1035         if (isxdigit(escape)
1036                 || escape == '+'
1037                 || escape == '\''
1038                 || escape == '"'
1039                 || scanner_isspace(escape))
1040         {
1041                 yylloc += literallen + yyleng + 1;
1042                 yyerror("invalid Unicode escape character");
1043         }
1044
1045         /*
1046          * This relies on the subtle assumption that a UTF-8 expansion
1047          * cannot be longer than its escaped representation.
1048          */
1049         new = palloc(literallen + 1);
1050
1051         in = literalbuf;
1052         out = new;
1053         while (*in)
1054         {
1055                 if (in[0] == escape)
1056                 {
1057                         if (in[1] == escape)
1058                         {
1059                                 *out++ = escape;
1060                                 in += 2;
1061                         }
1062                         else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
1063                         {
1064                                 pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
1065                                 check_unicode_value(unicode, in);
1066                                 unicode_to_utf8(unicode, (unsigned char *) out);
1067                                 in += 5;
1068                                 out += pg_mblen(out);
1069                         }
1070                         else if (in[1] == '+'
1071                                          && isxdigit(in[2]) && isxdigit(in[3])
1072                                          && isxdigit(in[4]) && isxdigit(in[5])
1073                                          && isxdigit(in[6]) && isxdigit(in[7]))
1074                         {
1075                                 pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
1076                                                                         + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
1077                                 check_unicode_value(unicode, in);
1078                                 unicode_to_utf8(unicode, (unsigned char *) out);
1079                                 in += 8;
1080                                 out += pg_mblen(out);
1081                         }
1082                         else
1083                         {
1084                                 yylloc += in - literalbuf + 3;   /* 3 for U&" */
1085                                 yyerror("invalid Unicode escape value");
1086                         }
1087                 }
1088                 else
1089                         *out++ = *in++;
1090         }
1091
1092         *out = '\0';
1093         /*
1094          * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1095          * codes; but it's probably not worth the trouble, since this isn't
1096          * likely to be a performance-critical path.
1097          */
1098         pg_verifymbstr(new, out - new, false);
1099         return new;
1100 }
1101
1102 static unsigned char
1103 unescape_single_char(unsigned char c)
1104 {
1105         switch (c)
1106         {
1107                 case 'b':
1108                         return '\b';
1109                 case 'f':
1110                         return '\f';
1111                 case 'n':
1112                         return '\n';
1113                 case 'r':
1114                         return '\r';
1115                 case 't':
1116                         return '\t';
1117                 default:
1118                         /* check for backslash followed by non-7-bit-ASCII */
1119                         if (c == '\0' || IS_HIGHBIT_SET(c))
1120                                 saw_non_ascii = true;
1121
1122                         return c;
1123         }
1124 }
1125
1126 static void
1127 check_string_escape_warning(unsigned char ychar)
1128 {
1129         if (ychar == '\'')
1130         {
1131                 if (warn_on_first_escape && escape_string_warning)
1132                         ereport(WARNING,
1133                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1134                                          errmsg("nonstandard use of \\' in a string literal"),
1135                                          errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1136                                          lexer_errposition()));
1137                 warn_on_first_escape = false;   /* warn only once per string */
1138         }
1139         else if (ychar == '\\')
1140         {
1141                 if (warn_on_first_escape && escape_string_warning)
1142                         ereport(WARNING,
1143                                         (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1144                                          errmsg("nonstandard use of \\\\ in a string literal"),
1145                                          errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1146                                          lexer_errposition()));
1147                 warn_on_first_escape = false;   /* warn only once per string */
1148         }
1149         else
1150                 check_escape_warning();
1151 }
1152
1153 static void
1154 check_escape_warning(void)
1155 {
1156         if (warn_on_first_escape && escape_string_warning)
1157                 ereport(WARNING,
1158                                 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1159                                  errmsg("nonstandard use of escape in a string literal"),
1160                                  errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1161                                  lexer_errposition()));
1162         warn_on_first_escape = false;   /* warn only once per string */
1163 }