epan/dfilter/scanner.l

   1 %top {
   2 /* Include this before everything else, for various large-file definitions */
   3 #include "config.h"
   4 #define WS_LOG_DOMAIN LOG_DOMAIN_DFILTER
   5 #include <wireshark.h>
   6
   7 #include <stdlib.h>
   8 #include <errno.h>
   9 #include <math.h>
  10
  11 #include <wsutil/str_util.h>
  12
  13 #include "dfilter-int.h"
  14 #include "syntax-tree.h"
  15 #include "grammar.h"
  16 #include "dfunctions.h"
  17 #include "sttype-number.h"
  18 }
  19
  20 /*
  21  * Always generate warnings.
  22  */
  23 %option warn
  24
  25 /*
  26  * We want a reentrant scanner.
  27  */
  28 %option reentrant
  29
  30 /*
  31  * We don't use input, so don't generate code for it.
  32  */
  33 %option noinput
  34
  35 /*
  36  * We don't use unput, so don't generate code for it.
  37  */
  38 %option nounput
  39
  40 /*
  41  * We don't read interactively from the terminal.
  42  */
  43 %option never-interactive
  44
  45 /*
  46  * Prefix scanner routines with "df_yy" rather than "yy", so this scanner
  47  * can coexist with other scanners.
  48  */
  49 %option prefix="df_yy"
  50
  51 /*
  52  * We're reading from a string, so we don't need yywrap.
  53  */
  54 %option noyywrap
  55
  56 /*
  57  * The type for the dfs we keep for a scanner.
  58  */
  59 %option extra-type="dfsyntax_t *"
  60
  61 %{
  62 /*
  63  * Wireshark - Network traffic analyzer
  64  * By Gerald Combs <gerald@wireshark.org>
  65  * Copyright 2001 Gerald Combs
  66  *
  67  * SPDX-License-Identifier: GPL-2.0-or-later
  68  */
  69
  70 /*
  71  * Disable diagnostics in the code generated by Flex.
  72  */
  73 DIAG_OFF_FLEX()
  74
  75 WS_WARN_UNUSED static int set_lval_simple(dfsyntax_t *dfs, int token, const char *token_value, sttype_id_t type_id);
/*
 * Shorthands for rule actions. Each one first advances the scanner location
 * over the matched text (yytext) and then builds the semantic value with
 * set_lval_simple(), yielding the given token code. They differ only in the
 * syntax-tree node type attached to the token: uninitialized, test or
 * arithmetic.
 */
#define simple(token)   (update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_UNINITIALIZED))
#define test(token)     (update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_TEST))
#define math(token)     (update_location(yyextra, yytext), set_lval_simple(yyextra, token, yytext, STTYPE_ARITHMETIC))
  79
  80 WS_WARN_UNUSED static int set_lval_literal(dfsyntax_t *dfs,  const char *value, const char *token_value);
  81 WS_WARN_UNUSED static int set_lval_identifier(dfsyntax_t *dfs,  const char *value, const char *token_value);
  82 WS_WARN_UNUSED static int set_lval_unparsed(dfsyntax_t *dfs, const char *value, const char *token_value);
  83
  84 WS_WARN_UNUSED static int set_lval_field(dfsyntax_t *dfs, const header_field_info *hfinfo, const char *token_value);
  85 WS_WARN_UNUSED static int set_lval_quoted_string(dfsyntax_t *dfs, GString *quoted_string);
  86 WS_WARN_UNUSED static int set_lval_charconst(dfsyntax_t *dfs, GString *quoted_string);
  87 WS_WARN_UNUSED static int set_lval_integer(dfsyntax_t *dfs, const char *value, const char *token_value);
  88 WS_WARN_UNUSED static int set_lval_float(dfsyntax_t *dfs, const char *value, const char *token_value);
  89
  90 static bool append_escaped_char(dfsyntax_t *dfs, GString *str, char c);
  91 static bool append_universal_character_name(dfsyntax_t *dfs, GString *str, const char *ucn);
  92 static bool parse_charconst(dfsyntax_t *dfs, const char *s, unsigned long *valuep);
  93 static bool parse_unsigned_long_long(dfsyntax_t *dfs, const char *s, unsigned long long *valuep, bool set_error);
  94 static bool parse_double(dfsyntax_t *dfs, const char *s, double *valuep);
  95
  96 static void update_location(dfsyntax_t *dfs, const char *text);
  97 static void update_string_loc(dfsyntax_t *dfs, const char *text);
  98
/*
 * Report a scan error from within a rule action (requires yyextra to be in
 * scope). Logs a noisy-level message and records a DF_ERROR_GENERIC failure
 * at the current location using the printf-style arguments. It does not
 * return from the action; callers follow it with "return SCAN_FAILED;".
 */
#define FAIL(...) \
        do { \
                ws_noisy("Scanning failed here."); \
                dfilter_fail(yyextra, DF_ERROR_GENERIC, yyextra->location, __VA_ARGS__); \
        } while (0)
 104
 105 %}
 106
/* An alphabetic character or '_' followed by any number of alphanumerics or '_'. */
FunctionIdentifier      [[:alpha:]_][[:alnum:]_]*
 108
 109 /*
 * Cannot start with '-'. Some protocol names can contain '-', for example "mac-lte".
 111  * Fields that contain '-' anywhere cannot start with a decimal digit.
 112  * Note that some protocol names start with a number, for example "9p". This is
 113  * handled as a special case for numeric patterns.
 114  * Some protocol names contain dots, e.g: _ws.expert
 115  * Protocol or protocol field cannot contain DOTDOT anywhere.
 116  */
/* One name component: starts with an alphanumeric or '_'; later characters may also be '-'. */
VarIdentifier           [[:alnum:]_][[:alnum:]_-]*
/* One or more components, each pair separated by exactly one '.'. */
ProtoFieldIdentifier    {VarIdentifier}(\.{VarIdentifier})*
 119
/* Exactly two hexadecimal digits. */
hex2                    [[:xdigit:]]{2}
/* Six two-digit hex groups separated by ':', '-' or '.' respectively. */
ColonMacAddress         {hex2}:{hex2}:{hex2}:{hex2}:{hex2}:{hex2}
HyphenMacAddress        {hex2}-{hex2}-{hex2}-{hex2}-{hex2}-{hex2}
DotMacAddress           {hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}\.{hex2}

/* Exactly four hexadecimal digits. */
hex4                    [[:xdigit:]]{4}
/* Three four-digit hex groups separated by '.'. */
DotQuadMacAddress       {hex4}\.{hex4}\.{hex4}

/*
 * Byte sequences. ColonBytes also accepts a single byte followed by a
 * trailing ':' ("01:"); the hyphen and dot forms need at least two bytes.
 */
ColonBytes              ({hex2}:)|({hex2}(:{hex2})+)
HyphenBytes             {hex2}(-{hex2})+
DotBytes                {hex2}(\.{hex2})+

/* A decimal number from 0 to 255 without leading zeros. */
DecOctet                [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]
/* Four such numbers separated by '.'. */
IPv4Address             {DecOctet}\.{DecOctet}\.{DecOctet}\.{DecOctet}

/*
 * h16 is one to four hex digits; ls32 is the last 32 bits of an address,
 * written as two h16 groups or as an embedded IPv4 address. Each IPv6Address
 * alternative places the "::" elision at a different position.
 * NOTE(review): the names and structure look like the IPv6address ABNF from
 * RFC 3986 - confirm.
 */
h16                     [0-9A-Fa-f]{1,4}
ls32                    {h16}:{h16}|{IPv4Address}
IPv6Address             ({h16}:){6}{ls32}|::({h16}:){5}{ls32}|({h16})?::({h16}:){4}{ls32}|(({h16}:){0,1}{h16})?::({h16}:){3}{ls32}|(({h16}:){0,2}{h16})?::({h16}:){2}{ls32}|(({h16}:){0,3}{h16})?::{h16}:{ls32}|(({h16}:){0,4}{h16})?::{ls32}|(({h16}:){0,5}{h16})?::{h16}|(({h16}:){0,6}{h16})?::

/* '/' followed by a decimal prefix length; the numeric range is not checked by the pattern. */
V4CidrPrefix            \/[[:digit:]]{1,2}
V6CidrPrefix            \/[[:digit:]]{1,3}

/* Catch all valid semantic values. Cannot contain DOT DOT or start with MINUS. */
StartAlphabet           [[:alnum:]_:]
Alphabet                [[:alnum:]_:/-]
LiteralValue            {StartAlphabet}{Alphabet}*(\.{Alphabet}+)*

/* Exponent suffixes: 'e'/'E' for decimal and 'p'/'P' for hexadecimal floats, with optional sign. */
Exponent                ([eE][+-]?[[:digit:]]+)
HexExponent             ([pP][+-]?[[:digit:]]+)
 149
/*
 * Exclusive start conditions:
 *   RANGE  - between "[" and "]".
 *   LAYER  - directly after "#", where only digits or "[" are accepted.
 *   DQUOTE - inside a double-quoted (possibly raw) string.
 *   SQUOTE - inside a single-quoted character constant.
 */
%x RANGE
%x LAYER
%x DQUOTE
%x SQUOTE
 154
 155 %%
 156
[[:blank:]\n\r]+        {
        /* Whitespace yields no token; only the location is advanced. */
        update_location(yyextra, yytext);
}
 160
"("             return simple(TOKEN_LPAREN);    /* punctuation and any/all: node type is left uninitialized */
")"             return simple(TOKEN_RPAREN);
","             return simple(TOKEN_COMMA);
"{"             return simple(TOKEN_LBRACE);
".."            return simple(TOKEN_DOTDOT);
"}"             return simple(TOKEN_RBRACE);
"$"             return simple(TOKEN_DOLLAR);
"@"             return simple(TOKEN_ATSIGN);
"any"           return simple(TOKEN_ANY);
"all"           return simple(TOKEN_ALL);
 171
"=="            return test(TOKEN_TEST_ANY_EQ); /* "==", "eq" and "any_eq" are synonyms */
"eq"            return test(TOKEN_TEST_ANY_EQ);
"any_eq"        return test(TOKEN_TEST_ANY_EQ);
"!="            return test(TOKEN_TEST_ALL_NE); /* "!=", "ne" and "all_ne" are synonyms */
"ne"            return test(TOKEN_TEST_ALL_NE);
"all_ne"        return test(TOKEN_TEST_ALL_NE);
"==="           return test(TOKEN_TEST_ALL_EQ); /* "===" is the same as "all_eq" */
"all_eq"        return test(TOKEN_TEST_ALL_EQ);
"!=="           return test(TOKEN_TEST_ANY_NE); /* "!==" is the same as "any_ne" */
"any_ne"        return test(TOKEN_TEST_ANY_NE);
">"             return test(TOKEN_TEST_GT);
"gt"            return test(TOKEN_TEST_GT);
">="            return test(TOKEN_TEST_GE);
"ge"            return test(TOKEN_TEST_GE);
"<"             return test(TOKEN_TEST_LT);
"lt"            return test(TOKEN_TEST_LT);
"<="            return test(TOKEN_TEST_LE);
"le"            return test(TOKEN_TEST_LE);
"contains"      return test(TOKEN_TEST_CONTAINS);
"~"             return test(TOKEN_TEST_MATCHES);
"matches"       return test(TOKEN_TEST_MATCHES);
"!"             return test(TOKEN_TEST_NOT);
"not"           return test(TOKEN_TEST_NOT);
"&&"            return test(TOKEN_TEST_AND);
"and"           return test(TOKEN_TEST_AND);
"||"            return test(TOKEN_TEST_OR);
"or"            return test(TOKEN_TEST_OR);
"^^"            return test(TOKEN_TEST_XOR);
"xor"           return test(TOKEN_TEST_XOR);
"in"            return test(TOKEN_TEST_IN);
 202
"+"             return math(TOKEN_PLUS);        /* arithmetic operators: node type STTYPE_ARITHMETIC */
"-"             return math(TOKEN_MINUS);
"*"             return math(TOKEN_STAR);
"/"             return math(TOKEN_RSLASH);
"%"             return math(TOKEN_PERCENT);
"&"             return math(TOKEN_BITWISE_AND); /* "&", "bitand" and "bitwise_and" are synonyms */
"bitand"        return math(TOKEN_BITWISE_AND);
"bitwise_and"   return math(TOKEN_BITWISE_AND);
 211
"#"                             {
        /* What follows '#' is scanned in the LAYER start condition. */
        BEGIN(LAYER);
        return simple(TOKEN_HASH);
}

<LAYER>[[:digit:]]+             {
        /* A run of digits after '#': emitted as TOKEN_INDEX, then back to normal scanning. */
        BEGIN(INITIAL);
        update_location(yyextra, yytext);
        return set_lval_simple(yyextra, TOKEN_INDEX, yytext, STTYPE_UNINITIALIZED);
}

<LAYER>[^[:digit:][]                    {
        /* Any single character other than a digit or '[' is an error here. */
        update_location(yyextra, yytext);
        FAIL("Expected digit or \"[\", not \"%s\"", yytext);
        return SCAN_FAILED;
}
 228
<INITIAL,LAYER>"["              {
        /* Opening bracket, accepted both in normal scanning and right after '#'. */
        BEGIN(RANGE);
        return simple(TOKEN_LBRACKET);
}

<RANGE>[^],]+                   {
        /*
         * Everything up to the next ',' or ']' becomes one TOKEN_RANGE_NODE.
         * The text is passed on as is; it is not interpreted by the scanner.
         */
        update_location(yyextra, yytext);
        return set_lval_simple(yyextra, TOKEN_RANGE_NODE, yytext, STTYPE_UNINITIALIZED);
}

<RANGE>","                      {
        /* Separator between range nodes. */
        return simple(TOKEN_COMMA);
}

<RANGE>"]"                              {
        /* Closing bracket: back to normal scanning. */
        BEGIN(INITIAL);
        return simple(TOKEN_RBRACKET);
}

<RANGE><<EOF>>                          {
        /* Input ended while still inside the brackets. */
        update_location(yyextra, yytext);
        FAIL("The right bracket was missing from a slice.");
        return SCAN_FAILED;
}
 253
[rR]{0,1}\042                   {
        /* start quote of a quoted string, with an optional 'r'/'R' raw-string prefix */
        /*
         * The example of how to scan for strings was taken from
         * the flex manual, from the section "Start Conditions".
         * See: https://westes.github.io/flex/manual/Start-Conditions.html
         */
        BEGIN(DQUOTE);
        update_location(yyextra, yytext);
        /* string_loc starts at the opening quote and is extended by every DQUOTE rule. */
        yyextra->string_loc = yyextra->location;

        /* Accumulates the string contents until the closing quote (or an error). */
        yyextra->quoted_string = g_string_new(NULL);

        if (yytext[0] == 'r' || yytext[0] == 'R') {
                /*
                 * This is a raw string (like in Python). Rules: 1) The two
                 * escape sequences are \\ and \". 2) Backslashes are
                 * preserved. 3) Double quotes in the string must be escaped.
                 * Corollary: Strings cannot end with an odd number of
                 * backslashes.
                 * Example: r"a\b\x12\"\\" is the string (including the implicit NUL terminator)
                 * {'a', '\\', 'b', '\\', 'x', '1', '2', '\\', '"', '\\'. '\\', '\0'}
                 */
                yyextra->raw_string = true;
        }
        else {
                yyextra->raw_string = false;
        }
}
 283
<DQUOTE><<EOF>>                         {
        /* unterminated string: discard what was collected and fail */
        update_string_loc(yyextra, yytext);
        g_string_free(yyextra->quoted_string, TRUE);
        yyextra->quoted_string = NULL;
        FAIL("The final quote was missing from a quoted string.");
        return SCAN_FAILED;
}

<DQUOTE>\042                    {
        /* end quote */
        BEGIN(INITIAL);
        update_string_loc(yyextra, yytext);
        /*
         * The GString itself is handed to set_lval_quoted_string(), so it is
         * not freed here; only the scanner's pointer to it is cleared.
         */
        int token = set_lval_quoted_string(yyextra, yyextra->quoted_string);
        yyextra->quoted_string = NULL;
        /* NOTE(review): -1 presumably marks "no string in progress" - confirm. */
        yyextra->string_loc.col_start = -1;
        return token;
}
 302
<DQUOTE>\\[0-7]{1,3} {
        /* octal sequence */
        update_string_loc(yyextra, yytext);
        if (yyextra->raw_string) {
                /* Raw strings keep the escape text verbatim. */
                g_string_append(yyextra->quoted_string, yytext);
        }
        else {
                unsigned long result;
                /* yytext + 1 skips the backslash. */
                result = strtoul(yytext + 1, NULL, 8);
                /* Three octal digits can encode up to 0777, which does not fit in one byte. */
                if (result > 0xff) {
                        g_string_free(yyextra->quoted_string, TRUE);
                        yyextra->quoted_string = NULL;
                        FAIL("%s is larger than 255.", yytext);
                        return SCAN_FAILED;
                }
                g_string_append_c(yyextra->quoted_string, (char) result);
        }
}

<DQUOTE>\\x[[:xdigit:]]{1,2} {
        /* hex sequence */
        /*
         * C standard does not place a limit on the number of hex
         * digits after \x... but we do. \xNN can have 1 or two Ns, not more.
         */
        update_string_loc(yyextra, yytext);
        if (yyextra->raw_string) {
                /* Raw strings keep the escape text verbatim. */
                g_string_append(yyextra->quoted_string, yytext);
        }
        else {
                unsigned long result;
                /* yytext + 2 skips "\x"; at most two hex digits always fit in one byte. */
                result = strtoul(yytext + 2, NULL, 16);
                g_string_append_c(yyextra->quoted_string, (char) result);
        }
}
 338
<DQUOTE>\\u[[:xdigit:]]{0,4} {
        /* universal character name */
        /*
         * The pattern also matches fewer than four digits; outside raw strings
         * such input is rejected by append_universal_character_name(), which
         * records the error itself.
         */
        update_string_loc(yyextra, yytext);
        if (yyextra->raw_string) {
                g_string_append(yyextra->quoted_string, yytext);
        }
        else if (!append_universal_character_name(yyextra, yyextra->quoted_string, yytext)) {
                g_string_free(yyextra->quoted_string, TRUE);
                yyextra->quoted_string = NULL;
                return SCAN_FAILED;
        }
}

<DQUOTE>\\U[[:xdigit:]]{0,8} {
        /* universal character name */
        /*
         * Same as the \u rule, for the eight-digit form. The error is
         * recorded by append_universal_character_name(), so no FAIL() here.
         */
        update_string_loc(yyextra, yytext);
        if (yyextra->raw_string) {
                g_string_append(yyextra->quoted_string, yytext);
        }
        else if (!append_universal_character_name(yyextra, yyextra->quoted_string, yytext)) {
                g_string_free(yyextra->quoted_string, TRUE);
                yyextra->quoted_string = NULL;
                return SCAN_FAILED;
        }
}
 364
 365
<DQUOTE>\\.                             {
        /* escaped character */
        update_string_loc(yyextra, yytext);
        if (yyextra->raw_string) {
                /* Raw strings keep both the backslash and the following character. */
                g_string_append(yyextra->quoted_string, yytext);
        }
        else if (!append_escaped_char(yyextra, yyextra->quoted_string, yytext[1])) {
                /* append_escaped_char() has already recorded the error. */
                g_string_free(yyextra->quoted_string, TRUE);
                yyextra->quoted_string = NULL;
                return SCAN_FAILED;
        }
}

<DQUOTE>[^\\\042]+                      {
        /* non-escaped string: any run of characters other than backslash and double quote */
        update_string_loc(yyextra, yytext);
        g_string_append(yyextra->quoted_string, yytext);
}
 384
 385
\047                            {
        /* start quote of a quoted character value */
        BEGIN(SQUOTE);
        update_location(yyextra, yytext);
        yyextra->string_loc = yyextra->location;

        /*
         * Unlike double-quoted strings, the text is collected verbatim with
         * both quotes included; it is decoded later by parse_charconst()
         * (called from set_lval_charconst()).
         */
        yyextra->quoted_string = g_string_new("'");
}

<SQUOTE><<EOF>>                         {
        /* unterminated character value */
        update_string_loc(yyextra, yytext);
        g_string_free(yyextra->quoted_string, TRUE);
        yyextra->quoted_string = NULL;
        FAIL("The final quote was missing from a character constant.");
        return SCAN_FAILED;
}

<SQUOTE>\047                    {
        /* end quote */
        BEGIN(INITIAL);
        update_string_loc(yyextra, yytext);
        g_string_append_c(yyextra->quoted_string, '\'');
        /* set_lval_charconst() releases the GString wrapper, so only the pointer is cleared here. */
        int token = set_lval_charconst(yyextra, yyextra->quoted_string);
        yyextra->quoted_string = NULL;
        yyextra->string_loc.col_start = -1;
        return token;
}

<SQUOTE>\\.                             {
        /* escaped character: kept as written, backslash included */
        update_string_loc(yyextra, yytext);
        g_string_append(yyextra->quoted_string, yytext);
}

<SQUOTE>[^\\\047]+                      {
        /* non-escaped string */
        update_string_loc(yyextra, yytext);
        g_string_append(yyextra->quoted_string, yytext);
}
 426
 427         /* NOTE: None of the patterns below can match ".." anywhere in the token string. */
 428
 429         /* MAC address. */
 430
{ColonMacAddress}|{HyphenMacAddress}    {
        /* MAC Address (colon or hyphen separated); passed on as an uninterpreted literal. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext, yytext);
}

        /* IP address. */

{IPv4Address}{V4CidrPrefix}?            {
        /* IPv4 with or without prefix. The prefix stays part of the literal text. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext, yytext);
}

{IPv6Address}{V6CidrPrefix}?            {
        /* IPv6 with or without prefix. The prefix stays part of the literal text. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext, yytext);
}
 450
 451         /* Integer */
 452
[[:digit:]][[:digit:]]* {
        /* Numeric or field. */
        update_location(yyextra, yytext);
        /* Check if we have a protocol or protocol field, otherwise assume a literal. */
        /* It is only reasonable to assume a literal here, instead of a
         * (possibly non-existent) protocol field, because protocol field filter names
         * should not start with a digit (the lexical syntax for numbers). */
        header_field_info *hfinfo = dfilter_resolve_unparsed(yytext, yyextra->deprecated);
        if (hfinfo != NULL) {
                return set_lval_field(yyextra, hfinfo, yytext);
        }
        return set_lval_integer(yyextra, yytext, yytext);
}

0[bBxX]?[[:xdigit:]]+   {
        /* Binary or octal or hexadecimal. */
        /*
         * No field lookup here. If the text does not parse as an unsigned
         * integer, set_lval_integer() returns it as a literal instead of
         * failing.
         */
        update_location(yyextra, yytext);
        return set_lval_integer(yyextra, yytext, yytext);
}
 472
 473         /* Floating point. */
 474
[[:digit:]]+\.[[:digit:]]+      {
        /* Decimal float. */
        /*
         * NOTE(review): the next rule also lists digits '.' digits without an
         * exponent. Flex picks the earlier rule when two rules match the same
         * length, so that alternative (and its field lookup) appears to be
         * shadowed by this rule - confirm this is intended.
         */
        update_location(yyextra, yytext);
        return set_lval_float(yyextra, yytext, yytext);
}

[[:digit:]]+{Exponent}|[[:digit:]]+\.[[:digit:]]+{Exponent}?    {
        /* Decimal float with optional exponent. */
        /* Significand cannot have any side omitted. */
        update_location(yyextra, yytext);
        /* Check if we have a protocol or protocol field, otherwise assume a literal. */
        /* It is only reasonable to assume a literal here, instead of a
         * (possibly non-existent) protocol field, because protocol field filter names
         * should not start with a digit (the lexical syntax for numbers). */
        header_field_info *hfinfo = dfilter_resolve_unparsed(yytext, yyextra->deprecated);
        if (hfinfo != NULL) {
                return set_lval_field(yyextra, hfinfo, yytext);
        }
        return set_lval_float(yyextra, yytext, yytext);
}

0[xX][[:xdigit:]]+{HexExponent}|0[xX][[:xdigit:]]+\.[[:xdigit:]]+{HexExponent}? {
        /* Hexadecimal float with optional exponent. Can't be a field because
         * a field cannot begin with 0x. */
        /* Significand cannot have any side omitted. */
        update_location(yyextra, yytext);
        return set_lval_float(yyextra, yytext, yytext);
}

(?i:inf)(?i:inity)? {
        /* Infinity: "inf" or "infinity", matched case-insensitively. */
        update_location(yyextra, yytext);
        return set_lval_float(yyextra, yytext, yytext);
}

(?i:nan)(\([[:alnum:]_]*\))? {
        /* NaNs (including quiet NaNs), optionally followed by a parenthesized character sequence. */
        update_location(yyextra, yytext);
        return set_lval_float(yyextra, yytext, yytext);
}
 515
:[[:xdigit:]]+  {
        /* Numeric prefixed with ':'. */
        /* The ':' is dropped from the literal value but kept in the token text. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext + 1, yytext);
}
 521
 522         /* Bytes. */
 523
:?{ColonBytes}  {
        /* Bytes. The ':' prefix is optional for the colon-separated form. */
        update_location(yyextra, yytext);
        if (yytext[0] == ':')
                /* Drop the prefix from the value; the token text keeps it. */
                return set_lval_literal(yyextra, yytext + 1, yytext);
        return set_lval_literal(yyextra, yytext, yytext);
}

:{HyphenBytes}  {
        /* Bytes. The hyphen-separated form is only matched here with the ':' prefix. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext + 1, yytext);
}

:{DotBytes}     {
        /* DotBytes, can be a field without ':' prefix. */
        update_location(yyextra, yytext);
        return set_lval_literal(yyextra, yytext + 1, yytext);
}
 543
 544         /* Identifier (protocol/field/function name). */
 545
 546         /* This must come before FieldIdentifier to match function names. */
{FunctionIdentifier}    {
        /* Identifier (field or function) or literal (bytes without separator). */
        /* Nothing is resolved here; the text is emitted as TOKEN_IDENTIFIER with the unparsed flag set. */
        update_location(yyextra, yytext);
        return set_lval_identifier(yyextra, yytext, yytext);
}

\.{ProtoFieldIdentifier}        {
        /* Identifier, prefixed with a '.', must be a field, no ifs or buts. */
        update_location(yyextra, yytext);
        /* Look the name up without the leading '.'; the token text keeps it. */
        const char *name = yytext + 1;
        header_field_info *hfinfo = dfilter_resolve_unparsed(name, yyextra->deprecated);
        if (hfinfo == NULL) {
                FAIL("\"%s\" is not a valid protocol or protocol field.", name);
                return SCAN_FAILED;
        }
        return set_lval_field(yyextra, hfinfo, yytext);
}

{ProtoFieldIdentifier}  {
        /* Catch-all for protocol values. Can also be a literal. */
        /* Emitted as TOKEN_UNPARSED; no lookup is done in the scanner. */
        update_location(yyextra, yytext);
        return set_lval_unparsed(yyextra, yytext, yytext);
}

{LiteralValue}  {
        /* Catch-all for semantic values. */
        update_location(yyextra, yytext);
        /* We use literal here because identifiers (using unparsed) should have
         * matched one of the previous rules. */
        return set_lval_literal(yyextra, yytext, yytext);
}
 578
. {
        /* Default: any single character that no other rule matched is an error. */
        update_location(yyextra, yytext);
        /* Only echo the offending text when it is printable. */
        if (isprint_string(yytext))
                FAIL("\"%s\" was unexpected in this context.", yytext);
        else
                FAIL("Non-printable ASCII characters may only appear inside double-quotes.");
        return SCAN_FAILED;
}
 588
 589 %%
 590
 591 /*
 592  * Turn diagnostics back on, so we check the code that we've written.
 593  */
 594 DIAG_ON_FLEX()
 595
 596 static void
 597 _update_location(dfsyntax_t *dfs, size_t len)
 598 {
 599         dfs->location.col_start += (long)dfs->location.col_len;
 600         dfs->location.col_len = len;
 601 }
 602
 603 static void
 604 update_location(dfsyntax_t *dfs, const char *text)
 605 {
 606         _update_location(dfs, strlen(text));
 607 }
 608
 609 static void
 610 update_string_loc(dfsyntax_t *dfs, const char *text)
 611 {
 612         size_t len = strlen(text);
 613         dfs->string_loc.col_len += len;
 614         _update_location(dfs, len);
 615 }
 616
 617 static int
 618 set_lval_simple(dfsyntax_t *dfs, int token, const char *token_value, sttype_id_t type_id)
 619 {
 620         dfs->lval = stnode_new(type_id, NULL, g_strdup(token_value), dfs->location);
 621         return token;
 622 }
 623
 624 static int
 625 set_lval_literal(dfsyntax_t *dfs, const char *value, const char *token_value)
 626 {
 627         dfs->lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), dfs->location);
 628         return TOKEN_LITERAL;
 629 }
 630
 631 static int
 632 set_lval_identifier(dfsyntax_t *dfs, const char *value, const char *token_value)
 633 {
 634         dfs->lval = stnode_new(STTYPE_UNPARSED, g_strdup(value), g_strdup(token_value), dfs->location);
 635         stnode_set_flags(dfs->lval, STFLAG_UNPARSED);
 636         return TOKEN_IDENTIFIER;
 637 }
 638
 639 static int
 640 set_lval_unparsed(dfsyntax_t *dfs, const char *value, const char *token_value)
 641 {
 642         dfs->lval = stnode_new(STTYPE_UNPARSED, g_strdup(value), g_strdup(token_value), dfs->location);
 643         stnode_set_flags(dfs->lval, STFLAG_UNPARSED);
 644         return TOKEN_UNPARSED;
 645 }
 646
 647 static int
 648 set_lval_field(dfsyntax_t *dfs, const header_field_info *hfinfo, const char *token_value)
 649 {
 650         dfs->lval = stnode_new(STTYPE_FIELD, (void *)hfinfo, g_strdup(token_value), dfs->location);
 651         return TOKEN_FIELD;
 652 }
 653
 654 static int
 655 set_lval_quoted_string(dfsyntax_t *dfs, GString *quoted_string)
 656 {
 657         char *token_value;
 658
 659         token_value = ws_escape_string_len(NULL, quoted_string->str, quoted_string->len, true);
 660         dfs->lval = stnode_new(STTYPE_STRING, quoted_string, token_value, dfs->string_loc);
 661         return TOKEN_STRING;
 662 }
 663
 664 static int
 665 set_lval_charconst(dfsyntax_t *dfs, GString *quoted_string)
 666 {
 667         unsigned long number;
 668         bool ok;
 669
 670         char *token_value = g_string_free(quoted_string, FALSE);
 671         ok = parse_charconst(dfs, token_value, &number);
 672         if (!ok) {
 673                 g_free(token_value);
 674                 return SCAN_FAILED;
 675         }
 676         dfs->lval = stnode_new(STTYPE_CHARCONST, g_memdup2(&number, sizeof(number)), token_value, dfs->string_loc);
 677         return TOKEN_CHARCONST;
 678 }
 679
 680 static int
 681 set_lval_integer(dfsyntax_t *dfs, const char *value, const char *token_value)
 682 {
 683         unsigned long long number;
 684         bool ok;
 685
 686         ok = parse_unsigned_long_long(dfs, value, &number, false);
 687         if (!ok) {
 688                 /* Instead of failing assume this is a literal such as
 689                   "10f3deccc00d5c8f629fba7a0fff34aa" that can be interpreted
 690                   as a literal bytes valid. */
 691                 dfs->lval = stnode_new(STTYPE_LITERAL, g_strdup(value), g_strdup(token_value), dfs->location);
 692                 return TOKEN_LITERAL;
 693         }
 694         dfs->lval = stnode_new(STTYPE_NUMBER, NULL, g_strdup(token_value), dfs->location);
 695         sttype_number_set_unsigned(dfs->lval, number);
 696         return TOKEN_NUMBER;
 697 }
 698
 699 static int
 700 set_lval_float(dfsyntax_t *dfs, const char *value, const char *token_value)
 701 {
 702         double number;
 703         bool ok;
 704
 705         ok = parse_double(dfs, value, &number);
 706         if (!ok) {
 707                 return SCAN_FAILED;
 708         }
 709         dfs->lval = stnode_new(STTYPE_NUMBER, NULL, g_strdup(token_value), dfs->location);
 710         sttype_number_set_float(dfs->lval, number);
 711         return TOKEN_NUMBER;
 712 }
 713
 714 static bool
 715 append_escaped_char(dfsyntax_t *dfs, GString *str, char c)
 716 {
 717         switch (c) {
 718                 case 'a':
 719                         c = '\a';
 720                         break;
 721                 case 'b':
 722                         c = '\b';
 723                         break;
 724                 case 'f':
 725                         c = '\f';
 726                         break;
 727                 case 'n':
 728                         c = '\n';
 729                         break;
 730                 case 'r':
 731                         c = '\r';
 732                         break;
 733                 case 't':
 734                         c = '\t';
 735                         break;
 736                 case 'v':
 737                         c = '\v';
 738                         break;
 739                 case '\\':
 740                 case '\'':
 741                 case '\"':
 742                         break;
 743                 default:
 744                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->location,
 745                                         "\\%c is not a valid character escape sequence", c);
 746                         return false;
 747         }
 748
 749         g_string_append_c(str, c);
 750         return true;
 751 }
 752
 753 static bool
 754 parse_universal_character_name(dfsyntax_t *dfs _U_, const char *str, char **ret_endptr, gunichar *valuep)
 755 {
 756         uint64_t val;
 757         char *endptr;
 758         int ndigits;
 759
 760         if (str[0] != '\\')
 761                 return false;
 762
 763         if (str[1] == 'u')
 764                 ndigits = 4;
 765         else if (str[1] == 'U')
 766                 ndigits = 8;
 767         else
 768                 return false;
 769
 770         for (int i = 2; i < ndigits + 2; i++) {
 771                 if (!g_ascii_isxdigit(str[i])) {
 772                         return false;
 773                 }
 774         }
 775
 776         errno = 0;
 777         val = g_ascii_strtoull(str + 2, &endptr, 16); /* skip leading 'u' or 'U' */
 778
 779         if (errno != 0 || endptr == str || val > UINT32_MAX) {
 780                 return false;
 781         }
 782
 783         /*
 784          * Ref: https://en.cppreference.com/w/c/language/escape
 785          * Range of universal character names
 786          *
 787          * If a universal character name corresponds to a code point that is
 788          * not 0x24 ($), 0x40 (@), nor 0x60 (`) and less than 0xA0, or a
 789          * surrogate code point (the range 0xD800-0xDFFF, inclusive), or
 790          * greater than 0x10FFFF, i.e. not a Unicode code point (since C23),
 791          * the program is ill-formed. In other words, members of basic source
 792          * character set and control characters (in ranges 0x0-0x1F and
 793          * 0x7F-0x9F) cannot be expressed in universal character names.
 794          */
 795         if (val < 0xA0 && val != 0x24 && val != 0x40 && val != 0x60)
 796                 return false;
 797         else if (val >= 0xD800 && val <= 0xDFFF)
 798                 return false;
 799         else if (val > 0x10FFFF)
 800                 return false;
 801
 802         *valuep = (gunichar)val;
 803         if (ret_endptr)
 804                 *ret_endptr = endptr;
 805         return true;
 806 }
 807
 808 static bool
 809 append_universal_character_name(dfsyntax_t *dfs, GString *str, const char *ucn)
 810 {
 811         gunichar val;
 812
 813         if (!parse_universal_character_name(dfs, ucn, NULL, &val)) {
 814                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->location, "%s is not a valid universal character name", ucn);
 815                 return false;
 816         }
 817
 818         g_string_append_unichar(str, val);
 819         return true;
 820 }
 821
 822 static bool
 823 parse_charconst(dfsyntax_t *dfs, const char *s, unsigned long *valuep)
 824 {
 825         const char *cp;
 826         unsigned long value;
 827         gunichar unival;
 828         char *endptr;
 829
 830         cp = s + 1;     /* skip the leading ' */
 831         if (*cp == '\'') {
 832                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "Empty character constant.");
 833                 return false;
 834         }
 835
 836         if (*cp == '\\') {
 837                 /*
 838                  * C escape sequence.
 839                  * An escape sequence is an octal number \NNN,
 840                  * an hex number \xNN, or one of \' \" \\ \a \b \f \n \r \t \v
 841                  * that stands for the byte value of the equivalent
 842                  * C-escape in ASCII encoding.
 843                  */
 844                 cp++;
 845                 switch (*cp) {
 846
 847                 case '\0':
 848                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 849                         return false;
 850
 851                 case 'a':
 852                         value = '\a';
 853                         cp++;
 854                         break;
 855
 856                 case 'b':
 857                         value = '\b';
 858                         cp++;
 859                         break;
 860
 861                 case 'f':
 862                         value = '\f';
 863                         cp++;
 864                         break;
 865
 866                 case 'n':
 867                         value = '\n';
 868                         break;
 869
 870                 case 'r':
 871                         value = '\r';
 872                         cp++;
 873                         break;
 874
 875                 case 't':
 876                         value = '\t';
 877                         cp++;
 878                         break;
 879
 880                 case 'v':
 881                         value = '\v';
 882                         cp++;
 883                         break;
 884
 885                 case '\'':
 886                         value = '\'';
 887                         cp++;
 888                         break;
 889
 890                 case '\\':
 891                         value = '\\';
 892                         cp++;
 893                         break;
 894
 895                 case '"':
 896                         value = '"';
 897                         cp++;
 898                         break;
 899
 900                 case 'x':
 901                         cp++;
 902                         if (*cp >= '0' && *cp <= '9')
 903                                 value = *cp - '0';
 904                         else if (*cp >= 'A' && *cp <= 'F')
 905                                 value = 10 + (*cp - 'A');
 906                         else if (*cp >= 'a' && *cp <= 'f')
 907                                 value = 10 + (*cp - 'a');
 908                         else {
 909                                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 910                                 return false;
 911                         }
 912                         cp++;
 913                         if (*cp != '\'') {
 914                                 value <<= 4;
 915                                 if (*cp >= '0' && *cp <= '9')
 916                                         value |= *cp - '0';
 917                                 else if (*cp >= 'A' && *cp <= 'F')
 918                                         value |= 10 + (*cp - 'A');
 919                                 else if (*cp >= 'a' && *cp <= 'f')
 920                                         value |= 10 + (*cp - 'a');
 921                                 else {
 922                                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 923                                         return false;
 924                                 }
 925                                 cp++;
 926                         }
 927                         break;
 928
 929                 case 'u':
 930                 case 'U':
 931                         if (!parse_universal_character_name(dfs, s+1, &endptr, &unival)) {
 932                                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is not a valid universal character name", s);
 933                                 return false;
 934                         }
 935                         value = (unsigned long)unival;
 936                         cp = endptr;
 937                         break;
 938
 939                 default:
 940                         /* Octal */
 941                         if (*cp >= '0' && *cp <= '7')
 942                                 value = *cp - '0';
 943                         else {
 944                                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 945                                 return false;
 946                         }
 947                         if (*(cp + 1) != '\'') {
 948                                 cp++;
 949                                 value <<= 3;
 950                                 if (*cp >= '0' && *cp <= '7')
 951                                         value |= *cp - '0';
 952                                 else {
 953                                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 954                                         return false;
 955                                 }
 956                                 if (*(cp + 1) != '\'') {
 957                                         cp++;
 958                                         value <<= 3;
 959                                         if (*cp >= '0' && *cp <= '7')
 960                                                 value |= *cp - '0';
 961                                         else {
 962                                                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s isn't a valid character constant.", s);
 963                                                 return false;
 964                                         }
 965                                 }
 966                         }
 967                         if (value > 0xFF) {
 968                                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is too large to be a valid character constant.", s);
 969                                 return false;
 970                         }
 971                         cp++;
 972                 }
 973         } else {
 974                 value = *cp++;
 975                 if (!g_ascii_isprint(value)) {
 976                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "Non-printable value '0x%02lx' in character constant.", value);
 977                         return false;
 978                 }
 979         }
 980
 981         if ((*cp != '\'') || (*(cp + 1) != '\0')){
 982                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "%s is too long to be a valid character constant.", s);
 983                 return false;
 984         }
 985
 986         *valuep = value;
 987         return true;
 988 }
 989
 990 static bool
 991 parse_unsigned_long_long(dfsyntax_t *dfs, const char *s, unsigned long long *valuep, bool set_error)
 992 {
 993         char *endptr;
 994
 995         errno = 0;
 996         if (s[0] == '0' && (s[1] == 'b' || s[1] == 'B')) {
 997                 *valuep = g_ascii_strtoull(s + 2, &endptr, 2);
 998         }
 999         else {
1000                 *valuep = g_ascii_strtoull(s, &endptr, 0);
1001         }
1002
1003         if (errno == EINVAL || endptr == s || *endptr != '\0') {
1004                 /* This isn't a valid number. */
1005                 if (set_error)
1006                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" is not a valid number.", s);
1007                 return false;
1008         }
1009         if (errno == ERANGE) {
1010                 if (set_error)
1011                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" is too large to be represented as a 64-bit number.", s);
1012                 return false;
1013         }
1014         if (errno != 0) {
1015                 // Should not happen
1016                 if (set_error)
1017                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" is not a valid number (%s).", s, g_strerror(errno));
1018                 return false;
1019         }
1020
1021         return true;
1022 }
1023
1024 static bool
1025 parse_double(dfsyntax_t *dfs, const char *s, double *valuep)
1026 {
1027         char *endptr = NULL;
1028
1029         errno = 0;
1030         *valuep = g_ascii_strtod(s, &endptr);
1031
1032         if (endptr == s || *endptr != '\0') {
1033                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" is not a valid floating-point number.", s);
1034                 return false;
1035         }
1036         if (errno == ERANGE) {
1037                 if (*valuep == HUGE_VAL) {
1038                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" causes floating-point overflow.", s);
1039                 }
1040                 else {
1041                         dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc, "\"%s\" causes floating-point underflow.", s);
1042                 }
1043                 return false;
1044         }
1045         if (errno != 0) {
1046                 // Should not happen
1047                 dfilter_fail(dfs, DF_ERROR_GENERIC, dfs->string_loc,
1048                                 "\"%s\" is not a valid floating-point number (%s).",
1049                                 s, g_strerror(errno));
1050                 return false;
1051         }
1052
1053         return true;
1054 }