gcc/input.cc

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2025 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
/* Return the file name reported for built-in locations, passed through
   the _() message macro.  expand_location_1 stores it in xloc.file for
   locations at or below BUILTINS_LOCATION other than UNKNOWN_LOCATION.  */
const char *
special_fname_builtin ()
{
  return _("<built-in>");
}
  37
  38 /* Input charset configuration.  */
static const char *default_charset_callback (const char *)
{
  /* The file path argument is ignored.  A null result makes
     file_cache_slot::create skip the charset conversion branch.  */
  return nullptr;
}
  43
  44 void
  45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  46                                       bool should_skip_bom)
  47 {
  48   m_input_context.ccb = (ccb ? ccb : default_charset_callback);
  49   m_input_context.should_skip_bom = should_skip_bom;
  50 }
  51
  52 /* This is a cache used by get_next_line to store the content of a
  53    file to be searched for file lines.  */
class file_cache_slot
{
public:
  file_cache_slot ();
  ~file_cache_slot ();

  /* Write a description of this slot to OUT, indenting by INDENT.  */
  void dump (FILE *out, int indent) const;
  void DEBUG_FUNCTION dump () const { dump (stderr, 0); }

  bool read_line_num (size_t line_num,
                      char ** line, ssize_t *line_len);

  /* Accessors.  */
  const char *get_file_path () const { return m_file_path; }
  unsigned get_use_count () const { return m_use_count; }
  bool missing_trailing_newline_p () const
  {
    return m_missing_trailing_newline;
  }
  char_span get_full_file_content ();

  void inc_use_count () { m_use_count++; }

  /* Slot life cycle: populate the slot for a file, empty it again, or
     fill it from an in-memory buffer.  */
  bool create (const file_cache::input_context &in_context,
               const char *file_path, FILE *fp, unsigned highest_use_count);
  void evict ();
  void set_content (const char *buf, size_t sz);

 private:
  /* This is the information used to store a line boundary.  */
  class line_info
  {
  public:
    /* The line number.  It starts from 1.  */
    size_t line_num;

    /* The position (byte count) of the beginning of the line,
       relative to the file data pointer.  This starts at zero.  */
    size_t start_pos;

    /* The position (byte count) of the last byte of the line.  This
       normally points to the '\n' character, or to one byte after the
       last byte of the file, if the file doesn't contain a '\n'
       character.  */
    size_t end_pos;

    line_info (size_t l, size_t s, size_t e)
      : line_num (l), start_pos (s), end_pos (e)
    {}

    line_info ()
      :line_num (0), start_pos (0), end_pos (0)
    {}
  };

  bool needs_read_p () const;
  bool needs_grow_p () const;
  void maybe_grow ();
  bool read_data ();
  bool maybe_read_data ();
  bool get_next_line (char **line, ssize_t *line_len);
  bool read_next_line (char ** line, ssize_t *line_len);
  bool goto_next_line ();

  /* Initial size, in bytes, of the data buffer (see maybe_grow).  */
  static const size_t buffer_size = 4 * 1024;
  /* Maximum number of entries kept in m_line_record.  */
  static const size_t line_record_size = 100;

  /* The number of times this file has been accessed.  This is used
     to designate which file cache to evict from the cache
     array.  */
  unsigned m_use_count;

  /* The file_path is the key for identifying a particular file in
     the cache.
     For libcpp-using code, the underlying buffer for this field is
     owned by the corresponding _cpp_file within the cpp_reader.  */
  const char *m_file_path;

  /* The stream the content is read from, or NULL when there is nothing
     (more) to read from a file.  */
  FILE *m_fp;

  /* True when a read error happened.  */
  bool m_error;

  /* This points to the content of the file that we've read so
     far.  */
  char *m_data;

  /* The allocated buffer to be freed may start a little earlier than DATA,
     e.g. if a UTF8 BOM was skipped at the beginning.  */
  int m_alloc_offset;

  /* The size of the DATA array above.  */
  size_t m_size;

  /* The number of bytes read from the underlying file so far.  This
     must be less than (or equal to) SIZE above.  */
  size_t m_nb_read;

  /* The index of the beginning of the current line.  */
  size_t m_line_start_idx;

  /* The number of the previous line read.  This starts at 1.  Zero
     means we've read no line so far.  */
  size_t m_line_num;

  /* This is the total number of lines of the current file.  At the
     moment, we try to get this information from the line map
     subsystem.  Note that this is just a hint.  When using the C++
     front-end, this hint is correct because the input file is then
     completely tokenized before parsing starts; so the line map knows
     the number of lines before compilation really starts.  For e.g,
     the C front-end, it can happen that we start emitting diagnostics
     before the line map has seen the end of the file.  */
  size_t m_total_lines;

  /* Could this file be missing a trailing newline on its final line?
     Initially true (to cope with empty files), set to true/false
     as each line is read.  */
  bool m_missing_trailing_newline;

  /* This is a record of the beginning and end of the lines we've seen
     while reading the file.  This is useful to avoid walking the data
     from the beginning when we are asked to read a line that is
     before LINE_START_IDX above.  Note that the maximum size of this
     record is line_record_size, so that the memory consumption
     doesn't explode.  We thus scale total_lines down to
     line_record_size.  */
  vec<line_info, va_heap> m_line_record;

  /* Move the start of the usable data by OFFSET bytes (negative moves
     it back towards the start of the allocation).  m_alloc_offset and
     m_size are adjusted in step, so m_data - m_alloc_offset keeps
     designating the same address.  A negative OFFSET must not go past
     the start of the allocation and a positive one must not exceed
     m_size; m_data must be non-null.  */
  void offset_buffer (int offset)
  {
    gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
                : (size_t) offset <= m_size);
    gcc_assert (m_data);
    m_alloc_offset += offset;
    m_data += offset;
    m_size -= offset;
  }

};
 194
/* Forward declaration; the definition appears later in this file, after
   its first use in file_cache_slot::set_content.  */
static const char *
find_end_of_line (const char *s, size_t len);

/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The line map set; total_lines_num below queries it for the highest
   location seen in a given file.  */
class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;
 210
 211 /* Expand the source location LOC into a human readable location.  If
 212    LOC resolves to a builtin location, the file name of the readable
 213    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 214    TRUE and LOC is virtual, then it is resolved to the expansion
 215    point of the involved macro.  Otherwise, it is resolved to the
 216    spelling location of the token.
 217
 218    When resolving to the spelling location of the token, if the
 219    resulting location is for a built-in location (that is, it has no
 220    associated line/column) in the context of a macro expansion, the
 221    returned location is the first one (while unwinding the macro
 222    location towards its expansion point) that is in real source
 223    code.
 224
 225    ASPECT controls which part of the location to use.  */
 226
 227 static expanded_location
 228 expand_location_1 (const line_maps *set,
 229                    location_t loc,
 230                    bool expansion_point_p,
 231                    enum location_aspect aspect)
 232 {
 233   expanded_location xloc;
 234   const line_map_ordinary *map;
 235   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 236   tree block = NULL;
 237
 238   if (IS_ADHOC_LOC (loc))
 239     {
 240       block = LOCATION_BLOCK (loc);
 241       loc = LOCATION_LOCUS (loc);
 242     }
 243
 244   memset (&xloc, 0, sizeof (xloc));
 245
 246   if (loc >= RESERVED_LOCATION_COUNT)
 247     {
 248       if (!expansion_point_p)
 249         {
 250           /* We want to resolve LOC to its spelling location.
 251
 252              But if that spelling location is a reserved location that
 253              appears in the context of a macro expansion (like for a
 254              location for a built-in token), let's consider the first
 255              location (toward the expansion point) that is not reserved;
 256              that is, the first location that is in real source code.  */
 257           loc = linemap_unwind_to_first_non_reserved_loc (set,
 258                                                           loc, NULL);
 259           lrk = LRK_SPELLING_LOCATION;
 260         }
 261       loc = linemap_resolve_location (set, loc, lrk, &map);
 262
 263       /* loc is now either in an ordinary map, or is a reserved location.
 264          If it is a compound location, the caret is in a spelling location,
 265          but the start/finish might still be a virtual location.
 266          Depending of what the caller asked for, we may need to recurse
 267          one level in order to resolve any virtual locations in the
 268          end-points.  */
 269       switch (aspect)
 270         {
 271         default:
 272           gcc_unreachable ();
 273           /* Fall through.  */
 274         case LOCATION_ASPECT_CARET:
 275           break;
 276         case LOCATION_ASPECT_START:
 277           {
 278             location_t start = get_start (loc);
 279             if (start != loc)
 280               return expand_location_1 (set, start, expansion_point_p, aspect);
 281           }
 282           break;
 283         case LOCATION_ASPECT_FINISH:
 284           {
 285             location_t finish = get_finish (loc);
 286             if (finish != loc)
 287               return expand_location_1 (set, finish, expansion_point_p, aspect);
 288           }
 289           break;
 290         }
 291       xloc = linemap_expand_location (set, map, loc);
 292     }
 293
 294   xloc.data = block;
 295   if (loc <= BUILTINS_LOCATION)
 296     xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
 297
 298   return xloc;
 299 }
 300
/* Return the total number of lines of FILE_PATH that have been read so
   far by the line map (in the preprocessor).  For languages like C++ that
   entirely preprocess the input file before starting to parse, this
   equals the actual number of lines of the file.  */
 305
 306 static size_t
 307 total_lines_num (const char *file_path)
 308 {
 309   size_t r = 0;
 310   location_t l = 0;
 311   if (linemap_get_file_highest_location (line_table, file_path, &l))
 312     {
 313       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 314       expanded_location xloc = expand_location (l);
 315       r = xloc.line;
 316     }
 317   return r;
 318 }
 319
 320 /* Lookup the cache used for the content of a given file accessed by
 321    caret diagnostic.  Return the found cached file, or NULL if no
 322    cached file was found.  */
 323
 324 file_cache_slot *
 325 file_cache::lookup_file (const char *file_path)
 326 {
 327   gcc_assert (file_path);
 328
 329   /* This will contain the found cached file.  */
 330   file_cache_slot *r = NULL;
 331   for (unsigned i = 0; i < num_file_slots; ++i)
 332     {
 333       file_cache_slot *c = &m_file_slots[i];
 334       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 335         {
 336           c->inc_use_count ();
 337           r = c;
 338         }
 339     }
 340
 341   if (r)
 342     r->inc_use_count ();
 343
 344   return r;
 345 }
 346
 347 /* Purge any mention of FILENAME from the cache of files used for
 348    printing source code.  For use in selftests when working
 349    with tempfiles.  */
 350
 351 void
 352 file_cache::forcibly_evict_file (const char *file_path)
 353 {
 354   gcc_assert (file_path);
 355
 356   file_cache_slot *r = lookup_file (file_path);
 357   if (!r)
 358     /* Not found.  */
 359     return;
 360
 361   r->evict ();
 362 }
 363
/* Determine if FILE_PATH is missing a trailing newline on its final line.
   Only valid to call once all of the file has been loaded, by
   requesting a line number beyond the end of the file.  */
 367
 368 bool
 369 file_cache::missing_trailing_newline_p (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_or_add_file (file_path);
 374   return r->missing_trailing_newline_p ();
 375 }
 376
 377 void
 378 file_cache::add_buffered_content (const char *file_path,
 379                                   const char *buffer,
 380                                   size_t sz)
 381 {
 382   gcc_assert (file_path);
 383
 384   file_cache_slot *r = lookup_file (file_path);
 385   if (!r)
 386     {
 387       unsigned highest_use_count = 0;
 388       r = evicted_cache_tab_entry (&highest_use_count);
 389       if (!r->create (m_input_context, file_path, nullptr, highest_use_count))
 390         return;
 391     }
 392
 393   r->set_content (buffer, sz);
 394 }
 395
 396 void
 397 file_cache_slot::evict ()
 398 {
 399   m_file_path = NULL;
 400   if (m_fp)
 401     fclose (m_fp);
 402   m_error = false;
 403   m_fp = NULL;
 404   m_nb_read = 0;
 405   m_line_start_idx = 0;
 406   m_line_num = 0;
 407   m_line_record.truncate (0);
 408   m_use_count = 0;
 409   m_total_lines = 0;
 410   m_missing_trailing_newline = true;
 411 }
 412
 413 /* Return the file cache that has been less used, recently, or the
 414    first empty one.  If HIGHEST_USE_COUNT is non-null,
 415    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 416    in the cache table.  */
 417
 418 file_cache_slot*
 419 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 420 {
 421   file_cache_slot *to_evict = &m_file_slots[0];
 422   unsigned huc = to_evict->get_use_count ();
 423   for (unsigned i = 1; i < num_file_slots; ++i)
 424     {
 425       file_cache_slot *c = &m_file_slots[i];
 426       bool c_is_empty = (c->get_file_path () == NULL);
 427
 428       if (c->get_use_count () < to_evict->get_use_count ()
 429           || (to_evict->get_file_path () && c_is_empty))
 430         /* We evict C because it's either an entry with a lower use
 431            count or one that is empty.  */
 432         to_evict = c;
 433
 434       if (huc < c->get_use_count ())
 435         huc = c->get_use_count ();
 436
 437       if (c_is_empty)
 438         /* We've reached the end of the cache; subsequent elements are
 439            all empty.  */
 440         break;
 441     }
 442
 443   if (highest_use_count)
 444     *highest_use_count = huc;
 445
 446   return to_evict;
 447 }
 448
 449 /* Create the cache used for the content of a given file to be
 450    accessed by caret diagnostic.  This cache is added to an array of
 451    cache and can be retrieved by lookup_file_in_cache_tab.  This
 452    function returns the created cache.  Note that only the last
 453    num_file_slots files are cached.
 454
 455    This can return nullptr if the FILE_PATH can't be opened for
 456    reading, or if the content can't be converted to the input_charset.  */
 457
 458 file_cache_slot*
 459 file_cache::add_file (const char *file_path)
 460 {
 461
 462   FILE *fp = fopen (file_path, "r");
 463   if (fp == NULL)
 464     return NULL;
 465
 466   unsigned highest_use_count = 0;
 467   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 468   if (!r->create (m_input_context, file_path, fp, highest_use_count))
 469     return NULL;
 470   return r;
 471 }
 472
 473 /* Get a borrowed char_span to the full content of this file
 474    as decoded according to the input charset, encoded as UTF-8.  */
 475
 476 char_span
 477 file_cache_slot::get_full_file_content ()
 478 {
 479   char *line;
 480   ssize_t line_len;
 481   while (get_next_line (&line, &line_len))
 482     {
 483     }
 484   return char_span (m_data, m_nb_read);
 485 }
 486
 487 /* Populate this slot for use on FILE_PATH and FP, dropping any
 488    existing cached content within it.  */
 489
 490 bool
 491 file_cache_slot::create (const file_cache::input_context &in_context,
 492                          const char *file_path, FILE *fp,
 493                          unsigned highest_use_count)
 494 {
 495   m_file_path = file_path;
 496   if (m_fp)
 497     fclose (m_fp);
 498   m_error = false;
 499   m_fp = fp;
 500   if (m_alloc_offset)
 501     offset_buffer (-m_alloc_offset);
 502   m_nb_read = 0;
 503   m_line_start_idx = 0;
 504   m_line_num = 0;
 505   m_line_record.truncate (0);
 506   /* Ensure that this cache entry doesn't get evicted next time
 507      add_file_to_cache_tab is called.  */
 508   m_use_count = ++highest_use_count;
 509   m_total_lines = total_lines_num (file_path);
 510   m_missing_trailing_newline = true;
 511
 512
 513   /* Check the input configuration to determine if we need to do any
 514      transformations, such as charset conversion or BOM skipping.  */
 515   if (const char *input_charset = in_context.ccb (file_path))
 516     {
 517       /* Need a full-blown conversion of the input charset.  */
 518       fclose (m_fp);
 519       m_fp = NULL;
 520       const cpp_converted_source cs
 521         = cpp_get_converted_source (file_path, input_charset);
 522       if (!cs.data)
 523         return false;
 524       if (m_data)
 525         XDELETEVEC (m_data);
 526       m_data = cs.data;
 527       m_nb_read = m_size = cs.len;
 528       m_alloc_offset = cs.data - cs.to_free;
 529     }
 530   else if (in_context.should_skip_bom)
 531     {
 532       if (read_data ())
 533         {
 534           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 535           offset_buffer (offset);
 536           m_nb_read -= offset;
 537         }
 538     }
 539
 540   return true;
 541 }
 542
 543 void
 544 file_cache_slot::set_content (const char *buf, size_t sz)
 545 {
 546   m_data = (char *)xmalloc (sz);
 547   memcpy (m_data, buf, sz);
 548   m_nb_read = m_size = sz;
 549   m_alloc_offset = 0;
 550
 551   if (m_fp)
 552     {
 553       fclose (m_fp);
 554       m_fp = nullptr;
 555     }
 556
 557   /* Compute m_total_lines based on content of buffer.  */
 558   m_total_lines = 0;
 559   const char *line_start = m_data;
 560   size_t remaining_size = sz;
 561   while (const char *line_end = find_end_of_line (line_start, remaining_size))
 562     {
 563       ++m_total_lines;
 564       remaining_size -= line_end + 1 - line_start;
 565       line_start = line_end + 1;
 566     }
 567 }
 568
 569 /* file_cache's ctor.  */
 570
file_cache::file_cache ()
: m_file_slots (new file_cache_slot[num_file_slots])
{
  /* Start with the default configuration: no charset callback supplied
     (so the default one is used) and no BOM skipping.  */
  initialize_input_context (nullptr, false);
}
 576
 577 /* file_cache's dtor.  */
 578
file_cache::~file_cache ()
{
  /* Releases the slot array allocated by the constructor; each slot's
     destructor cleans up its own stream and buffer.  */
  delete[] m_file_slots;
}
 583
 584 void
 585 file_cache::dump (FILE *out, int indent) const
 586 {
 587   for (size_t i = 0; i < num_file_slots; ++i)
 588     {
 589       fprintf (out, "%*sslot[%i]:\n", indent, "", (int)i);
 590       m_file_slots[i].dump (out, indent + 2);
 591     }
 592 }
 593
/* Dump the cache to stderr with no indentation.  */
void
file_cache::dump () const
{
  dump (stderr, 0);
}
 599
 600 /* Lookup the cache used for the content of a given file accessed by
 601    caret diagnostic.  If no cached file was found, create a new cache
 602    for this file, add it to the array of cached file and return
 603    it.
 604
 605    This can return nullptr on a cache miss if FILE_PATH can't be opened for
 606    reading, or if the content can't be converted to the input_charset.  */
 607
 608 file_cache_slot*
 609 file_cache::lookup_or_add_file (const char *file_path)
 610 {
 611   file_cache_slot *r = lookup_file (file_path);
 612   if (r == NULL)
 613     r = add_file (file_path);
 614   return r;
 615 }
 616
 617 /* Default constructor for a cache of file used by caret
 618    diagnostic.  */
 619
file_cache_slot::file_cache_slot ()
: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_error (false), m_data (0),
  m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
  m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
{
  /* The slot starts unused: no path, no stream, no buffer, and an
     empty line record.  */
  m_line_record.create (0);
}
 627
 628 /* Destructor for a cache of file used by caret diagnostic.  */
 629
 630 file_cache_slot::~file_cache_slot ()
 631 {
 632   if (m_fp)
 633     {
 634       fclose (m_fp);
 635       m_fp = NULL;
 636     }
 637   if (m_data)
 638     {
 639       offset_buffer (-m_alloc_offset);
 640       XDELETEVEC (m_data);
 641       m_data = 0;
 642     }
 643   m_line_record.release ();
 644 }
 645
/* Dump the state of this slot to OUT, one field per line, each line
   indented by INDENT columns.  An unused slot (no file path) is
   reported as "(unused)" only.  */
void
file_cache_slot::dump (FILE *out, int indent) const
{
  if (!m_file_path)
    {
      fprintf (out, "%*s(unused)\n", indent, "");
      return;
    }
  fprintf (out, "%*sfile_path: %s\n", indent, "", m_file_path);
  fprintf (out, "%*sfp: %p\n", indent, "", (void *)m_fp);
  fprintf (out, "%*sneeds_read_p: %i\n", indent, "", (int)needs_read_p ());
  fprintf (out, "%*sneeds_grow_p: %i\n", indent, "", (int)needs_grow_p ());
  fprintf (out, "%*suse_count: %i\n", indent, "", m_use_count);
  fprintf (out, "%*ssize: %zi\n", indent, "", m_size);
  fprintf (out, "%*snb_read: %zi\n", indent, "", m_nb_read);
  fprintf (out, "%*sstart_line_idx: %zi\n", indent, "", m_line_start_idx);
  fprintf (out, "%*sline_num: %zi\n", indent, "", m_line_num);
  fprintf (out, "%*stotal_lines: %zi\n", indent, "", m_total_lines);
  fprintf (out, "%*smissing_trailing_newline: %i\n",
           indent, "", (int)m_missing_trailing_newline);
  /* The recorded line boundaries follow, one per line, indented by two
     further columns.  */
  fprintf (out, "%*sline records (%i):\n",
           indent, "", m_line_record.length ());
  int idx = 0;
  for (auto &line : m_line_record)
    fprintf (out, "%*s[%i]: line %zi: byte offsets: %zi-%zi\n",
             indent + 2, "",
             idx++, line.line_num, line.start_pos, line.end_pos);
}
 674
 675 /* Returns TRUE iff the cache would need to be filled with data coming
 676    from the file.  That is, either the cache is empty or full or the
 677    current line is empty.  Note that if the cache is full, it would
 678    need to be extended and filled again.  */
 679
 680 bool
 681 file_cache_slot::needs_read_p () const
 682 {
 683   return m_fp && (m_nb_read == 0
 684           || m_nb_read == m_size
 685           || (m_line_start_idx >= m_nb_read - 1));
 686 }
 687
 688 /*  Return TRUE iff the cache is full and thus needs to be
 689     extended.  */
 690
bool
file_cache_slot::needs_grow_p () const
{
  /* Full means every byte of the buffer holds data already read; this
     is also true of a slot with no buffer at all (both values zero).  */
  return m_nb_read == m_size;
}
 696
 697 /* Grow the cache if it needs to be extended.  */
 698
 699 void
 700 file_cache_slot::maybe_grow ()
 701 {
 702   if (!needs_grow_p ())
 703     return;
 704
 705   if (!m_data)
 706     {
 707       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 708       m_size = buffer_size;
 709       m_data = XNEWVEC (char, m_size);
 710     }
 711   else
 712     {
 713       const int offset = m_alloc_offset;
 714       offset_buffer (-offset);
 715       m_size *= 2;
 716       m_data = XRESIZEVEC (char, m_data, m_size);
 717       offset_buffer (offset);
 718     }
 719 }
 720
 721 /*  Read more data into the cache.  Extends the cache if need be.
 722     Returns TRUE iff new data could be read.  */
 723
 724 bool
 725 file_cache_slot::read_data ()
 726 {
 727   if (feof (m_fp) || ferror (m_fp))
 728     return false;
 729
 730   maybe_grow ();
 731
 732   char * from = m_data + m_nb_read;
 733   size_t to_read = m_size - m_nb_read;
 734   size_t nb_read = fread (from, 1, to_read, m_fp);
 735
 736   if (ferror (m_fp))
 737     {
 738       m_error = true;
 739       return false;
 740     }
 741
 742   m_nb_read += nb_read;
 743   return !!nb_read;
 744 }
 745
/* Read new data iff the cache needs to be filled with more data
   coming from the file FP.  Return TRUE iff the cache was filled with
   more data.  */
 749
 750 bool
 751 file_cache_slot::maybe_read_data ()
 752 {
 753   if (!needs_read_p ())
 754     return false;
 755   return read_data ();
 756 }
 757
 758 /* Helper function for file_cache_slot::get_next_line (), to find the end of
 759    the next line.  Returns with the memchr convention, i.e. nullptr if a line
 760    terminator was not found.  We need to determine line endings in the same
 761    manner that libcpp does: any of \n, \r\n, or \r is a line ending.  */
 762
 763 static const char *
 764 find_end_of_line (const char *s, size_t len)
 765 {
 766   for (const auto end = s + len; s != end; ++s)
 767     {
 768       if (*s == '\n')
 769         return s;
 770       if (*s == '\r')
 771         {
 772           const auto next = s + 1;
 773           if (next == end)
 774             {
 775               /* Don't find the line ending if \r is the very last character
 776                  in the buffer; we do not know if it's the end of the file or
 777                  just the end of what has been read so far, and we wouldn't
 778                  want to break in the middle of what's actually a \r\n
 779                  sequence.  Instead, we will handle the case of a file ending
 780                  in a \r later.  */
 781               break;
 782             }
 783           return (*next == '\n' ? next : s);
 784         }
 785     }
 786   return nullptr;
 787 }
 788
 789 /* Read a new line from file FP, using C as a cache for the data
 790    coming from the file.  Upon successful completion, *LINE is set to
 791    the beginning of the line found.  *LINE points directly in the
 792    line cache and is only valid until the next call of get_next_line.
 793    *LINE_LEN is set to the length of the line.  Note that the line
 794    does not contain any terminal delimiter.  This function returns
 795    true if some data was read or process from the cache, false
 796    otherwise.  Note that subsequent calls to get_next_line might
 797    make the content of *LINE invalid.  */
 798
bool
file_cache_slot::get_next_line (char **line, ssize_t *line_len)
{
  /* Fill the cache with data to process.  */
  maybe_read_data ();

  size_t remaining_size = m_nb_read - m_line_start_idx;
  if (remaining_size == 0)
    /* There is no more data to process.  */
    return false;

  const char *line_start = m_data + m_line_start_idx;

  /* Stays NULL when the line runs to the end of the data without a
     terminator.  */
  const char *next_line_start = NULL;
  size_t len = 0;
  const char *line_end = find_end_of_line (line_start, remaining_size);
  if (line_end == NULL)
    {
      /* We haven't found an end-of-line delimiter in the cache.
         Fill the cache with more data from the file and look again.
         Reading may reallocate the buffer, so LINE_START is recomputed
         from m_data on every iteration.  */
      while (maybe_read_data ())
        {
          line_start = m_data + m_line_start_idx;
          remaining_size = m_nb_read - m_line_start_idx;
          line_end = find_end_of_line (line_start, remaining_size);
          if (line_end != NULL)
            {
              next_line_start = line_end + 1;
              break;
            }
        }
      if (line_end == NULL)
        {
          /* We've loaded all the file into the cache and still no
             terminator.  Let's say the line ends up at one byte past the
             end of the file.  This is to stay consistent with the case
             of when the line ends up with a terminator and line_end points to
             that.  That consistency is useful below in the len calculation.

             If the file ends in a \r, we didn't identify it as a line
             terminator above, so do that now instead.  */
          line_end = m_data + m_nb_read;
          if (m_nb_read && line_end[-1] == '\r')
            {
              --line_end;
              m_missing_trailing_newline = false;
            }
          else
            m_missing_trailing_newline = true;
        }
      else
        m_missing_trailing_newline = false;
    }
  else
    {
      next_line_start = line_end + 1;
      m_missing_trailing_newline = false;
    }

  /* A read error recorded by read_data makes the whole call fail.  */
  if (m_error)
    return false;

  /* At this point, we've found the end of the line.  It either points to
     the line terminator or to one byte after the last byte of the file.  */
  gcc_assert (line_end != NULL);

  len = line_end - line_start;

  if (m_line_start_idx < m_nb_read)
    *line = const_cast<char *> (line_start);

  ++m_line_num;

  /* Before we update our line record, make sure the hint about the
     total number of lines of the file is correct.  If it's not, then
     we give up recording line boundaries from now on.  */
  bool update_line_record = true;
  if (m_line_num > m_total_lines)
    update_line_record = false;

  /* Now update our line record so that re-reading lines from before
     m_line_start_idx is faster.  */
  if (update_line_record
      && m_line_record.length () < line_record_size)
    {
      /* If the file's lines fit in the line record, we just record all
         of them ...  */
      if (m_total_lines <= line_record_size
          && m_line_num > m_line_record.length ())
        m_line_record.safe_push
          (file_cache_slot::line_info (m_line_num,
                                       m_line_start_idx,
                                       line_end - m_data));
      else if (m_total_lines > line_record_size)
        {
          /* ... otherwise, we just scale total_lines down to
             line_record_size lines.  */
          size_t n = (m_line_num * line_record_size) / m_total_lines;
          if (m_line_record.length () == 0
              || n >= m_line_record.length ())
            m_line_record.safe_push
              (file_cache_slot::line_info (m_line_num,
                                           m_line_start_idx,
                                           line_end - m_data));
        }
    }

  /* Update m_line_start_idx so that it points to the next line to be
     read.  */
  if (next_line_start)
    m_line_start_idx = next_line_start - m_data;
  else
    /* We didn't find any line terminator.  Let's consider that the end
       of line is the end of the data in the cache.  The next
       invocation of get_next_line will either read more data from the
       underlying file or return false early because we've reached the
       end of the file.  */
    m_line_start_idx = m_nb_read;

  *line_len = len;

  return true;
}
 922
 923 /* Consume the next bytes coming from the cache (or from its
 924    underlying file if there are remaining unread bytes in the file)
 925    until we reach the next end-of-line (or end-of-file).  There is no
 926    copying from the cache involved.  Return TRUE upon successful
 927    completion.  */
 928
 929 bool
 930 file_cache_slot::goto_next_line ()
 931 {
 932   char *l;
 933   ssize_t len;
 934
 935   return get_next_line (&l, &len);
 936 }
 937
 938 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 939    If the line was read successfully, *LINE points to the beginning
 940    of the line in the file cache and *LINE_LEN is the length of the
 941    line.  *LINE is not nul-terminated, but may contain zero bytes.
 942    *LINE is only valid until the next call of read_line_num.
 943    This function returns bool if a line was read.  */
 944
 945 bool
 946 file_cache_slot::read_line_num (size_t line_num,
 947                        char ** line, ssize_t *line_len)
 948 {
 949   gcc_assert (line_num > 0);
 950
 951   if (line_num <= m_line_num)
 952     {
 953       /* We've been asked to read lines that are before m_line_num.
 954          So lets use our line record (if it's not empty) to try to
 955          avoid re-reading the file from the beginning again.  */
 956
 957       if (m_line_record.is_empty ())
 958         {
 959           m_line_start_idx = 0;
 960           m_line_num = 0;
 961         }
 962       else
 963         {
 964           file_cache_slot::line_info *i = NULL;
 965           if (m_total_lines <= line_record_size)
 966             {
 967               /* In languages where the input file is not totally
 968                  preprocessed up front, the m_total_lines hint
 969                  can be smaller than the number of lines of the
 970                  file.  In that case, only the first
 971                  m_total_lines have been recorded.
 972
 973                  Otherwise, the first m_total_lines we've read have
 974                  their start/end recorded here.  */
 975               i = (line_num <= m_total_lines)
 976                 ? &m_line_record[line_num - 1]
 977                 : &m_line_record[m_total_lines - 1];
 978               gcc_assert (i->line_num <= line_num);
 979             }
 980           else
 981             {
 982               /*  So the file had more lines than our line record
 983                   size.  Thus the number of lines we've recorded has
 984                   been scaled down to line_record_size.  Let's
 985                   pick the start/end of the recorded line that is
 986                   closest to line_num.  */
 987               size_t n = (line_num <= m_total_lines)
 988                 ? line_num * line_record_size / m_total_lines
 989                 : m_line_record.length () - 1;
 990               if (n < m_line_record.length ())
 991                 {
 992                   i = &m_line_record[n];
 993                   gcc_assert (i->line_num <= line_num);
 994                 }
 995             }
 996
 997           if (i && i->line_num == line_num)
 998             {
 999               /* We have the start/end of the line.  */
1000               *line = m_data + i->start_pos;
1001               *line_len = i->end_pos - i->start_pos;
1002               return true;
1003             }
1004
1005           if (i)
1006             {
1007               m_line_start_idx = i->start_pos;
1008               m_line_num = i->line_num - 1;
1009             }
1010           else
1011             {
1012               m_line_start_idx = 0;
1013               m_line_num = 0;
1014             }
1015         }
1016     }
1017
1018   /*  Let's walk from line m_line_num up to line_num - 1, without
1019       copying any line.  */
1020   while (m_line_num < line_num - 1)
1021     if (!goto_next_line ())
1022       return false;
1023
1024   /* The line we want is the next one.  Let's read and copy it back to
1025      the caller.  */
1026   return get_next_line (line, line_len);
1027 }
1028
1029 /* Return the physical source line that corresponds to FILE_PATH/LINE.
1030    The line is not nul-terminated.  The returned pointer is only
1031    valid until the next call of location_get_source_line.
1032    Note that the line can contain several null characters,
1033    so the returned value's length has the actual length of the line.
1034    If the function fails, a NULL char_span is returned.  */
1035
1036 char_span
1037 file_cache::get_source_line (const char *file_path, int line)
1038 {
1039   char *buffer = NULL;
1040   ssize_t len;
1041
1042   if (line == 0)
1043     return char_span (NULL, 0);
1044
1045   if (file_path == NULL)
1046     return char_span (NULL, 0);
1047
1048   file_cache_slot *c = lookup_or_add_file (file_path);
1049   if (c == NULL)
1050     return char_span (NULL, 0);
1051
1052   bool read = c->read_line_num (line, &buffer, &len);
1053   if (!read)
1054     return char_span (NULL, 0);
1055
1056   return char_span (buffer, len);
1057 }
1058
1059 /* Return a NUL-terminated copy of the source text between two locations, or
1060    NULL if the arguments are invalid.  The caller is responsible for freeing
1061    the return value.  */
1062
1063 char *
1064 get_source_text_between (file_cache &fc, location_t start, location_t end)
1065 {
1066   expanded_location expstart =
1067     expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
1068   expanded_location expend =
1069     expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
1070
1071   /* If the locations are in different files or the end comes before the
1072      start, give up and return nothing.  */
1073   if (!expstart.file || !expend.file)
1074     return NULL;
1075   if (strcmp (expstart.file, expend.file) != 0)
1076     return NULL;
1077   if (expstart.line > expend.line)
1078     return NULL;
1079   if (expstart.line == expend.line
1080       && expstart.column > expend.column)
1081     return NULL;
1082   /* These aren't real column numbers, give up.  */
1083   if (expstart.column == 0 || expend.column == 0)
1084     return NULL;
1085
1086   /* For a single line we need to trim both edges.  */
1087   if (expstart.line == expend.line)
1088     {
1089       char_span line = fc.get_source_line (expstart.file, expstart.line);
1090       if (line.length () < 1)
1091         return NULL;
1092       int s = expstart.column - 1;
1093       int len = expend.column - s;
1094       if (line.length () < (size_t)expend.column)
1095         return NULL;
1096       return line.subspan (s, len).xstrdup ();
1097     }
1098
1099   struct obstack buf_obstack;
1100   obstack_init (&buf_obstack);
1101
1102   /* Loop through all lines in the range and append each to buf; may trim
1103      parts of the start and end lines off depending on column values.  */
1104   for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1105     {
1106       char_span line = fc.get_source_line (expstart.file, lnum);
1107       if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1108         continue;
1109
1110       /* For the first line in the range, only start at expstart.column */
1111       if (lnum == expstart.line)
1112         {
1113           unsigned off = expstart.column - 1;
1114           if (line.length () < off)
1115             return NULL;
1116           line = line.subspan (off, line.length() - off);
1117         }
1118       /* For the last line, don't go past expend.column */
1119       else if (lnum == expend.line)
1120         {
1121           if (line.length () < (size_t)expend.column)
1122             return NULL;
1123           line = line.subspan (0, expend.column);
1124         }
1125
1126       /* Combine spaces at the beginning of later lines.  */
1127       if (lnum > expstart.line)
1128         {
1129           unsigned off;
1130           for (off = 0; off < line.length(); ++off)
1131             if (line[off] != ' ' && line[off] != '\t')
1132               break;
1133           if (off > 0)
1134             {
1135               obstack_1grow (&buf_obstack, ' ');
1136               line = line.subspan (off, line.length() - off);
1137             }
1138         }
1139
1140       /* This does not include any trailing newlines.  */
1141       obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1142     }
1143
1144   /* NUL-terminate and finish the buf obstack.  */
1145   obstack_1grow (&buf_obstack, 0);
1146   const char *buf = (const char *) obstack_finish (&buf_obstack);
1147
1148   return xstrdup (buf);
1149 }
1150
1151
1152 char_span
1153 file_cache::get_source_file_content (const char *file_path)
1154 {
1155   file_cache_slot *c = lookup_or_add_file (file_path);
1156   if (c == nullptr)
1157     return char_span (nullptr, 0);
1158   return c->get_full_file_content ();
1159 }
1160
1161 /* Test if the location originates from the spelling location of a
1162    builtin-tokens.  That is, return TRUE if LOC is a (possibly
1163    virtual) location of a built-in token that appears in the expansion
1164    list of a macro.  Please note that this function also works on
1165    tokens that result from built-in tokens.  For instance, the
1166    function would return true if passed a token "4" that is the result
1167    of the expansion of the built-in __LINE__ macro.  */
1168 bool
1169 is_location_from_builtin_token (location_t loc)
1170 {
1171   const line_map_ordinary *map = NULL;
1172   loc = linemap_resolve_location (line_table, loc,
1173                                   LRK_SPELLING_LOCATION, &map);
1174   return loc == BUILTINS_LOCATION;
1175 }
1176
1177 /* Expand the source location LOC into a human readable location.  If
1178    LOC is virtual, it resolves to the expansion point of the involved
1179    macro.  If LOC resolves to a builtin location, the file name of the
1180    readable location is set to the string "<built-in>".  */
1181
1182 expanded_location
1183 expand_location (location_t loc)
1184 {
1185   return expand_location_1 (line_table, loc, /*expansion_point_p=*/true,
1186                             LOCATION_ASPECT_CARET);
1187 }
1188
1189 /* Expand the source location LOC into a human readable location.  If
1190    LOC is virtual, it resolves to the expansion location of the
1191    relevant macro.  If LOC resolves to a builtin location, the file
1192    name of the readable location is set to the string
1193    "<built-in>".  */
1194
1195 expanded_location
1196 expand_location_to_spelling_point (location_t loc,
1197                                    enum location_aspect aspect)
1198 {
1199   return expand_location_1 (line_table, loc, /*expansion_point_p=*/false,
1200                             aspect);
1201 }
1202
1203 /* The rich_location class within libcpp requires a way to expand
1204    location_t instances, and relies on the client code
1205    providing a symbol named
1206      linemap_client_expand_location_to_spelling_point
1207    to do this.
1208
1209    This is the implementation for libcommon.a (all host binaries),
1210    which simply calls into expand_location_1.  */
1211
1212 expanded_location
1213 linemap_client_expand_location_to_spelling_point (const line_maps *set,
1214                                                   location_t loc,
1215                                                   enum location_aspect aspect)
1216 {
1217   return expand_location_1 (set, loc, /*expansion_point_p=*/false, aspect);
1218 }
1219
1220
1221 /* If LOCATION is in a system header and if it is a virtual location
1222    for a token coming from the expansion of a macro, unwind it to
1223    the location of the expansion point of the macro.  If the expansion
1224    point is also in a system header return the original LOCATION.
1225    Otherwise, return the location of the expansion point.
1226
1227    This is used for instance when we want to emit diagnostics about a
1228    token that may be located in a macro that is itself defined in a
1229    system header, for example, for the NULL macro.  In such a case, if
1230    LOCATION were passed directly to diagnostic functions such as
1231    warning_at, the diagnostic would be suppressed (unless
1232    -Wsystem-headers).  */
1233
1234 location_t
1235 expansion_point_location_if_in_system_header (location_t location)
1236 {
1237   if (!in_system_header_at (location))
1238     return location;
1239
1240   location_t xloc = linemap_resolve_location (line_table, location,
1241                                               LRK_MACRO_EXPANSION_POINT,
1242                                               NULL);
1243   return in_system_header_at (xloc) ? location : xloc;
1244 }
1245
1246 /* If LOCATION is a virtual location for a token coming from the expansion
1247    of a macro, unwind to the location of the expansion point of the macro.  */
1248
1249 location_t
1250 expansion_point_location (location_t location)
1251 {
1252   return linemap_resolve_location (line_table, location,
1253                                    LRK_MACRO_EXPANSION_POINT, NULL);
1254 }
1255
1256 /* Construct a location with caret at CARET, ranging from START to
1257    FINISH.
1258
1259    For example, consider:
1260
1261                  11111111112
1262         12345678901234567890
1263      522
1264      523   return foo + bar;
1265                   ~~~~^~~~~
1266      524
1267
1268    The location's caret is at the "+", line 523 column 15, but starts
1269    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1270    of "bar" at column 19.  */
1271
1272 location_t
1273 make_location (location_t caret, location_t start, location_t finish)
1274 {
1275   return line_table->make_location (caret, start, finish);
1276 }
1277
1278 /* Same as above, but taking a source range rather than two locations.  */
1279
1280 location_t
1281 make_location (location_t caret, source_range src_range)
1282 {
1283   location_t pure_loc = get_pure_location (caret);
1284   return line_table->get_or_create_combined_loc (pure_loc, src_range,
1285                                                  nullptr, 0);
1286 }
1287
1288 /* An expanded_location stores the column in byte units.  This function
1289    converts that column to display units.  That requires reading the associated
1290    source line in order to calculate the display width.  If that cannot be done
1291    for any reason, then returns the byte column as a fallback.  */
1292 int
1293 location_compute_display_column (file_cache &fc,
1294                                  expanded_location exploc,
1295                                  const cpp_char_column_policy &policy)
1296 {
1297   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1298     return exploc.column;
1299   char_span line = fc.get_source_line (exploc.file, exploc.line);
1300   /* If line is NULL, this function returns exploc.column which is the
1301      desired fallback.  */
1302   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1303                                             exploc.column, policy);
1304 }
1305
1306 /* Dump statistics to stderr about the memory usage of the line_table
1307    set of line maps.  This also displays some statistics about macro
1308    expansion.  */
1309
1310 void
1311 dump_line_table_statistics (void)
1312 {
1313   struct linemap_stats s;
1314   long total_used_map_size,
1315     macro_maps_size,
1316     total_allocated_map_size;
1317
1318   memset (&s, 0, sizeof (s));
1319
1320   linemap_get_statistics (line_table, &s);
1321
1322   macro_maps_size = s.macro_maps_used_size
1323     + s.macro_maps_locations_size;
1324
1325   total_allocated_map_size = s.ordinary_maps_allocated_size
1326     + s.macro_maps_allocated_size
1327     + s.macro_maps_locations_size;
1328
1329   total_used_map_size = s.ordinary_maps_used_size
1330     + s.macro_maps_used_size
1331     + s.macro_maps_locations_size;
1332
1333   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1334            s.num_expanded_macros);
1335   if (s.num_expanded_macros != 0)
1336     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1337              s.num_macro_tokens / s.num_expanded_macros);
1338   fprintf (stderr,
1339            "\nLine Table allocations during the "
1340            "compilation process\n");
1341   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1342            SIZE_AMOUNT (s.num_ordinary_maps_used));
1343   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1344            SIZE_AMOUNT (s.ordinary_maps_used_size));
1345   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1346            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1347   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1348            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1349   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1350            SIZE_AMOUNT (s.num_macro_maps_used));
1351   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1352            SIZE_AMOUNT (s.macro_maps_used_size));
1353   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1354            SIZE_AMOUNT (s.macro_maps_locations_size));
1355   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1356            SIZE_AMOUNT (macro_maps_size));
1357   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1358            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1359   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1360            SIZE_AMOUNT (total_allocated_map_size));
1361   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1362            SIZE_AMOUNT (total_used_map_size));
1363   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1364            SIZE_AMOUNT (s.adhoc_table_size));
1365   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1366            SIZE_AMOUNT (s.adhoc_table_entries_used));
1367   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1368            SIZE_AMOUNT (line_table->m_num_optimized_ranges));
1369   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1370            SIZE_AMOUNT (line_table->m_num_unoptimized_ranges));
1371
1372   fprintf (stderr, "\n");
1373 }
1374
1375 /* Get location one beyond the final location in ordinary map IDX.  */
1376
1377 static location_t
1378 get_end_location (class line_maps *set, line_map_uint_t idx)
1379 {
1380   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1381     return set->highest_location;
1382
1383   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1384   return MAP_START_LOCATION (next_map);
1385 }
1386
/* Helper function for write_digit_row.
   Write to STREAM the character for DIGIT modulo 10.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1394
1395 /* Helper function for dump_location_info.
1396    Write a row of numbers to STREAM, numbering a source line,
1397    giving the units, tens, hundreds etc of the column number.  */
1398
1399 static void
1400 write_digit_row (FILE *stream, int indent,
1401                  const line_map_ordinary *map,
1402                  location_t loc, int max_col, int divisor)
1403 {
1404   fprintf (stream, "%*c", indent, ' ');
1405   fprintf (stream, "|");
1406   for (int column = 1; column < max_col; column++)
1407     {
1408       location_t column_loc = loc + (location_t (column) << map->m_range_bits);
1409       write_digit (stream, column_loc / divisor);
1410     }
1411   fprintf (stream, "\n");
1412 }
1413
1414 /* Write a half-closed (START) / half-open (END) interval of
1415    location_t to STREAM.  */
1416
1417 static void
1418 dump_location_range (FILE *stream,
1419                      location_t start, location_t end)
1420 {
1421   fprintf (stream,
1422            "  location_t interval: %llu <= loc < %llu\n",
1423            (unsigned long long) start, (unsigned long long) end);
1424 }
1425
1426 /* Write a labelled description of a half-closed (START) / half-open (END)
1427    interval of location_t to STREAM.  */
1428
1429 static void
1430 dump_labelled_location_range (FILE *stream,
1431                               const char *name,
1432                               location_t start, location_t end)
1433 {
1434   fprintf (stream, "%s\n", name);
1435   dump_location_range (stream, start, end);
1436   fprintf (stream, "\n");
1437 }
1438
1439 /* Write a visualization of the locations in the line_table to STREAM.  */
1440
1441 void
1442 dump_location_info (FILE *stream)
1443 {
1444   file_cache fc;
1445
1446   /* Visualize the reserved locations.  */
1447   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1448                                 0, RESERVED_LOCATION_COUNT);
1449
1450   using ULL = unsigned long long;
1451
1452   /* Visualize the ordinary line_map instances, rendering the sources. */
1453   for (line_map_uint_t idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table);
1454        idx++)
1455     {
1456       location_t end_location = get_end_location (line_table, idx);
1457       /* half-closed: doesn't include this one. */
1458
1459       const line_map_ordinary *map
1460         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1461       fprintf (stream, "ORDINARY MAP: %llu\n", (ULL) idx);
1462       dump_location_range (stream,
1463                            MAP_START_LOCATION (map), end_location);
1464       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1465       fprintf (stream, "  starting at line: %i\n",
1466                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1467       fprintf (stream, "  column and range bits: %i\n",
1468                map->m_column_and_range_bits);
1469       fprintf (stream, "  column bits: %i\n",
1470                map->m_column_and_range_bits - map->m_range_bits);
1471       fprintf (stream, "  range bits: %i\n",
1472                map->m_range_bits);
1473       const char * reason;
1474       switch (map->reason) {
1475       case LC_ENTER:
1476         reason = "LC_ENTER";
1477         break;
1478       case LC_LEAVE:
1479         reason = "LC_LEAVE";
1480         break;
1481       case LC_RENAME:
1482         reason = "LC_RENAME";
1483         break;
1484       case LC_RENAME_VERBATIM:
1485         reason = "LC_RENAME_VERBATIM";
1486         break;
1487       case LC_ENTER_MACRO:
1488         reason = "LC_RENAME_MACRO";
1489         break;
1490       default:
1491         reason = "Unknown";
1492       }
1493       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1494
1495       const line_map_ordinary *includer_map
1496         = linemap_included_from_linemap (line_table, map);
1497       fprintf (stream, "  included from location: %llu",
1498                (ULL) linemap_included_from (map));
1499       if (includer_map) {
1500         fprintf (stream, " (in ordinary map %llu)",
1501                  ULL (includer_map - line_table->info_ordinary.maps));
1502       }
1503       fprintf (stream, "\n");
1504
1505       /* Render the span of source lines that this "map" covers.  */
1506       for (location_t loc = MAP_START_LOCATION (map);
1507            loc < end_location;
1508            loc += (location_t (1) << map->m_range_bits))
1509         {
1510           gcc_assert (pure_location_p (line_table, loc) );
1511
1512           expanded_location exploc
1513             = linemap_expand_location (line_table, map, loc);
1514
1515           if (exploc.column == 0)
1516             {
1517               /* Beginning of a new source line: draw the line.  */
1518
1519               char_span line_text = fc.get_source_line (exploc.file,
1520                                                         exploc.line);
1521               if (!line_text)
1522                 break;
1523               fprintf (stream,
1524                        "%s:%3i|loc:%5llu|%.*s\n",
1525                        exploc.file, exploc.line,
1526                        (ULL) loc,
1527                        (int)line_text.length (), line_text.get_buffer ());
1528
1529               /* "loc" is at column 0, which means "the whole line".
1530                  Render the locations *within* the line, by underlining
1531                  it, showing the location_t numeric values
1532                  at each column.  */
1533               auto max_col = (ULL (1) << map->m_column_and_range_bits) - 1;
1534               if (max_col > line_text.length ())
1535                 max_col = line_text.length () + 1;
1536
1537               int len_lnum = num_digits (exploc.line);
1538               if (len_lnum < 3)
1539                 len_lnum = 3;
1540               int len_loc = num_digits (loc);
1541               if (len_loc < 5)
1542                 len_loc = 5;
1543
1544               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1545
1546               /* Thousands.  */
1547               if (end_location > 999)
1548                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1549
1550               /* Hundreds.  */
1551               if (end_location > 99)
1552                 write_digit_row (stream, indent, map, loc, max_col, 100);
1553
1554               /* Tens.  */
1555               write_digit_row (stream, indent, map, loc, max_col, 10);
1556
1557               /* Units.  */
1558               write_digit_row (stream, indent, map, loc, max_col, 1);
1559             }
1560         }
1561       fprintf (stream, "\n");
1562     }
1563
1564   /* Visualize unallocated values.  */
1565   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1566                                 line_table->highest_location,
1567                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1568
1569   /* Visualize the macro line_map instances, rendering the sources. */
1570   for (line_map_uint_t i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1571     {
1572       /* Each macro map that is allocated owns location_t values
1573          that are *lower* that the one before them.
1574          Hence it's meaningful to view them either in order of ascending
1575          source locations, or in order of ascending macro map index.  */
1576       const bool ascending_location_ts = true;
1577       auto idx = (ascending_location_ts
1578                   ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1579                   : i);
1580       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1581       fprintf (stream, "MACRO %llu: %s (%u tokens)\n",
1582                (ULL) idx,
1583                linemap_map_get_macro_name (map),
1584                MACRO_MAP_NUM_MACRO_TOKENS (map));
1585       dump_location_range (stream,
1586                            map->start_location,
1587                            (map->start_location
1588                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1589       inform (map->get_expansion_point_location (),
1590               "expansion point is location %llu",
1591               (ULL) map->get_expansion_point_location ());
1592       fprintf (stream, "  map->start_location: %llu\n",
1593                (ULL) map->start_location);
1594
1595       fprintf (stream, "  macro_locations:\n");
1596       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1597         {
1598           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1599           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1600
1601           /* linemap_add_macro_token encodes token numbers in an expansion
1602              by putting them after MAP_START_LOCATION. */
1603
1604           /* I'm typically seeing 4 uninitialized entries at the end of
1605              0xafafafaf.
1606              This appears to be due to macro.cc:replace_args
1607              adding 2 extra args for padding tokens; presumably there may
1608              be a leading and/or trailing padding token injected,
1609              each for 2 more location slots.
1610              This would explain there being up to 4 location_ts slots
1611              that may be uninitialized.  */
1612
1613           fprintf (stream, "    %u: %llu, %llu\n",
1614                    i,
1615                    (ULL) x,
1616                    (ULL) y);
1617           if (x == y)
1618             {
1619               if (x < MAP_START_LOCATION (map))
1620                 inform (x, "token %u has %<x-location == y-location == %llu%>",
1621                         i, (ULL) x);
1622               else
1623                 fprintf (stream,
1624                          "x-location == y-location == %llu"
1625                          " encodes token # %u\n",
1626                          (ULL) x,
1627                          (unsigned int)(x - MAP_START_LOCATION (map)));
1628             }
1629           else
1630             {
1631               inform (x, "token %u has %<x-location == %llu%>", i, (ULL) x);
1632               inform (x, "token %u has %<y-location == %llu%>", i, (ULL) y);
1633             }
1634         }
1635       fprintf (stream, "\n");
1636     }
1637
1638   /* It appears that MAX_LOCATION_T itself is never assigned to a
1639      macro map, presumably due to an off-by-one error somewhere
1640      between the logic in linemap_enter_macro and
1641      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1642   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1643                                 MAX_LOCATION_T,
1644                                 MAX_LOCATION_T + 1);
1645
1646   /* Visualize ad-hoc values.  */
1647   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1648                                 MAX_LOCATION_T + 1, location_t (-1));
1649 }
1650
1651 /* string_concat's constructor.  */
1652
1653 string_concat::string_concat (int num, location_t *locs)
1654   : m_num (num)
1655 {
1656   m_locs = ggc_vec_alloc <location_t> (num);
1657   for (int i = 0; i < num; i++)
1658     m_locs[i] = locs[i];
1659 }
1660
1661 /* string_concat_db's constructor.  */
1662
1663 string_concat_db::string_concat_db ()
1664 {
1665   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1666 }
1667
1668 /* Record that a string concatenation occurred, covering NUM
1669    string literal tokens.  LOCS is an array of size NUM, containing the
1670    locations of the tokens.  A copy of LOCS is taken.  */
1671
1672 void
1673 string_concat_db::record_string_concatenation (int num, location_t *locs)
1674 {
1675   gcc_assert (num > 1);
1676   gcc_assert (locs);
1677
1678   location_t key_loc = get_key_loc (locs[0]);
1679   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1680      any data now recorded under key 'key_loc' would be overwritten by a
1681      subsequent call with the same key 'key_loc'.  */
1682   if (RESERVED_LOCATION_P (key_loc))
1683     return;
1684
1685   string_concat *concat
1686     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1687   m_table->put (key_loc, concat);
1688 }
1689
1690 /* Determine if LOC was the location of the initial token of a
1691    concatenation of string literal tokens.
1692    If so, *OUT_NUM is written to with the number of tokens, and
1693    *OUT_LOCS with the location of an array of locations of the
1694    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1695    storage owned by the string_concat_db.
1696    Otherwise, return false.  */
1697
1698 bool
1699 string_concat_db::get_string_concatenation (location_t loc,
1700                                             int *out_num,
1701                                             location_t **out_locs)
1702 {
1703   gcc_assert (out_num);
1704   gcc_assert (out_locs);
1705
1706   location_t key_loc = get_key_loc (loc);
1707   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1708      discussion in 'string_concat_db::record_string_concatenation'.  */
1709   if (RESERVED_LOCATION_P (key_loc))
1710     return false;
1711
1712   string_concat **concat = m_table->get (key_loc);
1713   if (!concat)
1714     return false;
1715
1716   *out_num = (*concat)->m_num;
1717   *out_locs =(*concat)->m_locs;
1718   return true;
1719 }
1720
1721 /* Internal function.  Canonicalize LOC into a form suitable for
1722    use as a key within the database, stripping away macro expansion,
1723    ad-hoc information, and range information, using the location of
1724    the start of LOC within an ordinary linemap.  */
1725
1726 location_t
1727 string_concat_db::get_key_loc (location_t loc)
1728 {
1729   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1730                                   NULL);
1731
1732   loc = get_range_from_loc (line_table, loc).m_start;
1733
1734   return loc;
1735 }
1736
1737 /* Helper class for use within get_substring_ranges_for_loc.
1738    An vec of cpp_string with responsibility for releasing all of the
1739    str->text for each str in the vector.  */
1740
1741 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1742 {
1743  public:
1744   auto_cpp_string_vec (int alloc)
1745     : auto_vec <cpp_string> (alloc) {}
1746
1747   ~auto_cpp_string_vec ()
1748   {
1749     /* Clean up the copies within this vec.  */
1750     int i;
1751     cpp_string *str;
1752     FOR_EACH_VEC_ELT (*this, i, str)
1753       free (const_cast <unsigned char *> (str->text));
1754   }
1755 };
1756
1757 /* Attempt to populate RANGES with source location information on the
1758    individual characters within the string literal found at STRLOC.
1759    If CONCATS is non-NULL, then any string literals that the token at
1760    STRLOC  was concatenated with are also added to RANGES.
1761
1762    Return NULL if successful, or an error message if any errors occurred (in
1763    which case RANGES may be only partially populated and should not
1764    be used).
1765
1766    This is implemented by re-parsing the relevant source line(s).  */
1767
1768 static const char *
1769 get_substring_ranges_for_loc (cpp_reader *pfile,
1770                               file_cache &fc,
1771                               string_concat_db *concats,
1772                               location_t strloc,
1773                               enum cpp_ttype type,
1774                               cpp_substring_ranges &ranges)
1775 {
1776   gcc_assert (pfile);
1777
1778   if (strloc == UNKNOWN_LOCATION)
1779     return "unknown location";
1780
1781   /* Reparsing the strings requires accurate location information.
1782      If -ftrack-macro-expansion has been overridden from its default
1783      of 2, then we might have a location of a macro expansion point,
1784      rather than the location of the literal itself.
1785      Avoid this by requiring that we have full macro expansion tracking
1786      for substring locations to be available.  */
1787   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1788     return "track_macro_expansion != 2";
1789
1790   /* If #line or # 44 "file"-style directives are present, then there's
1791      no guarantee that the line numbers we have can be used to locate
1792      the strings.  For example, we might have a .i file with # directives
1793      pointing back to lines within a .c file, but the .c file might
1794      have been edited since the .i file was created.
1795      In such a case, the safest course is to disable on-demand substring
1796      locations.  */
1797   if (line_table->seen_line_directive)
1798     return "seen line directive";
1799
1800   /* If string concatenation has occurred at STRLOC, get the locations
1801      of all of the literal tokens making up the compound string.
1802      Otherwise, just use STRLOC.  */
1803   int num_locs = 1;
1804   location_t *strlocs = &strloc;
1805   if (concats)
1806     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1807
1808   auto_cpp_string_vec strs (num_locs);
1809   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1810   for (int i = 0; i < num_locs; i++)
1811     {
1812       /* Get range of strloc.  We will use it to locate the start and finish
1813          of the literal token within the line.  */
1814       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1815
1816       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1817         {
1818           /* If the string token was within a macro expansion, then we can
1819              cope with it for the simple case where we have a single token.
1820              Otherwise, bail out.  */
1821           if (src_range.m_start != src_range.m_finish)
1822             return "macro expansion";
1823         }
1824       else
1825         {
1826           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1827             /* If so, we can't reliably determine where the token started within
1828                its line.  */
1829             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1830
1831           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1832             /* If so, we can't reliably determine where the token finished
1833                within its line.  */
1834             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1835         }
1836
1837       expanded_location start
1838         = expand_location_to_spelling_point (src_range.m_start,
1839                                              LOCATION_ASPECT_START);
1840       expanded_location finish
1841         = expand_location_to_spelling_point (src_range.m_finish,
1842                                              LOCATION_ASPECT_FINISH);
1843       if (start.file != finish.file)
1844         return "range endpoints are in different files";
1845       if (start.line != finish.line)
1846         return "range endpoints are on different lines";
1847       if (start.column > finish.column)
1848         return "range endpoints are reversed";
1849
1850       char_span line = fc.get_source_line (start.file, start.line);
1851       if (!line)
1852         return "unable to read source line";
1853
1854       /* Determine the location of the literal (including quotes
1855          and leading prefix chars, such as the 'u' in a u""
1856          token).  */
1857       size_t literal_length = finish.column - start.column + 1;
1858
1859       /* Ensure that we don't crash if we got the wrong location.  */
1860       if (start.column < 1)
1861         return "zero start column";
1862       if (line.length () < (start.column - 1 + literal_length))
1863         return "line is not wide enough";
1864
1865       char_span literal = line.subspan (start.column - 1, literal_length);
1866
1867       cpp_string from;
1868       from.len = literal_length;
1869       /* Make a copy of the literal, to avoid having to rely on
1870          the lifetime of the copy of the line within the cache.
1871          This will be released by the auto_cpp_string_vec dtor.  */
1872       from.text = (unsigned char *)literal.xstrdup ();
1873       strs.safe_push (from);
1874
1875       /* For very long lines, a new linemap could have started
1876          halfway through the token.
1877          Ensure that the loc_reader uses the linemap of the
1878          *end* of the token for its start location.  */
1879       const line_map_ordinary *start_ord_map;
1880       linemap_resolve_location (line_table, src_range.m_start,
1881                                 LRK_SPELLING_LOCATION, &start_ord_map);
1882       const line_map_ordinary *final_ord_map;
1883       linemap_resolve_location (line_table, src_range.m_finish,
1884                                 LRK_SPELLING_LOCATION, &final_ord_map);
1885       if (start_ord_map == NULL || final_ord_map == NULL)
1886         return "failed to get ordinary maps";
1887       /* Bulletproofing.  We ought to only have different ordinary maps
1888          for start vs finish due to line-length jumps.  */
1889       if (start_ord_map != final_ord_map
1890           && start_ord_map->to_file != final_ord_map->to_file)
1891         return "start and finish are spelled in different ordinary maps";
1892       /* The file from linemap_resolve_location ought to match that from
1893          expand_location_to_spelling_point.  */
1894       if (start_ord_map->to_file != start.file)
1895         return "mismatching file after resolving linemap";
1896
1897       location_t start_loc
1898         = linemap_position_for_line_and_column (line_table, final_ord_map,
1899                                                 start.line, start.column);
1900
1901       cpp_string_location_reader loc_reader (start_loc, line_table);
1902       loc_readers.safe_push (loc_reader);
1903     }
1904
1905   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1906   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1907                                                  loc_readers.address (),
1908                                                  num_locs, &ranges, type);
1909   if (err)
1910     return err;
1911
1912   /* Success: "ranges" should now contain information on the string.  */
1913   return NULL;
1914 }
1915
1916 /* Attempt to populate *OUT_LOC with source location information on the
1917    given characters within the string literal found at STRLOC.
1918    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1919    character set.
1920
1921    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1922    and string literal "012345\n789"
1923    *OUT_LOC is written to with:
1924      "012345\n789"
1925          ~^~~~~
1926
1927    If CONCATS is non-NULL, then any string literals that the token at
1928    STRLOC was concatenated with are also considered.
1929
1930    This is implemented by re-parsing the relevant source line(s).
1931
1932    Return NULL if successful, or an error message if any errors occurred.
1933    Error messages are intended for GCC developers (to help debugging) rather
1934    than for end-users.  */
1935
1936 const char *
1937 get_location_within_string (cpp_reader *pfile,
1938                             file_cache &fc,
1939                             string_concat_db *concats,
1940                             location_t strloc,
1941                             enum cpp_ttype type,
1942                             int caret_idx, int start_idx, int end_idx,
1943                             location_t *out_loc)
1944 {
1945   gcc_checking_assert (caret_idx >= 0);
1946   gcc_checking_assert (start_idx >= 0);
1947   gcc_checking_assert (end_idx >= 0);
1948   gcc_assert (out_loc);
1949
1950   cpp_substring_ranges ranges;
1951   const char *err
1952     = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
1953   if (err)
1954     return err;
1955
1956   if (caret_idx >= ranges.get_num_ranges ())
1957     return "caret_idx out of range";
1958   if (start_idx >= ranges.get_num_ranges ())
1959     return "start_idx out of range";
1960   if (end_idx >= ranges.get_num_ranges ())
1961     return "end_idx out of range";
1962
1963   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1964                             ranges.get_range (start_idx).m_start,
1965                             ranges.get_range (end_idx).m_finish);
1966   return NULL;
1967 }
1968
1969 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1970
1971 location_t
1972 location_with_discriminator (location_t locus, int discriminator)
1973 {
1974   tree block = LOCATION_BLOCK (locus);
1975   source_range src_range = get_range_from_loc (line_table, locus);
1976   locus = get_pure_location (locus);
1977
1978   if (locus == UNKNOWN_LOCATION)
1979     return locus;
1980
1981   return line_table->get_or_create_combined_loc (locus, src_range, block,
1982                                                  discriminator);
1983 }
1984
1985 /* Return TRUE if LOCUS represents a location with a discriminator.  */
1986
1987 bool
1988 has_discriminator (location_t locus)
1989 {
1990   return get_discriminator_from_loc (locus) != 0;
1991 }
1992
1993 /* Return the discriminator for LOCUS.  */
1994
1995 int
1996 get_discriminator_from_loc (location_t locus)
1997 {
1998   return get_discriminator_from_loc (line_table, locus);
1999 }
2000
2001 #if CHECKING_P
2002
2003 namespace selftest {
2004
2005 /* Selftests of location handling.  */
2006
2007 /* Attempt to populate *OUT_RANGE with source location information on the
2008    given character within the string literal found at STRLOC.
2009    CHAR_IDX refers to an offset within the execution character set.
2010    If CONCATS is non-NULL, then any string literals that the token at
2011    STRLOC was concatenated with are also considered.
2012
2013    This is implemented by re-parsing the relevant source line(s).
2014
2015    Return NULL if successful, or an error message if any errors occurred.
2016    Error messages are intended for GCC developers (to help debugging) rather
2017    than for end-users.  */
2018
2019 static const char *
2020 get_source_range_for_char (cpp_reader *pfile,
2021                            file_cache &fc,
2022                            string_concat_db *concats,
2023                            location_t strloc,
2024                            enum cpp_ttype type,
2025                            int char_idx,
2026                            source_range *out_range)
2027 {
2028   gcc_checking_assert (char_idx >= 0);
2029   gcc_assert (out_range);
2030
2031   cpp_substring_ranges ranges;
2032   const char *err
2033     = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
2034   if (err)
2035     return err;
2036
2037   if (char_idx >= ranges.get_num_ranges ())
2038     return "char_idx out of range";
2039
2040   *out_range = ranges.get_range (char_idx);
2041   return NULL;
2042 }
2043
2044 /* As get_source_range_for_char, but write to *OUT the number
2045    of ranges that are available.  */
2046
2047 static const char *
2048 get_num_source_ranges_for_substring (cpp_reader *pfile,
2049                                      file_cache &fc,
2050                                      string_concat_db *concats,
2051                                      location_t strloc,
2052                                      enum cpp_ttype type,
2053                                      int *out)
2054 {
2055   gcc_assert (out);
2056
2057   cpp_substring_ranges ranges;
2058   const char *err
2059     = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
2060
2061   if (err)
2062     return err;
2063
2064   *out = ranges.get_num_ranges ();
2065   return NULL;
2066 }
2067
2068 /* Selftests of location handling.  */
2069
/* Verify that compare() on linenum_type handles comparisons over the full
   range of the type: the extreme values 0 and 0xffffffff must compare
   equal to themselves and be ordered correctly against each other.  */

static void
test_linenum_comparisons ()
{
  /* The two extremes exercised by this test.  */
  linenum_type min_line (0);
  linenum_type max_line (0xffffffff);

  /* A value compares equal to itself.  */
  ASSERT_EQ (0, compare (min_line, min_line));
  ASSERT_EQ (0, compare (max_line, max_line));

  /* The sign of the result reflects the ordering, in both directions.  */
  ASSERT_GT (compare (max_line, min_line), 0);
  ASSERT_LT (compare (min_line, max_line), 0);
}
2084
2085 /* Helper function for verifying location data: when location_t
2086    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
2087    as having column 0.  */
2088
2089 static bool
2090 should_have_column_data_p (location_t loc)
2091 {
2092   if (IS_ADHOC_LOC (loc))
2093     loc = get_location_from_adhoc_loc (line_table, loc);
2094   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
2095     return false;
2096   return true;
2097 }
2098
/* Selftest for should_have_column_data_p, probing the values on either
   side of the LINE_MAP_MAX_LOCATION_WITH_COLS threshold.  */

static void
test_should_have_column_data_p ()
{
  /* The first non-reserved location value is well below the threshold.  */
  ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
  /* The threshold itself still counts as having column data...  */
  ASSERT_TRUE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
  /* ...but the very next value does not.  */
  ASSERT_FALSE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
}
2110
2111 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2112    on LOC.  */
2113
2114 static void
2115 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2116               location_t loc)
2117 {
2118   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2119   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2120   /* If location_t values are sufficiently high, then column numbers
2121      will be unavailable and LOCATION_COLUMN (loc) will be 0.
2122      When close to the threshold, column numbers *may* be present: if
2123      the final linemap before the threshold contains a line that straddles
2124      the threshold, locations in that line have column information.  */
2125   if (should_have_column_data_p (loc))
2126     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
2127 }
2128
2129 /* Various selftests involve constructing a line table and one or more
2130    line maps within it.
2131
2132    For maximum test coverage we want to run these tests with a variety
2133    of situations:
2134    - line_table->default_range_bits: some frontends use a non-zero value
2135    and others use zero
2136    - the fallback modes within line-map.cc: there are various threshold
2137    values for location_t beyond line-map.cc changes
2138    behavior (disabling of the range-packing optimization, disabling
2139    of column-tracking).  We can exercise these by starting the line_table
2140    at interesting values at or near these thresholds.
2141
2142    The following struct describes a particular case within our test
2143    matrix.  */
2144
2145 class line_table_case
2146 {
2147 public:
2148   line_table_case (int default_range_bits, location_t base_location)
2149   : m_default_range_bits (default_range_bits),
2150     m_base_location (base_location)
2151   {}
2152
2153   int m_default_range_bits;
2154   location_t m_base_location;
2155 };
2156
2157 /* Constructor.  Store the old value of line_table, and create a new
2158    one, using sane defaults.  */
2159
2160 line_table_test::line_table_test ()
2161 {
2162   gcc_assert (saved_line_table == NULL);
2163   saved_line_table = line_table;
2164   line_table = ggc_alloc<line_maps> ();
2165   linemap_init (line_table, BUILTINS_LOCATION);
2166   gcc_assert (saved_line_table->m_reallocator);
2167   line_table->m_reallocator = saved_line_table->m_reallocator;
2168   gcc_assert (saved_line_table->m_round_alloc_size);
2169   line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2170   line_table->default_range_bits = 0;
2171 }
2172
2173 /* Constructor.  Store the old value of line_table, and create a new
2174    one, using the sitation described in CASE_.  */
2175
2176 line_table_test::line_table_test (const line_table_case &case_)
2177 {
2178   gcc_assert (saved_line_table == NULL);
2179   saved_line_table = line_table;
2180   line_table = ggc_alloc<line_maps> ();
2181   linemap_init (line_table, BUILTINS_LOCATION);
2182   gcc_assert (saved_line_table->m_reallocator);
2183   line_table->m_reallocator = saved_line_table->m_reallocator;
2184   gcc_assert (saved_line_table->m_round_alloc_size);
2185   line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2186   line_table->default_range_bits = case_.m_default_range_bits;
2187   if (case_.m_base_location)
2188     {
2189       line_table->highest_location = case_.m_base_location;
2190       line_table->highest_line = case_.m_base_location;
2191     }
2192 }
2193
2194 /* Destructor.  Restore the old value of line_table.  */
2195
2196 line_table_test::~line_table_test ()
2197 {
2198   gcc_assert (saved_line_table != NULL);
2199   line_table = saved_line_table;
2200   saved_line_table = NULL;
2201 }
2202
/* Verify basic operation of ordinary linemaps, under the line-table
   configuration described by CASE_: build locations in two files across
   lines of varying width, then check that file, line and column can be
   recovered from them, and that make_location round-trips a range.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Swap in a temporary line_table for the duration of this test.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  /* Line 1: two locations on the same line.  */
  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  /* Line 2: likewise.  */
  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  /* Transitioning back to a short line.  */
  linemap_line_start (line_table, 4, 0);
  location_t loc_back_to_short = linemap_position_for_column (line_table, 100);

  if (should_have_column_data_p (loc_back_to_short))
    {
      /* Verify that we switched to short lines in the linemap: the bits
         left for the column (column-and-range bits minus range bits)
         should number 7.  */
      line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
      ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
    }

  /* Example of a line that will eventually be seen to be longer
     than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
     below that.  */
  linemap_line_start (line_table, 5, 2000);

  location_t loc_start_of_very_long_line
    = linemap_position_for_column (line_table, 2000);
  location_t loc_too_wide
    = linemap_position_for_column (line_table, LINE_MAP_MAX_COLUMN_NUMBER + 1);
  location_t loc_too_wide_2
    = linemap_position_for_column (line_table, LINE_MAP_MAX_COLUMN_NUMBER + 2);

  /* ...and back to a sane line length.  */
  linemap_line_start (line_table, 6, 100);
  location_t loc_sane_again = linemap_position_for_column (line_table, 10);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  Note that
     assert_loceq skips the column check for locations that are too
     high to carry column data.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("foo.c", 4, 100, loc_back_to_short);

  /* In the very wide line, the initial location should be fully tracked.  */
  assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
  /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
     be disabled.  */
  assert_loceq ("foo.c", 5, 0, loc_too_wide);
  assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
  /*...and column-tracking should be re-enabled for subsequent lines.  */
  assert_loceq ("foo.c", 6, 10, loc_sane_again);

  assert_loceq ("bar.c", 1, 150, loc_f);

  /* A location created directly from a column is neither a builtin
     token's location nor anything other than a pure location.  */
  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it: caret at loc_c, range from loc_b to loc_d.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
2291
/* Verify various properties of UNKNOWN_LOCATION: it has no file, and
   both its line and its column are 0.  */

static void
test_unknown_location ()
{
  ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
}
2301
/* Verify various properties of BUILTINS_LOCATION: its file is the
   special built-in filename, its line and column are 0, and it is
   recognized as the location of a builtin token.  */

static void
test_builtins ()
{
  assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
  ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
}
2310
/* Regression test for make_location.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.
   CASE_ describes the line-table configuration to run under.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Pure locations for the columns of interest: '!' is at column 11,
     "aaa" spans columns 12-14, and the final 'b' of "bbb" is at
     column 21.  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* The checks below rely on column data; give up if the locations are
     too high for columns to be tracked.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly, make a location with a range as the end-point.  First
     build the range for "aaa == bbb" and sanity-check it...  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  /* ...then pass that range as the finish of a new location.  */
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
2371
/* Verify reading of input files (e.g. for caret-based diagnostics):
   lines can be fetched from a file in any order through a file_cache,
   and asking for a line past the end of the file yields an empty
   result.  */

static void
test_reading_source_line ()
{
  /* Create a tempfile and write some text to it.  Note that the final
     line has no trailing newline.  */
  temp_source_file tmp (SELFTEST_LOCATION, ".txt",
                        "01234567890123456789\n"
                        "This is the test text\n"
                        "This is the 3rd line");
  file_cache fc;

  /* Read back a specific line from the tempfile.  */
  char_span source_line = fc.get_source_line (tmp.get_filename (), 3);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (20, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the 3rd line",
                         source_line.get_buffer (), source_line.length ()));

  /* Reading an earlier line afterwards works too; the reported length
     excludes the newline.  */
  source_line = fc.get_source_line (tmp.get_filename (), 2);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (21, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the test text",
                         source_line.get_buffer (), source_line.length ()));

  /* There is no line 4: the span is falsy and has a NULL buffer.  */
  source_line = fc.get_source_line (tmp.get_filename (), 4);
  ASSERT_FALSE (source_line);
  ASSERT_TRUE (source_line.get_buffer () == NULL);
}
2403
/* Verify reading from buffers (e.g. for sarif-replay): content
   registered with a file_cache under a filename can be read back line
   by line, without any file of that name being created by the test.  */

static void
test_reading_source_buffer ()
{
  /* Three lines of text; the final line has no trailing newline.  */
  const char *text = ("01234567890123456789\n"
                      "This is the test text\n"
                      "This is the 3rd line");
  const char *filename = "foo.txt";
  file_cache fc;
  fc.add_buffered_content (filename, text, strlen (text));

  /* Read back a specific line from the buffered content.  */
  char_span source_line = fc.get_source_line (filename, 3);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (20, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the 3rd line",
                         source_line.get_buffer (), source_line.length ()));

  /* Reading an earlier line afterwards works too; the reported length
     excludes the newline.  */
  source_line = fc.get_source_line (filename, 2);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (21, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the test text",
                         source_line.get_buffer (), source_line.length ()));

  /* There is no line 4: the span is falsy and has a NULL buffer.  */
  source_line = fc.get_source_line (filename, 4);
  ASSERT_FALSE (source_line);
  ASSERT_TRUE (source_line.get_buffer () == NULL);
}
2435
2436 /* Tests of lexing.  */
2437
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.  The comparison is a string comparison
   (ASSERT_STREQ) on the text returned by cpp_token_as_text, viewed as
   a const char *.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
2446
2447 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2448    and ranges from EXP_START_COL to EXP_FINISH_COL.
2449    Use LOC as the effective location of the selftest.  */
2450
2451 static void
2452 assert_token_loc_eq (const location &loc,
2453                      const cpp_token *tok,
2454                      const char *exp_filename, int exp_linenum,
2455                      int exp_start_col, int exp_finish_col)
2456 {
2457   location_t tok_loc = tok->src_loc;
2458   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2459   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2460
2461   /* If location_t values are sufficiently high, then column numbers
2462      will be unavailable.  */
2463   if (!should_have_column_data_p (tok_loc))
2464     return;
2465
2466   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2467   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2468   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2469   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2470 }
2471
/* Wrapper around assert_token_loc_eq for checking TOKEN->src_loc, which
   passes SELFTEST_LOCATION as the effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOKEN, WANT_FILENAME, WANT_LINENUM,       \
                            WANT_START_COL, WANT_FINISH_COL)          \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOKEN), (WANT_FILENAME),   \
                       (WANT_LINENUM), (WANT_START_COL),              \
                       (WANT_FINISH_COL))
2479
2480 /* Test of lexing a file using libcpp, verifying tokens and their
2481    location information.  */
2482
2483 static void
2484 test_lexer (const line_table_case &case_)
2485 {
2486   /* Create a tempfile and write some text to it.  */
2487   const char *content =
2488     /*00000000011111111112222222222333333.3333444444444.455555555556
2489       12345678901234567890123456789012345.6789012345678.901234567890.  */
2490     ("test_name /* c-style comment */\n"
2491      "                                  \"test literal\"\n"
2492      " // test c++-style comment\n"
2493      "   42\n");
2494   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2495
2496   line_table_test ltt (case_);
2497
2498   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2499
2500   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2501   ASSERT_NE (fname, NULL);
2502
2503   /* Verify that we get the expected tokens back, with the correct
2504      location information.  */
2505
2506   location_t loc;
2507   const cpp_token *tok;
2508   tok = cpp_get_token_with_location (parser, &loc);
2509   ASSERT_NE (tok, NULL);
2510   ASSERT_EQ (tok->type, CPP_NAME);
2511   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2512   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2513
2514   tok = cpp_get_token_with_location (parser, &loc);
2515   ASSERT_NE (tok, NULL);
2516   ASSERT_EQ (tok->type, CPP_STRING);
2517   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2518   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2519
2520   tok = cpp_get_token_with_location (parser, &loc);
2521   ASSERT_NE (tok, NULL);
2522   ASSERT_EQ (tok->type, CPP_NUMBER);
2523   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2524   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2525
2526   tok = cpp_get_token_with_location (parser, &loc);
2527   ASSERT_NE (tok, NULL);
2528   ASSERT_EQ (tok->type, CPP_EOF);
2529
2530   cpp_finish (parser, NULL);
2531   cpp_destroy (parser);
2532 }
2533
2534 /* Forward decls.  */
2535
2536 class lexer_test;
2537 class lexer_test_options;
2538
2539 /* A class for specifying options of a lexer_test.
2540    The "apply" vfunc is called during the lexer_test constructor.  */
2541
2542 class lexer_test_options
2543 {
2544  public:
2545   virtual void apply (lexer_test &) = 0;
2546 };
2547
2548 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2549    in its dtor.
2550
2551    This is needed by struct lexer_test to ensure that the cleanup of the
2552    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2553
2554 class cpp_reader_ptr
2555 {
2556  public:
2557   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2558
2559   ~cpp_reader_ptr ()
2560   {
2561     cpp_finish (m_ptr, NULL);
2562     cpp_destroy (m_ptr);
2563   }
2564
2565   operator cpp_reader * () const { return m_ptr; }
2566
2567  private:
2568   cpp_reader *m_ptr;
2569 };
2570
/* A class for writing lexer tests.

   The constructor writes the given content to a tempfile, creates a
   cpp_reader and starts parsing the tempfile with it; tokens are then
   fetched one at a time via get_token.  */

class lexer_test
{
public:
  /* CASE_ selects the line_table configuration, CONTENT is the source
     text to lex, and OPTIONS (which may be NULL) is applied to this
     object during construction.  */
  lexer_test (const line_table_case &case_, const char *content,
              lexer_test_options *options);
  /* If m_implicitly_expect_EOF is set, verifies that the next token
     is CPP_EOF.  */
  ~lexer_test ();

  /* Fetch the next token from m_parser, asserting that it is
     non-NULL.  */
  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.cc's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.  */
  line_table_test m_ltt;
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  file_cache m_file_cache;
  string_concat_db m_concats;
  /* Initialized to true by the constructor; when true, the destructor
     checks that lexing has reached EOF.  */
  bool m_implicitly_expect_EOF;
};
2596
2597 /* Use an EBCDIC encoding for the execution charset, specifically
2598    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2599
2600    This exercises iconv integration within libcpp.
2601    Not every build of iconv supports the given charset,
2602    so we need to flag this error and handle it gracefully.  */
2603
2604 class ebcdic_execution_charset : public lexer_test_options
2605 {
2606  public:
2607   ebcdic_execution_charset () : m_num_iconv_errors (0)
2608     {
2609       gcc_assert (s_singleton == NULL);
2610       s_singleton = this;
2611     }
2612   ~ebcdic_execution_charset ()
2613     {
2614       gcc_assert (s_singleton == this);
2615       s_singleton = NULL;
2616     }
2617
2618   void apply (lexer_test &test) final override
2619   {
2620     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2621     cpp_opts->narrow_charset = "IBM1047";
2622
2623     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2624     callbacks->diagnostic = on_diagnostic;
2625   }
2626
2627   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2628                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2629                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2630                              rich_location *richloc ATTRIBUTE_UNUSED,
2631                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2632     ATTRIBUTE_FPTR_PRINTF(5,0)
2633   {
2634     gcc_assert (s_singleton);
2635     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2636     const char *msg = "conversion from %s to %s not supported by iconv";
2637 #ifdef ENABLE_NLS
2638     msg = dgettext ("cpplib", msg);
2639 #endif
2640     /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2641        when the local iconv build doesn't support the conversion.  */
2642     if (strcmp (msgid, msg) == 0)
2643       {
2644         s_singleton->m_num_iconv_errors++;
2645         return true;
2646       }
2647
2648     /* Otherwise, we have an unexpected error.  */
2649     abort ();
2650   }
2651
2652   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2653
2654  private:
2655   static ebcdic_execution_charset *s_singleton;
2656   int m_num_iconv_errors;
2657 };
2658
2659 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2660
2661 /* A lexer_test_options subclass that records a list of diagnostic
2662    messages emitted by the lexer.  */
2663
2664 class lexer_diagnostic_sink : public lexer_test_options
2665 {
2666  public:
2667   lexer_diagnostic_sink ()
2668   {
2669     gcc_assert (s_singleton == NULL);
2670     s_singleton = this;
2671   }
2672   ~lexer_diagnostic_sink ()
2673   {
2674     gcc_assert (s_singleton == this);
2675     s_singleton = NULL;
2676
2677     int i;
2678     char *str;
2679     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2680       free (str);
2681   }
2682
2683   void apply (lexer_test &test) final override
2684   {
2685     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2686     callbacks->diagnostic = on_diagnostic;
2687   }
2688
2689   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2690                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2691                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2692                              rich_location *richloc ATTRIBUTE_UNUSED,
2693                              const char *msgid, va_list *ap)
2694     ATTRIBUTE_FPTR_PRINTF(5,0)
2695   {
2696     char *msg = xvasprintf (msgid, *ap);
2697     s_singleton->m_diagnostics.safe_push (msg);
2698     return true;
2699   }
2700
2701   auto_vec<char *> m_diagnostics;
2702
2703  private:
2704   static lexer_diagnostic_sink *s_singleton;
2705 };
2706
2707 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2708
2709 /* Constructor.  Override line_table with a new instance based on CASE_,
2710    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2711    start parsing the tempfile.  */
2712
2713 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2714                         lexer_test_options *options)
2715 : m_ltt (case_),
2716   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2717   /* Create a tempfile and write the text to it.  */
2718   m_tempfile (SELFTEST_LOCATION, ".c", content),
2719   m_concats (),
2720   m_implicitly_expect_EOF (true)
2721 {
2722   if (options)
2723     options->apply (*this);
2724
2725   cpp_init_iconv (m_parser);
2726
2727   /* Parse the file.  */
2728   const char *fname = cpp_read_main_file (m_parser,
2729                                           m_tempfile.get_filename ());
2730   ASSERT_NE (fname, NULL);
2731 }
2732
2733 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2734
2735 lexer_test::~lexer_test ()
2736 {
2737   location_t loc;
2738   const cpp_token *tok;
2739
2740   if (m_implicitly_expect_EOF)
2741     {
2742       tok = cpp_get_token_with_location (m_parser, &loc);
2743       ASSERT_NE (tok, NULL);
2744       ASSERT_EQ (tok->type, CPP_EOF);
2745     }
2746 }
2747
2748 /* Get the next token from m_parser.  */
2749
2750 const cpp_token *
2751 lexer_test::get_token ()
2752 {
2753   location_t loc;
2754   const cpp_token *tok;
2755
2756   tok = cpp_get_token_with_location (m_parser, &loc);
2757   ASSERT_NE (tok, NULL);
2758   return tok;
2759 }
2760
2761 /* Verify that locations within string literals are correctly handled.  */
2762
2763 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2764    using the string concatenation database for TEST.
2765
2766    Assert that the character at index IDX is on EXPECTED_LINE,
2767    and that it begins at column EXPECTED_START_COL and ends at
2768    EXPECTED_FINISH_COL (unless the locations are beyond
2769    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2770    columns).  */
2771
2772 static void
2773 assert_char_at_range (const location &loc,
2774                       lexer_test& test,
2775                       location_t strloc, enum cpp_ttype type, int idx,
2776                       int expected_line, int expected_start_col,
2777                       int expected_finish_col)
2778 {
2779   cpp_reader *pfile = test.m_parser;
2780   string_concat_db *concats = &test.m_concats;
2781
2782   source_range actual_range = source_range();
2783   const char *err
2784     = get_source_range_for_char (pfile, test.m_file_cache,
2785                                  concats, strloc, type, idx,
2786                                  &actual_range);
2787   if (should_have_column_data_p (strloc))
2788     ASSERT_EQ_AT (loc, NULL, err);
2789   else
2790     {
2791       ASSERT_STREQ_AT (loc,
2792                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2793                        err);
2794       return;
2795     }
2796
2797   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2798   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2799   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2800   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2801
2802   if (should_have_column_data_p (actual_range.m_start))
2803     {
2804       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2805       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2806     }
2807   if (should_have_column_data_p (actual_range.m_finish))
2808     {
2809       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2810       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2811     }
2812 }
2813
/* Convenience wrapper around assert_char_at_range which passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_CHAR_AT_RANGE(FIXTURE, STR_LOC, TTYPE, CHAR_IDX, WANT_LINE, \
                             WANT_START_COL, WANT_FINISH_COL)              \
  assert_char_at_range (SELFTEST_LOCATION, (FIXTURE), (STR_LOC), (TTYPE),  \
                        (CHAR_IDX), (WANT_LINE), (WANT_START_COL),         \
                        (WANT_FINISH_COL))
2822
2823 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2824    using the string concatenation database for TEST.
2825
2826    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2827
2828 static void
2829 assert_num_substring_ranges (const location &loc,
2830                              lexer_test& test,
2831                              location_t strloc,
2832                              enum cpp_ttype type,
2833                              int expected_num_ranges)
2834 {
2835   cpp_reader *pfile = test.m_parser;
2836   string_concat_db *concats = &test.m_concats;
2837
2838   int actual_num_ranges = -1;
2839   const char *err
2840     = get_num_source_ranges_for_substring (pfile, test.m_file_cache,
2841                                            concats, strloc, type,
2842                                            &actual_num_ranges);
2843   if (should_have_column_data_p (strloc))
2844     ASSERT_EQ_AT (loc, NULL, err);
2845   else
2846     {
2847       ASSERT_STREQ_AT (loc,
2848                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2849                        err);
2850       return;
2851     }
2852   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2853 }
2854
/* Convenience wrapper around assert_num_substring_ranges which passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_NUM_SUBSTRING_RANGES(FIXTURE, STR_LOC, TTYPE,            \
                                    WANT_NUM_RANGES)                    \
  assert_num_substring_ranges (SELFTEST_LOCATION, (FIXTURE), (STR_LOC), \
                               (TTYPE), (WANT_NUM_RANGES))
2862
2863
2864 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2865    returns an error (using the string concatenation database for TEST).  */
2866
2867 static void
2868 assert_has_no_substring_ranges (const location &loc,
2869                                 lexer_test& test,
2870                                 location_t strloc,
2871                                 enum cpp_ttype type,
2872                                 const char *expected_err)
2873 {
2874   cpp_reader *pfile = test.m_parser;
2875   string_concat_db *concats = &test.m_concats;
2876   cpp_substring_ranges ranges;
2877   const char *actual_err
2878     = get_substring_ranges_for_loc (pfile, test.m_file_cache, concats, strloc,
2879                                     type, ranges);
2880   if (should_have_column_data_p (strloc))
2881     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2882   else
2883     ASSERT_STREQ_AT (loc,
2884                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2885                      actual_err);
2886 }
2887
/* Convenience wrapper around assert_has_no_substring_ranges which passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(FIXTURE, STR_LOC, TTYPE, WANT_ERR) \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (FIXTURE),         \
                                    (STR_LOC), (TTYPE), (WANT_ERR))
2891
2892 /* Lex a simple string literal.  Verify the substring location data, before
2893    and after running cpp_interpret_string on it.  */
2894
2895 static void
2896 test_lexer_string_locations_simple (const line_table_case &case_)
2897 {
2898   /* Digits 0-9 (with 0 at column 10), the simple way.
2899      ....................000000000.11111111112.2222222223333333333
2900      ....................123456789.01234567890.1234567890123456789
2901      We add a trailing comment to ensure that we correctly locate
2902      the end of the string literal token.  */
2903   const char *content = "        \"0123456789\" /* not a string */\n";
2904   lexer_test test (case_, content, NULL);
2905
2906   /* Verify that we get the expected token back, with the correct
2907      location information.  */
2908   const cpp_token *tok = test.get_token ();
2909   ASSERT_EQ (tok->type, CPP_STRING);
2910   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2911   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2912
2913   /* At this point in lexing, the quote characters are treated as part of
2914      the string (they are stripped off by cpp_interpret_string).  */
2915
2916   ASSERT_EQ (tok->val.str.len, 12);
2917
2918   /* Verify that cpp_interpret_string works.  */
2919   cpp_string dst_string;
2920   const enum cpp_ttype type = CPP_STRING;
2921   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2922                                       &dst_string, type);
2923   ASSERT_TRUE (result);
2924   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2925   free (const_cast <unsigned char *> (dst_string.text));
2926
2927   /* Verify ranges of individual characters.  This no longer includes the
2928      opening quote, but does include the closing quote.  */
2929   for (int i = 0; i <= 10; i++)
2930     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2931                           10 + i, 10 + i);
2932
2933   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2934 }
2935
2936 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2937    encoding.  */
2938
2939 static void
2940 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2941 {
2942   /* EBCDIC support requires iconv.  */
2943   if (!HAVE_ICONV)
2944     return;
2945
2946   /* Digits 0-9 (with 0 at column 10), the simple way.
2947      ....................000000000.11111111112.2222222223333333333
2948      ....................123456789.01234567890.1234567890123456789
2949      We add a trailing comment to ensure that we correctly locate
2950      the end of the string literal token.  */
2951   const char *content = "        \"0123456789\" /* not a string */\n";
2952   ebcdic_execution_charset use_ebcdic;
2953   lexer_test test (case_, content, &use_ebcdic);
2954
2955   /* Verify that we get the expected token back, with the correct
2956      location information.  */
2957   const cpp_token *tok = test.get_token ();
2958   ASSERT_EQ (tok->type, CPP_STRING);
2959   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2960   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2961
2962   /* At this point in lexing, the quote characters are treated as part of
2963      the string (they are stripped off by cpp_interpret_string).  */
2964
2965   ASSERT_EQ (tok->val.str.len, 12);
2966
2967   /* The remainder of the test requires an iconv implementation that
2968      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2969   if (use_ebcdic.iconv_errors_occurred_p ())
2970     return;
2971
2972   /* Verify that cpp_interpret_string works.  */
2973   cpp_string dst_string;
2974   const enum cpp_ttype type = CPP_STRING;
2975   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2976                                       &dst_string, type);
2977   ASSERT_TRUE (result);
2978   /* We should now have EBCDIC-encoded text, specifically
2979      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2980      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2981   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2982                 (const char *)dst_string.text);
2983   free (const_cast <unsigned char *> (dst_string.text));
2984
2985   /* Verify that we don't attempt to record substring location information
2986      for such cases.  */
2987   ASSERT_HAS_NO_SUBSTRING_RANGES
2988     (test, tok->src_loc, type,
2989      "execution character set != source character set");
2990 }
2991
2992 /* Lex a string literal containing a hex-escaped character.
2993    Verify the substring location data, before and after running
2994    cpp_interpret_string on it.  */
2995
2996 static void
2997 test_lexer_string_locations_hex (const line_table_case &case_)
2998 {
2999   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
3000      and with a space in place of digit 6, to terminate the escaped
3001      hex code.
3002      ....................000000000.111111.11112222.
3003      ....................123456789.012345.67890123.  */
3004   const char *content = "        \"01234\\x35 789\"\n";
3005   lexer_test test (case_, content, NULL);
3006
3007   /* Verify that we get the expected token back, with the correct
3008      location information.  */
3009   const cpp_token *tok = test.get_token ();
3010   ASSERT_EQ (tok->type, CPP_STRING);
3011   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
3012   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
3013
3014   /* At this point in lexing, the quote characters are treated as part of
3015      the string (they are stripped off by cpp_interpret_string).  */
3016   ASSERT_EQ (tok->val.str.len, 15);
3017
3018   /* Verify that cpp_interpret_string works.  */
3019   cpp_string dst_string;
3020   const enum cpp_ttype type = CPP_STRING;
3021   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3022                                       &dst_string, type);
3023   ASSERT_TRUE (result);
3024   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
3025   free (const_cast <unsigned char *> (dst_string.text));
3026
3027   /* Verify ranges of individual characters.  This no longer includes the
3028      opening quote, but does include the closing quote.  */
3029   for (int i = 0; i <= 4; i++)
3030     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3031   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
3032   for (int i = 6; i <= 10; i++)
3033     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
3034
3035   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
3036 }
3037
3038 /* Lex a string literal containing an octal-escaped character.
3039    Verify the substring location data after running cpp_interpret_string
3040    on it.  */
3041
3042 static void
3043 test_lexer_string_locations_oct (const line_table_case &case_)
3044 {
3045   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
3046      and with a space in place of digit 6, to terminate the escaped
3047      octal code.
3048      ....................000000000.111111.11112222.2222223333333333444
3049      ....................123456789.012345.67890123.4567890123456789012  */
3050   const char *content = "        \"01234\\065 789\" /* not a string */\n";
3051   lexer_test test (case_, content, NULL);
3052
3053   /* Verify that we get the expected token back, with the correct
3054      location information.  */
3055   const cpp_token *tok = test.get_token ();
3056   ASSERT_EQ (tok->type, CPP_STRING);
3057   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
3058
3059   /* Verify that cpp_interpret_string works.  */
3060   cpp_string dst_string;
3061   const enum cpp_ttype type = CPP_STRING;
3062   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3063                                       &dst_string, type);
3064   ASSERT_TRUE (result);
3065   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
3066   free (const_cast <unsigned char *> (dst_string.text));
3067
3068   /* Verify ranges of individual characters.  This no longer includes the
3069      opening quote, but does include the closing quote.  */
3070   for (int i = 0; i < 5; i++)
3071     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3072   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
3073   for (int i = 6; i <= 10; i++)
3074     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
3075
3076   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
3077 }
3078
3079 /* Test of string literal containing letter escapes.  */
3080
3081 static void
3082 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
3083 {
3084   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
3085      .....................000000000.1.11111.1.1.11222.22222223333333
3086      .....................123456789.0.12345.6.7.89012.34567890123456.  */
3087   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
3088   lexer_test test (case_, content, NULL);
3089
3090   /* Verify that we get the expected tokens back.  */
3091   const cpp_token *tok = test.get_token ();
3092   ASSERT_EQ (tok->type, CPP_STRING);
3093   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
3094
3095   /* Verify ranges of individual characters. */
3096   /* "\t".  */
3097   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3098                         0, 1, 10, 11);
3099   /* "foo". */
3100   for (int i = 1; i <= 3; i++)
3101     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3102                           i, 1, 11 + i, 11 + i);
3103   /* "\\" and "\n".  */
3104   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3105                         4, 1, 15, 16);
3106   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3107                         5, 1, 17, 18);
3108
3109   /* "bar" and closing quote for nul-terminator.  */
3110   for (int i = 6; i <= 9; i++)
3111     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3112                           i, 1, 13 + i, 13 + i);
3113
3114   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
3115 }
3116
3117 /* Another test of a string literal containing a letter escape.
3118    Based on string seen in
3119      printf ("%-%\n");
3120    in gcc.dg/format/c90-printf-1.c.  */
3121
3122 static void
3123 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
3124 {
3125   /* .....................000000000.1111.11.1111.22222222223.
3126      .....................123456789.0123.45.6789.01234567890.  */
3127   const char *content = ("        \"%-%\\n\" /* non-str */\n");
3128   lexer_test test (case_, content, NULL);
3129
3130   /* Verify that we get the expected tokens back.  */
3131   const cpp_token *tok = test.get_token ();
3132   ASSERT_EQ (tok->type, CPP_STRING);
3133   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
3134
3135   /* Verify ranges of individual characters. */
3136   /* "%-%".  */
3137   for (int i = 0; i < 3; i++)
3138     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3139                           i, 1, 10 + i, 10 + i);
3140   /* "\n".  */
3141   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3142                         3, 1, 13, 14);
3143
3144   /* Closing quote for nul-terminator.  */
3145   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3146                         4, 1, 15, 15);
3147
3148   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3149 }
3150
3151 /* Lex a string literal containing UCN 4 characters.
3152    Verify the substring location data after running cpp_interpret_string
3153    on it.  */
3154
3155 static void
3156 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3157 {
3158   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3159      as UCN 4.
3160      ....................000000000.111111.111122.222222223.33333333344444
3161      ....................123456789.012345.678901.234567890.12345678901234  */
3162   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
3163   lexer_test test (case_, content, NULL);
3164
3165   /* Verify that we get the expected token back, with the correct
3166      location information.  */
3167   const cpp_token *tok = test.get_token ();
3168   ASSERT_EQ (tok->type, CPP_STRING);
3169   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3170
3171   /* Verify that cpp_interpret_string works.
3172      The string should be encoded in the execution character
3173      set.  Assuming that is UTF-8, we should have the following:
3174      -----------  ----  -----  -------  ----------------
3175      Byte offset  Byte  Octal  Unicode  Source Column(s)
3176      -----------  ----  -----  -------  ----------------
3177      0            0x30         '0'      10
3178      1            0x31         '1'      11
3179      2            0x32         '2'      12
3180      3            0x33         '3'      13
3181      4            0x34         '4'      14
3182      5            0xE2  \342   U+2174   15-20
3183      6            0x85  \205    (cont)  15-20
3184      7            0xB4  \264    (cont)  15-20
3185      8            0xE2  \342   U+2175   21-26
3186      9            0x85  \205    (cont)  21-26
3187      10           0xB5  \265    (cont)  21-26
3188      11           0x37         '7'      27
3189      12           0x38         '8'      28
3190      13           0x39         '9'      29
3191      14           0x00                  30 (closing quote)
3192      -----------  ----  -----  -------  ---------------.  */
3193
3194   cpp_string dst_string;
3195   const enum cpp_ttype type = CPP_STRING;
3196   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3197                                       &dst_string, type);
3198   ASSERT_TRUE (result);
3199   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3200                 (const char *)dst_string.text);
3201   free (const_cast <unsigned char *> (dst_string.text));
3202
3203   /* Verify ranges of individual characters.  This no longer includes the
3204      opening quote, but does include the closing quote.
3205      '01234'.  */
3206   for (int i = 0; i <= 4; i++)
3207     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3208   /* U+2174.  */
3209   for (int i = 5; i <= 7; i++)
3210     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3211   /* U+2175.  */
3212   for (int i = 8; i <= 10; i++)
3213     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3214   /* '789' and nul terminator  */
3215   for (int i = 11; i <= 14; i++)
3216     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3217
3218   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3219 }
3220
3221 /* Lex a string literal containing UCN 8 characters.
3222    Verify the substring location data after running cpp_interpret_string
3223    on it.  */
3224
3225 static void
3226 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3227 {
3228   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3229      ....................000000000.111111.1111222222.2222333333333.344444
3230      ....................123456789.012345.6789012345.6789012345678.901234  */
3231   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
3232   lexer_test test (case_, content, NULL);
3233
3234   /* Verify that we get the expected token back, with the correct
3235      location information.  */
3236   const cpp_token *tok = test.get_token ();
3237   ASSERT_EQ (tok->type, CPP_STRING);
3238   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3239                            "\"01234\\U00002174\\U00002175789\"");
3240
3241   /* Verify that cpp_interpret_string works.
3242      The UTF-8 encoding of the string is identical to that from
3243      the ucn4 testcase above; the only difference is the column
3244      locations.  */
3245   cpp_string dst_string;
3246   const enum cpp_ttype type = CPP_STRING;
3247   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3248                                       &dst_string, type);
3249   ASSERT_TRUE (result);
3250   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3251                 (const char *)dst_string.text);
3252   free (const_cast <unsigned char *> (dst_string.text));
3253
3254   /* Verify ranges of individual characters.  This no longer includes the
3255      opening quote, but does include the closing quote.
3256      '01234'.  */
3257   for (int i = 0; i <= 4; i++)
3258     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3259   /* U+2174.  */
3260   for (int i = 5; i <= 7; i++)
3261     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3262   /* U+2175.  */
3263   for (int i = 8; i <= 10; i++)
3264     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3265   /* '789' at columns 35-37  */
3266   for (int i = 11; i <= 13; i++)
3267     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3268   /* Closing quote/nul-terminator at column 38.  */
3269   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3270
3271   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3272 }
3273
/* Read the four bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 32-bit quantity and return it in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);

  /* Accumulate from the most significant byte downwards.  */
  uint32_t value = 0;
  for (int idx = 0; idx < 4; ++idx)
    value = (value << 8) | (uint32_t) bytes[idx];
  return value;
}
3285
3286 /* Lex a wide string literal and verify that attempts to read substring
3287    location data from it fail gracefully.  */
3288
3289 static void
3290 test_lexer_string_locations_wide_string (const line_table_case &case_)
3291 {
3292   /* Digits 0-9.
3293      ....................000000000.11111111112.22222222233333
3294      ....................123456789.01234567890.12345678901234  */
3295   const char *content = "       L\"0123456789\" /* non-str */\n";
3296   lexer_test test (case_, content, NULL);
3297
3298   /* Verify that we get the expected token back, with the correct
3299      location information.  */
3300   const cpp_token *tok = test.get_token ();
3301   ASSERT_EQ (tok->type, CPP_WSTRING);
3302   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3303
3304   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
3305   cpp_string dst_string;
3306   const enum cpp_ttype type = CPP_WSTRING;
3307   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3308                                       &dst_string, type);
3309   ASSERT_TRUE (result);
3310   /* The cpp_reader defaults to big-endian with
3311      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3312      now be encoded as UTF-32BE.  */
3313   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3314   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3315   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3316   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3317   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3318   free (const_cast <unsigned char *> (dst_string.text));
3319
3320   /* We don't yet support generating substring location information
3321      for L"" strings.  */
3322   ASSERT_HAS_NO_SUBSTRING_RANGES
3323     (test, tok->src_loc, type,
3324      "execution character set != source character set");
3325 }
3326
/* Read the two bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 16-bit quantity and return it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return static_cast <uint16_t> ((high << 8) | low);
}
3335
3336 /* Lex a u"" string literal and verify that attempts to read substring
3337    location data from it fail gracefully.  */
3338
3339 static void
3340 test_lexer_string_locations_string16 (const line_table_case &case_)
3341 {
3342   /* Digits 0-9.
3343      ....................000000000.11111111112.22222222233333
3344      ....................123456789.01234567890.12345678901234  */
3345   const char *content = "       u\"0123456789\" /* non-str */\n";
3346   lexer_test test (case_, content, NULL);
3347
3348   /* Verify that we get the expected token back, with the correct
3349      location information.  */
3350   const cpp_token *tok = test.get_token ();
3351   ASSERT_EQ (tok->type, CPP_STRING16);
3352   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3353
3354   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3355   cpp_string dst_string;
3356   const enum cpp_ttype type = CPP_STRING16;
3357   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3358                                       &dst_string, type);
3359   ASSERT_TRUE (result);
3360
3361   /* The cpp_reader defaults to big-endian, so dst_string should
3362      now be encoded as UTF-16BE.  */
3363   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3364   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3365   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3366   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3367   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3368   free (const_cast <unsigned char *> (dst_string.text));
3369
3370   /* We don't yet support generating substring location information
3371      for L"" strings.  */
3372   ASSERT_HAS_NO_SUBSTRING_RANGES
3373     (test, tok->src_loc, type,
3374      "execution character set != source character set");
3375 }
3376
3377 /* Lex a U"" string literal and verify that attempts to read substring
3378    location data from it fail gracefully.  */
3379
3380 static void
3381 test_lexer_string_locations_string32 (const line_table_case &case_)
3382 {
3383   /* Digits 0-9.
3384      ....................000000000.11111111112.22222222233333
3385      ....................123456789.01234567890.12345678901234  */
3386   const char *content = "       U\"0123456789\" /* non-str */\n";
3387   lexer_test test (case_, content, NULL);
3388
3389   /* Verify that we get the expected token back, with the correct
3390      location information.  */
3391   const cpp_token *tok = test.get_token ();
3392   ASSERT_EQ (tok->type, CPP_STRING32);
3393   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3394
3395   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3396   cpp_string dst_string;
3397   const enum cpp_ttype type = CPP_STRING32;
3398   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3399                                       &dst_string, type);
3400   ASSERT_TRUE (result);
3401
3402   /* The cpp_reader defaults to big-endian, so dst_string should
3403      now be encoded as UTF-32BE.  */
3404   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3405   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3406   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3407   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3408   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3409   free (const_cast <unsigned char *> (dst_string.text));
3410
3411   /* We don't yet support generating substring location information
3412      for L"" strings.  */
3413   ASSERT_HAS_NO_SUBSTRING_RANGES
3414     (test, tok->src_loc, type,
3415      "execution character set != source character set");
3416 }
3417
3418 /* Lex a u8-string literal.
3419    Verify the substring location data after running cpp_interpret_string
3420    on it.  */
3421
3422 static void
3423 test_lexer_string_locations_u8 (const line_table_case &case_)
3424 {
3425   /* Digits 0-9.
3426      ....................000000000.11111111112.22222222233333
3427      ....................123456789.01234567890.12345678901234  */
3428   const char *content = "      u8\"0123456789\" /* non-str */\n";
3429   lexer_test test (case_, content, NULL);
3430
3431   /* Verify that we get the expected token back, with the correct
3432      location information.  */
3433   const cpp_token *tok = test.get_token ();
3434   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3435   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3436
3437   /* Verify that cpp_interpret_string works.  */
3438   cpp_string dst_string;
3439   const enum cpp_ttype type = CPP_STRING;
3440   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3441                                       &dst_string, type);
3442   ASSERT_TRUE (result);
3443   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3444   free (const_cast <unsigned char *> (dst_string.text));
3445
3446   /* Verify ranges of individual characters.  This no longer includes the
3447      opening quote, but does include the closing quote.  */
3448   for (int i = 0; i <= 10; i++)
3449     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3450 }
3451
3452 /* Lex a string literal containing UTF-8 source characters.
3453    Verify the substring location data after running cpp_interpret_string
3454    on it.  */
3455
3456 static void
3457 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3458 {
3459  /* This string literal is written out to the source file as UTF-8,
3460     and is of the form "before mojibake after", where "mojibake"
3461     is written as the following four unicode code points:
3462        U+6587 CJK UNIFIED IDEOGRAPH-6587
3463        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3464        U+5316 CJK UNIFIED IDEOGRAPH-5316
3465        U+3051 HIRAGANA LETTER KE.
3466      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3467      "before" and "after" are 1 byte per unicode character.
3468
3469      The numbering shown are "columns", which are *byte* numbers within
3470      the line, rather than unicode character numbers.
3471
3472      .................... 000000000.1111111.
3473      .................... 123456789.0123456.  */
3474   const char *content = ("        \"before "
3475                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3476                               UTF-8: 0xE6 0x96 0x87
3477                               C octal escaped UTF-8: \346\226\207
3478                             "column" numbers: 17-19.  */
3479                          "\346\226\207"
3480
3481                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3482                               UTF-8: 0xE5 0xAD 0x97
3483                               C octal escaped UTF-8: \345\255\227
3484                             "column" numbers: 20-22.  */
3485                          "\345\255\227"
3486
3487                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3488                               UTF-8: 0xE5 0x8C 0x96
3489                               C octal escaped UTF-8: \345\214\226
3490                             "column" numbers: 23-25.  */
3491                          "\345\214\226"
3492
3493                          /* U+3051 HIRAGANA LETTER KE
3494                               UTF-8: 0xE3 0x81 0x91
3495                               C octal escaped UTF-8: \343\201\221
3496                             "column" numbers: 26-28.  */
3497                          "\343\201\221"
3498
3499                          /* column numbers 29 onwards
3500                           2333333.33334444444444
3501                           9012345.67890123456789. */
3502                          " after\" /* non-str */\n");
3503   lexer_test test (case_, content, NULL);
3504
3505   /* Verify that we get the expected token back, with the correct
3506      location information.  */
3507   const cpp_token *tok = test.get_token ();
3508   ASSERT_EQ (tok->type, CPP_STRING);
3509   ASSERT_TOKEN_AS_TEXT_EQ
3510     (test.m_parser, tok,
3511      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3512
3513   /* Verify that cpp_interpret_string works.  */
3514   cpp_string dst_string;
3515   const enum cpp_ttype type = CPP_STRING;
3516   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3517                                       &dst_string, type);
3518   ASSERT_TRUE (result);
3519   ASSERT_STREQ
3520     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3521      (const char *)dst_string.text);
3522   free (const_cast <unsigned char *> (dst_string.text));
3523
3524   /* Verify ranges of individual characters.  This no longer includes the
3525      opening quote, but does include the closing quote.
3526      Assuming that both source and execution encodings are UTF-8, we have
3527      a run of 25 octets in each, plus the NUL terminator.  */
3528   for (int i = 0; i < 25; i++)
3529     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3530   /* NUL-terminator should use the closing quote at column 35.  */
3531   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3532
3533   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3534 }
3535
3536 /* Test of string literal concatenation.  */
3537
3538 static void
3539 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3540 {
3541   /* Digits 0-9.
3542      .....................000000000.111111.11112222222222
3543      .....................123456789.012345.67890123456789.  */
3544   const char *content = ("        \"01234\" /* non-str */\n"
3545                          "        \"56789\" /* non-str */\n");
3546   lexer_test test (case_, content, NULL);
3547
3548   location_t input_locs[2];
3549
3550   /* Verify that we get the expected tokens back.  */
3551   auto_vec <cpp_string> input_strings;
3552   const cpp_token *tok_a = test.get_token ();
3553   ASSERT_EQ (tok_a->type, CPP_STRING);
3554   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3555   input_strings.safe_push (tok_a->val.str);
3556   input_locs[0] = tok_a->src_loc;
3557
3558   const cpp_token *tok_b = test.get_token ();
3559   ASSERT_EQ (tok_b->type, CPP_STRING);
3560   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3561   input_strings.safe_push (tok_b->val.str);
3562   input_locs[1] = tok_b->src_loc;
3563
3564   /* Verify that cpp_interpret_string works.  */
3565   cpp_string dst_string;
3566   const enum cpp_ttype type = CPP_STRING;
3567   bool result = cpp_interpret_string (test.m_parser,
3568                                       input_strings.address (), 2,
3569                                       &dst_string, type);
3570   ASSERT_TRUE (result);
3571   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3572   free (const_cast <unsigned char *> (dst_string.text));
3573
3574   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3575   test.m_concats.record_string_concatenation (2, input_locs);
3576
3577   location_t initial_loc = input_locs[0];
3578
3579   /* "01234" on line 1.  */
3580   for (int i = 0; i <= 4; i++)
3581     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3582   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3583   for (int i = 5; i <= 10; i++)
3584     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3585
3586   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3587 }
3588
3589 /* Another test of string literal concatenation.  */
3590
3591 static void
3592 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3593 {
3594   /* Digits 0-9.
3595      .....................000000000.111.11111112222222
3596      .....................123456789.012.34567890123456.  */
3597   const char *content = ("        \"01\" /* non-str */\n"
3598                          "        \"23\" /* non-str */\n"
3599                          "        \"45\" /* non-str */\n"
3600                          "        \"67\" /* non-str */\n"
3601                          "        \"89\" /* non-str */\n");
3602   lexer_test test (case_, content, NULL);
3603
3604   auto_vec <cpp_string> input_strings;
3605   location_t input_locs[5];
3606
3607   /* Verify that we get the expected tokens back.  */
3608   for (int i = 0; i < 5; i++)
3609     {
3610       const cpp_token *tok = test.get_token ();
3611       ASSERT_EQ (tok->type, CPP_STRING);
3612       input_strings.safe_push (tok->val.str);
3613       input_locs[i] = tok->src_loc;
3614     }
3615
3616   /* Verify that cpp_interpret_string works.  */
3617   cpp_string dst_string;
3618   const enum cpp_ttype type = CPP_STRING;
3619   bool result = cpp_interpret_string (test.m_parser,
3620                                       input_strings.address (), 5,
3621                                       &dst_string, type);
3622   ASSERT_TRUE (result);
3623   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3624   free (const_cast <unsigned char *> (dst_string.text));
3625
3626   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3627   test.m_concats.record_string_concatenation (5, input_locs);
3628
3629   location_t initial_loc = input_locs[0];
3630
3631   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3632      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3633      and expect get_source_range_for_substring to fail.
3634      However, for a string concatenation test, we can have a case
3635      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3636      but subsequent strings can be after it.
3637      Attempting to detect this within assert_char_at_range
3638      would overcomplicate the logic for the common test cases, so
3639      we detect it here.  */
3640   if (should_have_column_data_p (input_locs[0])
3641       && !should_have_column_data_p (input_locs[4]))
3642     {
3643       /* Verify that get_source_range_for_substring gracefully rejects
3644          this case.  */
3645       source_range actual_range;
3646       const char *err
3647         = get_source_range_for_char (test.m_parser, test.m_file_cache,
3648                                      &test.m_concats,
3649                                      initial_loc, type, 0, &actual_range);
3650       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3651       return;
3652     }
3653
3654   for (int i = 0; i < 5; i++)
3655     for (int j = 0; j < 2; j++)
3656       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3657                             i + 1, 10 + j, 10 + j);
3658
3659   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3660   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3661
3662   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3663 }
3664
3665 /* Another test of string literal concatenation, this time combined with
3666    various kinds of escaped characters.  */
3667
3668 static void
3669 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3670 {
3671   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3672      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3673   const char *content
3674     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3675        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3676     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3677   lexer_test test (case_, content, NULL);
3678
3679   auto_vec <cpp_string> input_strings;
3680   location_t input_locs[4];
3681
3682   /* Verify that we get the expected tokens back.  */
3683   for (int i = 0; i < 4; i++)
3684     {
3685       const cpp_token *tok = test.get_token ();
3686       ASSERT_EQ (tok->type, CPP_STRING);
3687       input_strings.safe_push (tok->val.str);
3688       input_locs[i] = tok->src_loc;
3689     }
3690
3691   /* Verify that cpp_interpret_string works.  */
3692   cpp_string dst_string;
3693   const enum cpp_ttype type = CPP_STRING;
3694   bool result = cpp_interpret_string (test.m_parser,
3695                                       input_strings.address (), 4,
3696                                       &dst_string, type);
3697   ASSERT_TRUE (result);
3698   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3699   free (const_cast <unsigned char *> (dst_string.text));
3700
3701   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3702   test.m_concats.record_string_concatenation (4, input_locs);
3703
3704   location_t initial_loc = input_locs[0];
3705
3706   for (int i = 0; i <= 4; i++)
3707     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3708   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3709   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3710   for (int i = 7; i <= 9; i++)
3711     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3712
3713   /* NUL-terminator should use the location of the final closing quote.  */
3714   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3715
3716   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3717 }
3718
3719 /* Test of string literal in a macro.  */
3720
3721 static void
3722 test_lexer_string_locations_macro (const line_table_case &case_)
3723 {
3724   /* Digits 0-9.
3725      .....................0000000001111111111.22222222223.
3726      .....................1234567890123456789.01234567890.  */
3727   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3728                          "  MACRO");
3729   lexer_test test (case_, content, NULL);
3730
3731   /* Verify that we get the expected tokens back.  */
3732   const cpp_token *tok = test.get_token ();
3733   ASSERT_EQ (tok->type, CPP_PADDING);
3734
3735   tok = test.get_token ();
3736   ASSERT_EQ (tok->type, CPP_STRING);
3737   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3738
3739   /* Verify ranges of individual characters.  We ought to
3740      see columns within the macro definition.  */
3741   for (int i = 0; i <= 10; i++)
3742     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3743                           i, 1, 20 + i, 20 + i);
3744
3745   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3746
3747   tok = test.get_token ();
3748   ASSERT_EQ (tok->type, CPP_PADDING);
3749 }
3750
3751 /* Test of stringification of a macro argument.  */
3752
3753 static void
3754 test_lexer_string_locations_stringified_macro_argument
3755   (const line_table_case &case_)
3756 {
3757   /* .....................000000000111111111122222222223.
3758      .....................123456789012345678901234567890.  */
3759   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3760                          "MACRO(foo)\n");
3761   lexer_test test (case_, content, NULL);
3762
3763   /* Verify that we get the expected token back.  */
3764   const cpp_token *tok = test.get_token ();
3765   ASSERT_EQ (tok->type, CPP_PADDING);
3766
3767   tok = test.get_token ();
3768   ASSERT_EQ (tok->type, CPP_STRING);
3769   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3770
3771   /* We don't support getting the location of a stringified macro
3772      argument.  Verify that it fails gracefully.  */
3773   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3774                                   "cpp_interpret_string_1 failed");
3775
3776   tok = test.get_token ();
3777   ASSERT_EQ (tok->type, CPP_PADDING);
3778
3779   tok = test.get_token ();
3780   ASSERT_EQ (tok->type, CPP_PADDING);
3781 }
3782
3783 /* Ensure that we are fail gracefully if something attempts to pass
3784    in a location that isn't a string literal token.  Seen on this code:
3785
3786      const char a[] = " %d ";
3787      __builtin_printf (a, 0.5);
3788                        ^
3789
3790    when c-format.cc erroneously used the indicated one-character
3791    location as the format string location, leading to a read past the
3792    end of a string buffer in cpp_interpret_string_1.  */
3793
3794 static void
3795 test_lexer_string_locations_non_string (const line_table_case &case_)
3796 {
3797   /* .....................000000000111111111122222222223.
3798      .....................123456789012345678901234567890.  */
3799   const char *content = ("         a\n");
3800   lexer_test test (case_, content, NULL);
3801
3802   /* Verify that we get the expected token back.  */
3803   const cpp_token *tok = test.get_token ();
3804   ASSERT_EQ (tok->type, CPP_NAME);
3805   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3806
3807   /* At this point, libcpp is attempting to interpret the name as a
3808      string literal, despite it not starting with a quote.  We don't detect
3809      that, but we should at least fail gracefully.  */
3810   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3811                                   "cpp_interpret_string_1 failed");
3812 }
3813
3814 /* Ensure that we can read substring information for a token which
3815    starts in one linemap and ends in another .  Adapted from
3816    gcc.dg/cpp/pr69985.c.  */
3817
3818 static void
3819 test_lexer_string_locations_long_line (const line_table_case &case_)
3820 {
3821   /* .....................000000.000111111111
3822      .....................123456.789012346789.  */
3823   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3824                          "     \"0123456789012345678901234567890123456789"
3825                          "0123456789012345678901234567890123456789"
3826                          "0123456789012345678901234567890123456789"
3827                          "0123456789\"\n");
3828
3829   lexer_test test (case_, content, NULL);
3830
3831   /* Verify that we get the expected token back.  */
3832   const cpp_token *tok = test.get_token ();
3833   ASSERT_EQ (tok->type, CPP_STRING);
3834
3835   if (!should_have_column_data_p (line_table->highest_location))
3836     return;
3837
3838   /* Verify ranges of individual characters.  */
3839   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3840   for (int i = 0; i < 131; i++)
3841     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3842                           i, 2, 7 + i, 7 + i);
3843 }
3844
3845 /* Test of locations within a raw string that doesn't contain a newline.  */
3846
3847 static void
3848 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3849 {
3850   /* .....................00.0000000111111111122.
3851      .....................12.3456789012345678901.  */
3852   const char *content = ("R\"foo(0123456789)foo\"\n");
3853   lexer_test test (case_, content, NULL);
3854
3855   /* Verify that we get the expected token back.  */
3856   const cpp_token *tok = test.get_token ();
3857   ASSERT_EQ (tok->type, CPP_STRING);
3858
3859   /* Verify that cpp_interpret_string works.  */
3860   cpp_string dst_string;
3861   const enum cpp_ttype type = CPP_STRING;
3862   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3863                                       &dst_string, type);
3864   ASSERT_TRUE (result);
3865   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3866   free (const_cast <unsigned char *> (dst_string.text));
3867
3868   if (!should_have_column_data_p (line_table->highest_location))
3869     return;
3870
3871   /* 0-9, plus the nil terminator.  */
3872   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3873   for (int i = 0; i < 11; i++)
3874     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3875                           i, 1, 7 + i, 7 + i);
3876 }
3877
3878 /* Test of locations within a raw string that contains a newline.  */
3879
3880 static void
3881 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3882 {
3883   /* .....................00.0000.
3884      .....................12.3456.  */
3885   const char *content = ("R\"foo(\n"
3886   /* .....................00000.
3887      .....................12345.  */
3888                          "hello\n"
3889                          "world\n"
3890   /* .....................00000.
3891      .....................12345.  */
3892                          ")foo\"\n");
3893   lexer_test test (case_, content, NULL);
3894
3895   /* Verify that we get the expected token back.  */
3896   const cpp_token *tok = test.get_token ();
3897   ASSERT_EQ (tok->type, CPP_STRING);
3898
3899   /* Verify that cpp_interpret_string works.  */
3900   cpp_string dst_string;
3901   const enum cpp_ttype type = CPP_STRING;
3902   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3903                                       &dst_string, type);
3904   ASSERT_TRUE (result);
3905   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3906   free (const_cast <unsigned char *> (dst_string.text));
3907
3908   if (!should_have_column_data_p (line_table->highest_location))
3909     return;
3910
3911   /* Currently we don't support locations within raw strings that
3912      contain newlines.  */
3913   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3914                                   "range endpoints are on different lines");
3915 }
3916
3917 /* Test of parsing an unterminated raw string.  */
3918
3919 static void
3920 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3921 {
3922   const char *content = "R\"ouch()ouCh\" /* etc */";
3923
3924   lexer_diagnostic_sink diagnostics;
3925   lexer_test test (case_, content, &diagnostics);
3926   test.m_implicitly_expect_EOF = false;
3927
3928   /* Attempt to parse the raw string.  */
3929   const cpp_token *tok = test.get_token ();
3930   ASSERT_EQ (tok->type, CPP_EOF);
3931
3932   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3933   /* We expect the message "unterminated raw string"
3934      in the "cpplib" translation domain.
3935      It's not clear that dgettext is available on all supported hosts,
3936      so this assertion is commented-out for now.
3937        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3938                      diagnostics.m_diagnostics[0]);
3939   */
3940 }
3941
3942 /* Test of lexing char constants.  */
3943
3944 static void
3945 test_lexer_char_constants (const line_table_case &case_)
3946 {
3947   /* Various char constants.
3948      .....................0000000001111111111.22222222223.
3949      .....................1234567890123456789.01234567890.  */
3950   const char *content = ("         'a'\n"
3951                          "        u'a'\n"
3952                          "        U'a'\n"
3953                          "        L'a'\n"
3954                          "         'abc'\n");
3955   lexer_test test (case_, content, NULL);
3956
3957   /* Verify that we get the expected tokens back.  */
3958   /* 'a'.  */
3959   const cpp_token *tok = test.get_token ();
3960   ASSERT_EQ (tok->type, CPP_CHAR);
3961   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3962
3963   unsigned int chars_seen;
3964   int unsignedp;
3965   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3966                                           &chars_seen, &unsignedp);
3967   ASSERT_EQ (cc, 'a');
3968   ASSERT_EQ (chars_seen, 1);
3969
3970   /* u'a'.  */
3971   tok = test.get_token ();
3972   ASSERT_EQ (tok->type, CPP_CHAR16);
3973   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3974
3975   /* U'a'.  */
3976   tok = test.get_token ();
3977   ASSERT_EQ (tok->type, CPP_CHAR32);
3978   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3979
3980   /* L'a'.  */
3981   tok = test.get_token ();
3982   ASSERT_EQ (tok->type, CPP_WCHAR);
3983   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3984
3985   /* 'abc' (c-char-sequence).  */
3986   tok = test.get_token ();
3987   ASSERT_EQ (tok->type, CPP_CHAR);
3988   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3989 }
3990 /* A table of interesting location_t values, giving one axis of our test
3991    matrix.  */
3992
3993 static const location_t boundary_locations[] = {
3994   /* Zero means "don't override the default values for a new line_table".  */
3995   0,
3996
3997   /* An arbitrary non-zero value that isn't close to one of
3998      the boundary values below.  */
3999   0x10000,
4000
4001   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
4002   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
4003   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
4004   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
4005   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
4006   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
4007
4008   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
4009   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x200,
4010   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
4011   LINE_MAP_MAX_LOCATION_WITH_COLS,
4012   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
4013   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x200,
4014 };
4015
4016 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
4017
4018 void
4019 for_each_line_table_case (void (*testcase) (const line_table_case &))
4020 {
4021   /* As noted above in the description of struct line_table_case,
4022      we want to explore a test matrix of interesting line_table
4023      situations, running various selftests for each case within the
4024      matrix.  */
4025
4026   /* Run all tests with:
4027      (a) line_table->default_range_bits == 0, and
4028      (b) line_table->default_range_bits == line_map_suggested_range_bits.  */
4029
4030   for (int default_range_bits: {0, line_map_suggested_range_bits})
4031     {
4032       /* ...and use each of the "interesting" location values as
4033          the starting location within line_table.  */
4034       const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
4035       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
4036         {
4037           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
4038           testcase (c);
4039         }
4040     }
4041 }
4042
4043 /* Verify that when presented with a consecutive pair of locations with
4044    a very large line offset, we don't attempt to consolidate them into
4045    a single ordinary linemap where the line offsets within the line map
4046    would lead to overflow (PR lto/88147).  */
4047
4048 static void
4049 test_line_offset_overflow ()
4050 {
4051   line_table_test ltt (line_table_case (5, 0));
4052
4053   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
4054   linemap_line_start (line_table, 1, 100);
4055   location_t loc_a = linemap_line_start (line_table, 2578, 255);
4056   assert_loceq ("foo.c", 2578, 0, loc_a);
4057
4058   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
4059   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
4060   ASSERT_EQ (ordmap_a->m_range_bits, 5);
4061
4062   location_t loc_b = linemap_line_start (line_table, 404198, 512);
4063   assert_loceq ("foo.c", 404198, 0, loc_b);
4064
4065   /* We should have started a new linemap, rather than attempting to store
4066      a very large line offset.  */
4067   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
4068   ASSERT_NE (ordmap_a, ordmap_b);
4069 }
4070
4071 void test_cpp_utf8 ()
4072 {
4073   const int def_tabstop = 8;
4074   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
4075
4076   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
4077   {
4078     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
4079     ASSERT_EQ (8, w_bad);
4080     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
4081     ASSERT_EQ (5, w_ctrl);
4082   }
4083
4084   /* Verify that wcwidth of valid UTF-8 is as expected.  */
4085   {
4086     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
4087     ASSERT_EQ (1, w_pi);
4088     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
4089     ASSERT_EQ (2, w_emoji);
4090     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
4091                                                         policy);
4092     ASSERT_EQ (1, w_umlaut_precomposed);
4093     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
4094                                                       policy);
4095     ASSERT_EQ (1, w_umlaut_combining);
4096     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
4097     ASSERT_EQ (2, w_han);
4098     const int w_ascii = cpp_display_width ("GCC", 3, policy);
4099     ASSERT_EQ (3, w_ascii);
4100     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
4101                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
4102                                            24, policy);
4103     ASSERT_EQ (18, w_mixed);
4104   }
4105
4106   /* Verify that display width properly expands tabs.  */
4107   {
4108     const char *tstr = "\tabc\td";
4109     ASSERT_EQ (6, cpp_display_width (tstr, 6,
4110                                      cpp_char_column_policy (1, cpp_wcwidth)));
4111     ASSERT_EQ (10, cpp_display_width (tstr, 6,
4112                                       cpp_char_column_policy (3, cpp_wcwidth)));
4113     ASSERT_EQ (17, cpp_display_width (tstr, 6,
4114                                       cpp_char_column_policy (8, cpp_wcwidth)));
4115     ASSERT_EQ (1,
4116                cpp_display_column_to_byte_column
4117                  (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
4118   }
4119
4120   /* Verify that cpp_byte_column_to_display_column can go past the end,
4121      and similar edge cases.  */
4122   {
4123     const char *str
4124       /* Display columns.
4125          111111112345  */
4126       = "\xcf\x80 abc";
4127       /* 111122223456
4128          Byte columns.  */
4129
4130     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
4131     ASSERT_EQ (105,
4132                cpp_byte_column_to_display_column (str, 6, 106, policy));
4133     ASSERT_EQ (10000,
4134                cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4135     ASSERT_EQ (0,
4136                cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4137   }
4138
4139   /* Verify that cpp_display_column_to_byte_column can go past the end,
4140      and similar edge cases, and check invertibility.  */
4141   {
4142     const char *str
4143       /* Display columns.
4144          000000000000000000000000000000000000011
4145          111111112222222234444444455555555678901  */
4146       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4147       /* 000000000000000000000000000000000111111
4148          111122223333444456666777788889999012345
4149          Byte columns.  */
4150     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4151     ASSERT_EQ (15,
4152                cpp_display_column_to_byte_column (str, 15, 11, policy));
4153     ASSERT_EQ (115,
4154                cpp_display_column_to_byte_column (str, 15, 111, policy));
4155     ASSERT_EQ (10000,
4156                cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4157     ASSERT_EQ (0,
4158                cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4159
4160     /* Verify that we do not interrupt a UTF-8 sequence.  */
4161     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4162
4163     for (int byte_col = 1; byte_col <= 15; ++byte_col)
4164       {
4165         const int disp_col
4166           = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4167         const int byte_col2
4168           = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4169
4170         /* If we ask for the display column in the middle of a UTF-8
4171            sequence, it will return the length of the partial sequence,
4172            matching the behavior of GCC before display column support.
4173            Otherwise check the round trip was successful.  */
4174         if (byte_col < 4)
4175           ASSERT_EQ (byte_col, disp_col);
4176         else if (byte_col >= 6 && byte_col < 9)
4177           ASSERT_EQ (3 + (byte_col - 5), disp_col);
4178         else
4179           ASSERT_EQ (byte_col2, byte_col);
4180       }
4181   }
4182 }
4183
4184 static bool
4185 check_cpp_valid_utf8_p (const char *str)
4186 {
4187   return cpp_valid_utf8_p (str, strlen (str));
4188 }
4189
4190 /* Check that cpp_valid_utf8_p works as expected.  */
4191
4192 static void
4193 test_cpp_valid_utf8_p ()
4194 {
4195   ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4196
4197   /* 2-byte char (pi).  */
4198   ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4199
4200   /* 3-byte chars (the Japanese word "mojibake").  */
4201   ASSERT_TRUE (check_cpp_valid_utf8_p
4202                (
4203                 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4204                    UTF-8: 0xE6 0x96 0x87
4205                    C octal escaped UTF-8: \346\226\207.  */
4206                 "\346\226\207"
4207                 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4208                    UTF-8: 0xE5 0xAD 0x97
4209                    C octal escaped UTF-8: \345\255\227.  */
4210                 "\345\255\227"
4211                 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4212                    UTF-8: 0xE5 0x8C 0x96
4213                    C octal escaped UTF-8: \345\214\226.  */
4214                 "\345\214\226"
4215                 /* U+3051 HIRAGANA LETTER KE
4216                    UTF-8: 0xE3 0x81 0x91
4217                    C octal escaped UTF-8: \343\201\221.  */
4218                 "\343\201\221"));
4219
4220   /* 4-byte char: an emoji.  */
4221   ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4222
4223   /* Control codes, including the NUL byte.  */
4224   ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4225
4226   ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4227
4228   /* Unexpected continuation bytes.  */
4229   for (unsigned char continuation_byte = 0x80;
4230        continuation_byte <= 0xbf;
4231        continuation_byte++)
4232     ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4233
4234   /* "Lonely start characters" for 2-byte sequences.  */
4235   {
4236     unsigned char buf[2];
4237     buf[1] = ' ';
4238     for (buf[0] = 0xc0;
4239          buf[0] <= 0xdf;
4240          buf[0]++)
4241       ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4242   }
4243
4244   /* "Lonely start characters" for 3-byte sequences.  */
4245   {
4246     unsigned char buf[2];
4247     buf[1] = ' ';
4248     for (buf[0] = 0xe0;
4249          buf[0] <= 0xef;
4250          buf[0]++)
4251       ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4252   }
4253
4254   /* "Lonely start characters" for 4-byte sequences.  */
4255   {
4256     unsigned char buf[2];
4257     buf[1] = ' ';
4258     for (buf[0] = 0xf0;
4259          buf[0] <= 0xf4;
4260          buf[0]++)
4261       ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4262   }
4263
4264   /* Invalid start characters (formerly valid for 5-byte and 6-byte
4265      sequences).  */
4266   {
4267     unsigned char buf[2];
4268     buf[1] = ' ';
4269     for (buf[0] = 0xf5;
4270          buf[0] <= 0xfd;
4271          buf[0]++)
4272       ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4273   }
4274
4275   /* Impossible bytes.  */
4276   ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4277   ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4278   ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4279   ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4280 }
4281
4282 /* Run all of the selftests within this file.  */
4283
4284 void
4285 input_cc_tests ()
4286 {
4287   test_linenum_comparisons ();
4288   test_should_have_column_data_p ();
4289   test_unknown_location ();
4290   test_builtins ();
4291   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4292
4293   for_each_line_table_case (test_accessing_ordinary_linemaps);
4294   for_each_line_table_case (test_lexer);
4295   for_each_line_table_case (test_lexer_string_locations_simple);
4296   for_each_line_table_case (test_lexer_string_locations_ebcdic);
4297   for_each_line_table_case (test_lexer_string_locations_hex);
4298   for_each_line_table_case (test_lexer_string_locations_oct);
4299   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4300   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4301   for_each_line_table_case (test_lexer_string_locations_ucn4);
4302   for_each_line_table_case (test_lexer_string_locations_ucn8);
4303   for_each_line_table_case (test_lexer_string_locations_wide_string);
4304   for_each_line_table_case (test_lexer_string_locations_string16);
4305   for_each_line_table_case (test_lexer_string_locations_string32);
4306   for_each_line_table_case (test_lexer_string_locations_u8);
4307   for_each_line_table_case (test_lexer_string_locations_utf8_source);
4308   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4309   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4310   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4311   for_each_line_table_case (test_lexer_string_locations_macro);
4312   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4313   for_each_line_table_case (test_lexer_string_locations_non_string);
4314   for_each_line_table_case (test_lexer_string_locations_long_line);
4315   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4316   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4317   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4318   for_each_line_table_case (test_lexer_char_constants);
4319
4320   test_reading_source_line ();
4321   test_reading_source_buffer ();
4322
4323   test_line_offset_overflow ();
4324
4325   test_cpp_utf8 ();
4326   test_cpp_valid_utf8_p ();
4327 }
4328
4329 } // namespace selftest
4330
4331 #endif /* CHECKING_P */