src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2024 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25 #include "system.h"
  26 #include <regex.h>
  27 #include "argmatch.h"
  28 #include "c-ctype.h"
  29 #include "fadvise.h"
  30 #include "quote.h"
  31 #include "read-file.h"
  32 #include "stdio--.h"
  33 #include "xstrtol.h"
  34
  35 /* The official name of this program (e.g., no 'g' prefix).  */
  36 #define PROGRAM_NAME "ptx"
  37
  38 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
  39    if "ç" (c-with-cedilla) is available in the translation's character
  40    set and encoding.  */
  41 #define AUTHORS proper_name_lite ("F. Pinard", "Fran\xc3\xa7ois Pinard")
  42
  43 /* Number of possible characters in a byte.  */
  44 #define CHAR_SET_SIZE 256
  45
  46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
  47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
  48                      : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
  49 #define OCTTOBIN(C) ((C) - '0')
  50
  51 /* Debugging the memory allocator.  */
  52
  53 #if WITH_DMALLOC
  54 # define MALLOC_FUNC_CHECK 1
  55 # include <dmalloc.h>
  56 #endif
  57
  58 /* Global definitions.  */
  59
  60 /* FIXME: There are many unchecked integer overflows in this file,
  61    and in theory they could cause this command to have undefined
  62    behavior given large inputs or options.  This command should
  63    diagnose any such overflow and exit.  */
  64
  65 /* Program options.  */
  66
  67 enum Format
  68 {
  69   UNKNOWN_FORMAT,               /* output format still unknown */
  70   DUMB_FORMAT,                  /* output for a dumb terminal */
  71   ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  72   TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
  73 };
  74
  75 static bool gnu_extensions = true;      /* trigger all GNU extensions */
  76 static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
  77 static bool input_reference = false;    /* refs at beginning of input lines */
  78 static bool right_reference = false;    /* output refs after right context  */
  79 static idx_t line_width = 72;           /* output line width in characters */
  80 static idx_t gap_size = 3;      /* number of spaces between output fields */
  81 static char const *truncation_string = "/";
  82                                 /* string used to mark line truncations */
  83 static char const *macro_name = "xx";   /* macro name for roff or TeX output */
  84 static enum Format output_format = UNKNOWN_FORMAT;
  85                                 /* output format */
  86
  87 static bool ignore_case = false;        /* fold lower to upper for sorting */
  88 static char const *break_file = nullptr; /* name of the 'Break chars' file */
  89 static char const *only_file = nullptr; /* name of the 'Only words' file */
  90 static char const *ignore_file = nullptr; /* name of the 'Ignore words' file */
  91
  92 /* Options that use regular expressions.  */
  93 struct regex_data
  94 {
  95   /* The original regular expression, as a string.  */
  96   char const *string;
  97
  98   /* The compiled regular expression, and its fastmap.  */
  99   struct re_pattern_buffer pattern;
 100   char fastmap[UCHAR_MAX + 1];
 101 };
 102
 103 static struct regex_data context_regex; /* end of context */
 104 static struct regex_data word_regex;    /* keyword */
 105
 106 /* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
 107    whole file.  A WORD is similar, except it is intended for smaller regions.
 108    A WORD_TABLE may contain several WORDs.  */
 109
 110 typedef struct
 111   {
 112     char *start;                /* pointer to beginning of region */
 113     char *end;                  /* pointer to end + 1 of region */
 114   }
 115 BLOCK;
 116
 117 typedef struct
 118   {
 119     char *start;                /* pointer to beginning of region */
 120     idx_t size;                 /* length of the region */
 121   }
 122 WORD;
 123
 124 typedef struct
 125   {
 126     WORD *start;                /* array of WORDs */
 127     idx_t alloc;                /* allocated length */
 128     idx_t length;               /* number of used entries */
 129   }
 130 WORD_TABLE;
 131
 132 /* Pattern description tables.  */
 133
 134 /* For each character, provide its folded equivalent.  */
 135 static unsigned char folded_chars[CHAR_SET_SIZE];
 136
 137 /* End of context pattern register indices.  */
 138 static struct re_registers context_regs;
 139
 140 /* Keyword pattern register indices.  */
 141 static struct re_registers word_regs;
 142
 143 /* A word characters fastmap is used only when no word regexp has been
 144    provided.  A word is then made up of a sequence of one or more characters
 145    allowed by the fastmap.  Contains !0 if character allowed in word.  Not
 146    only this is faster in most cases, but it simplifies the implementation
 147    of the Break files.  */
 148 static char word_fastmap[CHAR_SET_SIZE];
 149
 150 /* Maximum length of any word read.  */
 151 static idx_t maximum_word_length;
 152
 153 /* Maximum width of any reference used.  */
 154 static idx_t reference_max_width;
 155
 156 /* Ignore and Only word tables.  */
 157
 158 static WORD_TABLE ignore_table; /* table of words to ignore */
 159 static WORD_TABLE only_table;           /* table of words to select */
 160
 161 /* Source text table, and scanning macros.  */
 162
 163 static int number_input_files;  /* number of text input files */
 164 static intmax_t total_line_count;       /* total number of lines seen so far */
 165 static char const **input_file_name;    /* array of text input file names */
 166 static intmax_t *file_line_count;       /* array of line count values at end */
 167
 168 static BLOCK *text_buffers;     /* files to study */
 169
/* Cursor-advancing helpers.  Each macro expands to a bare statement
   (it is not wrapped in 'do ... while (0)') and mentions its arguments
   several times, so CURSOR must be a side-effect-free lvalue, and the
   macro must not be used as the unbraced body of an 'if' that has an
   'else'.  */

/* SKIP_NON_WHITE used only for getting or skipping the reference.
   Advance CURSOR past characters for which isspace is false, stopping
   at LIMIT.  */

#define SKIP_NON_WHITE(cursor, limit) \
  while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
    cursor++

/* Advance CURSOR past white space characters, stopping at LIMIT.  */

#define SKIP_WHITE(cursor, limit) \
  while (cursor < limit && isspace (to_uchar (*cursor)))                \
    cursor++

/* Move CURSOR backwards while the character just before it is white
   space, without going below START.  */

#define SKIP_WHITE_BACKWARDS(cursor, start) \
  while (cursor > start && isspace (to_uchar (cursor[-1])))             \
    cursor--

/* Advance CURSOR over one "something".

   With a word regexp: try to match it at CURSOR within the
   LIMIT - CURSOR bytes available.  A result of -2 is reported through
   matcher_error; a result of -1 advances CURSOR by one character; any
   other result advances CURSOR by that many characters.  Note that a
   result of 0 leaves CURSOR where it was.

   Without a word regexp: if CURSOR points at a character allowed by
   word_fastmap, advance over the whole run of such characters, up to
   LIMIT; otherwise advance by one character.  This branch reads
   *CURSOR before comparing CURSOR with LIMIT, so the caller must make
   sure CURSOR < LIMIT beforehand.  */

#define SKIP_SOMETHING(cursor, limit) \
  if (word_regex.string)                                                \
    {                                                                   \
      regoff_t count;                                                   \
      count = re_match (&word_regex.pattern, cursor, limit - cursor,    \
                        0, nullptr);                                    \
      if (count == -2)                                                  \
        matcher_error ();                                               \
      cursor += count == -1 ? 1 : count;                                \
    }                                                                   \
  else if (word_fastmap[to_uchar (*cursor)])                            \
    while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
      cursor++;                                                         \
  else                                                                  \
    cursor++
 199
 200 /* Occurrences table.
 201
 202    The 'keyword' pointer provides the central word, which is surrounded
 203    by a left context and a right context.  The 'keyword' and 'length'
 204    field allow full 8-bit characters keys, even including NULs.  At other
 205    places in this program, the name 'keyafter' refers to the keyword
 206    followed by its right context.
 207
   The left context does not extend, towards the beginning of the file,
   further than a distance given by the 'left' value.  This value is
   relative to the keyword beginning; it is usually negative.  This
   ensures that, except for white space, we will never have to scan
   the source text backward when it is time to generate the final
   output lines.
 214
 215    The right context, indirectly attainable through the keyword end, does
 216    not extend, towards the end of the file, further than a distance given
 217    by the 'right' value.  This value is relative to the keyword
 218    beginning, it is usually positive.
 219
 220    When automatic references are used, the 'reference' value is the
 221    overall line number in all input files read so far, in this case, it
 222    is of type intmax_t.  When input references are used, the 'reference'
 223    value indicates the distance between the keyword beginning and the
 224    start of the reference field, and it fits in ptrdiff_t and is usually
 225    negative.  */
 226
 227 typedef struct
 228   {
 229     WORD key;                   /* description of the keyword */
 230     ptrdiff_t left;             /* distance to left context start */
 231     ptrdiff_t right;            /* distance to right context end */
 232     intmax_t reference;         /* reference descriptor */
 233     int file_index;             /* corresponding file  */
 234   }
 235 OCCURS;
 236
 237 /* The various OCCURS tables are indexed by the language.  But the time
 238    being, there is no such multiple language support.  */
 239
 240 static OCCURS *occurs_table[1]; /* all words retained from the read text */
 241 static idx_t occurs_alloc[1];   /* allocated size of occurs_table */
 242 static idx_t number_of_occurs[1]; /* number of used slots in occurs_table */
 243
 244
 245 /* Communication among output routines.  */
 246
 247 /* Indicate if special output processing is requested for each character.  */
 248 static char edited_flag[CHAR_SET_SIZE];
 249
 250 /* Half of line width, reference excluded.  */
 251 static idx_t half_line_width;
 252
 253 /* Maximum width of before field.
 254    FIXME: Is this nonnegative?  That is, should this be idx_t?  */
 255 static ptrdiff_t before_max_width;
 256
 257 /* Maximum width of keyword-and-after field.
 258    FIXME: Is this nonnegative?  That is, should this be idx_t?  */
 259 static ptrdiff_t keyafter_max_width;
 260
 261 /* Length of string that flags truncation.  */
 262 static idx_t truncation_string_length;
 263
 264 /* When context is limited by lines, wraparound may happen on final output:
 265    the 'head' pointer gives access to some supplementary left context which
 266    will be seen at the end of the output line, the 'tail' pointer gives
 267    access to some supplementary right context which will be seen at the
 268    beginning of the output line. */
 269
 270 static BLOCK tail;              /* tail field */
 271 static bool tail_truncation;    /* flag truncation after the tail field */
 272
 273 static BLOCK before;            /* before field */
 274 static bool before_truncation;  /* flag truncation before the before field */
 275
 276 static BLOCK keyafter;          /* keyword-and-after field */
 277 static bool keyafter_truncation; /* flag truncation after the keyafter field */
 278
 279 static BLOCK head;              /* head field */
 280 static bool head_truncation;    /* flag truncation before the head field */
 281
 282 static BLOCK reference;         /* reference field for input reference mode */
 283
 284 /* Miscellaneous routines.  */
 285
 286 /* Diagnose an error in the regular expression matcher.  Then exit.  */
 287
 288 static void
 289 matcher_error (void)
 290 {
 291   error (EXIT_FAILURE, errno, _("error in regular expression matcher"));
 292 }
 293
 294 /* Unescape STRING in-place.  */
 295
 296 static void
 297 unescape_string (char *string)
 298 {
 299   char *cursor;                 /* cursor in result */
 300   int value;                    /* value of \nnn escape */
 301   int length;                   /* length of \nnn escape */
 302
 303   cursor = string;
 304
 305   while (*string)
 306     {
 307       if (*string == '\\')
 308         {
 309           string++;
 310           switch (*string)
 311             {
 312             case 'x':           /* \xhhh escape, 3 chars maximum */
 313               value = 0;
 314               for (length = 0, string++;
 315                    length < 3 && c_isxdigit (to_uchar (*string));
 316                    length++, string++)
 317                 value = value * 16 + HEXTOBIN (*string);
 318               if (length == 0)
 319                 {
 320                   *cursor++ = '\\';
 321                   *cursor++ = 'x';
 322                 }
 323               else
 324                 *cursor++ = value;
 325               break;
 326
 327             case '0':           /* \0ooo escape, 3 chars maximum */
 328               value = 0;
 329               for (length = 0, string++;
 330                    length < 3 && ISODIGIT (*string);
 331                    length++, string++)
 332                 value = value * 8 + OCTTOBIN (*string);
 333               *cursor++ = value;
 334               break;
 335
 336             case 'a':           /* alert */
 337               *cursor++ = '\a';
 338               string++;
 339               break;
 340
 341             case 'b':           /* backspace */
 342               *cursor++ = '\b';
 343               string++;
 344               break;
 345
 346             case 'c':           /* cancel the rest of the output */
 347               while (*string)
 348                 string++;
 349               break;
 350
 351             case 'f':           /* form feed */
 352               *cursor++ = '\f';
 353               string++;
 354               break;
 355
 356             case 'n':           /* new line */
 357               *cursor++ = '\n';
 358               string++;
 359               break;
 360
 361             case 'r':           /* carriage return */
 362               *cursor++ = '\r';
 363               string++;
 364               break;
 365
 366             case 't':           /* horizontal tab */
 367               *cursor++ = '\t';
 368               string++;
 369               break;
 370
 371             case 'v':           /* vertical tab */
 372               *cursor++ = '\v';
 373               string++;
 374               break;
 375
 376             case '\0':          /* lone backslash at end of string */
 377               /* ignore it */
 378               break;
 379
 380             default:
 381               *cursor++ = '\\';
 382               *cursor++ = *string++;
 383               break;
 384             }
 385         }
 386       else
 387         *cursor++ = *string++;
 388     }
 389
 390   *cursor = '\0';
 391 }
 392
 393 /*--------------------------------------------------------------------------.
 394 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 395 `--------------------------------------------------------------------------*/
 396
 397 static void
 398 compile_regex (struct regex_data *regex)
 399 {
 400   struct re_pattern_buffer *pattern = &regex->pattern;
 401   char const *string = regex->string;
 402   char const *message;
 403
 404   pattern->buffer = nullptr;
 405   pattern->allocated = 0;
 406   pattern->fastmap = regex->fastmap;
 407   pattern->translate = ignore_case ? folded_chars : nullptr;
 408
 409   message = re_compile_pattern (string, strlen (string), pattern);
 410   if (message)
 411     error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 412
 413   /* The fastmap should be compiled before 're_match'.  The following
 414      call is not mandatory, because 're_search' is always called sooner,
 415      and it compiles the fastmap if this has not been done yet.  */
 416
 417   re_compile_fastmap (pattern);
 418 }
 419
 420 /*------------------------------------------------------------------------.
 421 | This will initialize various tables for pattern match and compiles some |
 422 | regexps.                                                                |
 423 `------------------------------------------------------------------------*/
 424
 425 static void
 426 initialize_regex (void)
 427 {
 428   int character;                /* character value */
 429
 430   /* Initialize the case folding table.  */
 431
 432   if (ignore_case)
 433     for (character = 0; character < CHAR_SET_SIZE; character++)
 434       folded_chars[character] = toupper (character);
 435
 436   /* Unless the user already provided a description of the end of line or
 437      end of sentence sequence, select an end of line sequence to compile.
 438      If the user provided an empty definition, thus disabling end of line
 439      or sentence feature, make it null to speed up tests.  If GNU
 440      extensions are enabled, use end of sentence like in GNU emacs.  If
 441      disabled, use end of lines.  */
 442
 443   if (context_regex.string)
 444     {
 445       if (!*context_regex.string)
 446         context_regex.string = nullptr;
 447     }
 448   else if (gnu_extensions && !input_reference)
 449     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 450   else
 451     context_regex.string = "\n";
 452
 453   if (context_regex.string)
 454     compile_regex (&context_regex);
 455
 456   /* If the user has already provided a non-empty regexp to describe
 457      words, compile it.  Else, unless this has already been done through
 458      a user provided Break character file, construct a fastmap of
 459      characters that may appear in a word.  If GNU extensions enabled,
 460      include only letters of the underlying character set.  If disabled,
 461      include almost everything, even punctuation; stop only on white
 462      space.  */
 463
 464   if (word_regex.string)
 465     compile_regex (&word_regex);
 466   else if (!break_file)
 467     {
 468       if (gnu_extensions)
 469         {
 470
 471           /* Simulate \w+.  */
 472
 473           for (character = 0; character < CHAR_SET_SIZE; character++)
 474             word_fastmap[character] = !! isalpha (character);
 475         }
 476       else
 477         {
 478
 479           /* Simulate [^ \t\n]+.  */
 480
 481           memset (word_fastmap, 1, CHAR_SET_SIZE);
 482           word_fastmap[' '] = 0;
 483           word_fastmap['\t'] = 0;
 484           word_fastmap['\n'] = 0;
 485         }
 486     }
 487 }
 488
 489 /*------------------------------------------------------------------------.
 490 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 491 | contiguous region of memory and return a description of it into BLOCK.  |
 492 | Standard input is assumed whenever FILE_NAME is null, empty or "-".     |
 493 |                                                                         |
 494 | Previously, in some cases, white space compression was attempted while  |
 495 | inputting text.  This was defeating some regexps like default end of    |
 496 | sentence, which checks for two consecutive spaces.  If white space      |
 497 | compression is ever reinstated, it should be in output routines.        |
 498 `------------------------------------------------------------------------*/
 499
 500 static void
 501 swallow_file_in_memory (char const *file_name, BLOCK *block)
 502 {
 503   size_t used_length;           /* used length in memory buffer */
 504
 505   /* As special cases, a file name which is null or "-" indicates standard
 506      input, which is already opened.  In all other cases, open the file from
 507      its name.  */
 508   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 509   if (using_stdin)
 510     block->start = fread_file (stdin, 0, &used_length);
 511   else
 512     block->start = read_file (file_name, 0, &used_length);
 513
 514   if (!block->start)
 515     error (EXIT_FAILURE, errno, "%s", quotef (using_stdin ? "-" : file_name));
 516
 517   if (using_stdin)
 518     clearerr (stdin);
 519
 520   block->end = block->start + used_length;
 521 }
 522
 523 /* Sort and search routines.  */
 524
 525 /*--------------------------------------------------------------------------.
 526 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 527 | Return less than 0 if the first word goes before the second; return       |
 528 | greater than 0 if the first word goes after the second.                   |
 529 |                                                                           |
 530 | If a word is indeed a prefix of the other, the shorter should go first.   |
 531 `--------------------------------------------------------------------------*/
 532
 533 static int
 534 compare_words (const void *void_first, const void *void_second)
 535 {
 536   WORD const *first = void_first;
 537   WORD const *second = void_second;
 538   idx_t length = MIN (first->size, second->size);
 539
 540   if (ignore_case)
 541     {
 542       for (idx_t counter = 0; counter < length; counter++)
 543         {
 544           int value = (folded_chars[to_uchar (first->start[counter])]
 545                        - folded_chars[to_uchar (second->start[counter])]);
 546           if (value != 0)
 547             return value;
 548         }
 549     }
 550   else
 551     {
 552       for (idx_t counter = 0; counter < length; counter++)
 553         {
 554           int value = (to_uchar (first->start[counter])
 555                        - to_uchar (second->start[counter]));
 556           if (value != 0)
 557             return value;
 558         }
 559     }
 560
 561   return (first->size > second->size) - (first->size < second->size);
 562 }
 563
 564 /*-----------------------------------------------------------------------.
 565 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 566 | go first.  In case of a tie, preserve the original order through a     |
 567 | pointer comparison.                                                    |
 568 `-----------------------------------------------------------------------*/
 569
 570 static int
 571 compare_occurs (const void *void_first, const void *void_second)
 572 {
 573 #define first ((const OCCURS *) void_first)
 574 #define second ((const OCCURS *) void_second)
 575   int value;
 576
 577   value = compare_words (&first->key, &second->key);
 578   return (value ? value
 579           : ((first->key.start > second->key.start)
 580              - (first->key.start < second->key.start)));
 581 #undef first
 582 #undef second
 583 }
 584
 585 /* True if WORD appears in TABLE.  Uses a binary search.  */
 586
 587 ATTRIBUTE_PURE
 588 static bool
 589 search_table (WORD *word, WORD_TABLE *table)
 590 {
 591   idx_t lo = 0;
 592   idx_t hi = table->length;
 593   while (lo < hi)
 594     {
 595       idx_t middle = (lo >> 1) + (hi >> 1) + (lo & hi & 1);
 596       int value = compare_words (word, table->start + middle);
 597       if (value < 0)
 598         hi = middle;
 599       else if (value > 0)
 600         lo = middle + 1;
 601       else
 602         return true;
 603     }
 604   return false;
 605 }
 606
 607 /*---------------------------------------------------------------------.
 608 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 609 | take intermediate copies or table elements, so the sort will be      |
 610 | stabilized throughout the comparison routine.                        |
 611 `---------------------------------------------------------------------*/
 612
 613 static void
 614 sort_found_occurs (void)
 615 {
 616
 617   /* Only one language for the time being.  */
 618   if (number_of_occurs[0])
 619     qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 620            compare_occurs);
 621 }
 622
 623 /* Parameter files reading routines.  */
 624
 625 /*----------------------------------------------------------------------.
 626 | Read a file named FILE_NAME, containing a set of break characters.    |
 627 | Build a content to the array word_fastmap in which all characters are |
 628 | allowed except those found in the file.  Characters may be repeated.  |
 629 `----------------------------------------------------------------------*/
 630
 631 static void
 632 digest_break_file (char const *file_name)
 633 {
 634   BLOCK file_contents;          /* to receive a copy of the file */
 635   char *cursor;                 /* cursor in file copy */
 636
 637   swallow_file_in_memory (file_name, &file_contents);
 638
 639   /* Make the fastmap and record the file contents in it.  */
 640
 641   memset (word_fastmap, 1, CHAR_SET_SIZE);
 642   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 643     word_fastmap[to_uchar (*cursor)] = 0;
 644
 645   if (!gnu_extensions)
 646     {
 647
 648       /* If GNU extensions are enabled, the only way to avoid newline as
 649          a break character is to write all the break characters in the
 650          file with no newline at all, not even at the end of the file.
 651          If disabled, spaces, tabs and newlines are always considered as
 652          break characters even if not included in the break file.  */
 653
 654       word_fastmap[' '] = 0;
 655       word_fastmap['\t'] = 0;
 656       word_fastmap['\n'] = 0;
 657     }
 658
 659   /* Return the space of the file, which is no more required.  */
 660
 661   free (file_contents.start);
 662 }
 663
 664 /*-----------------------------------------------------------------------.
 665 | Read a file named FILE_NAME, containing one word per line, then        |
 666 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 667 | swallows the whole file in memory; this is at the expense of space     |
 668 | needed for newlines, which are useless; however, the reading is fast.  |
 669 `-----------------------------------------------------------------------*/
 670
 671 static void
 672 digest_word_file (char const *file_name, WORD_TABLE *table)
 673 {
 674   BLOCK file_contents;          /* to receive a copy of the file */
 675   char *cursor;                 /* cursor in file copy */
 676   char *word_start;             /* start of the current word */
 677
 678   swallow_file_in_memory (file_name, &file_contents);
 679
 680   table->start = nullptr;
 681   table->alloc = 0;
 682   table->length = 0;
 683
 684   /* Read the whole file.  */
 685
 686   cursor = file_contents.start;
 687   while (cursor < file_contents.end)
 688     {
 689
 690       /* Read one line, and save the word in contains.  */
 691
 692       word_start = cursor;
 693       while (cursor < file_contents.end && *cursor != '\n')
 694         cursor++;
 695
 696       /* Record the word in table if it is not empty.  */
 697
 698       if (cursor > word_start)
 699         {
 700           if (table->length == table->alloc)
 701             table->start = xpalloc (table->start, &table->alloc, 1, -1,
 702                                     sizeof *table->start);
 703           table->start[table->length].start = word_start;
 704           table->start[table->length].size = cursor - word_start;
 705           table->length++;
 706         }
 707
 708       /* This test allows for an incomplete line at end of file.  */
 709
 710       if (cursor < file_contents.end)
 711         cursor++;
 712     }
 713
 714   /* Finally, sort all the words read.  */
 715
 716   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 717 }
 718
 719 /* Keyword recognition and selection.  */
 720
 721 /*----------------------------------------------------------------------.
 722 | For each keyword in the source text, constructs an OCCURS structure.  |
 723 `----------------------------------------------------------------------*/
 724
 725 static void
 726 find_occurs_in_text (int file_index)
 727 {
 728   char *cursor;                 /* for scanning the source text */
 729   char *scan;                   /* for scanning the source text also */
 730   char *line_start;             /* start of the current input line */
 731   char *line_scan;              /* newlines scanned until this point */
 732   idx_t reference_length;       /* length of reference in input mode */
 733   WORD possible_key;            /* possible key, to ease searches */
 734   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 735
 736   char *context_start;          /* start of left context */
 737   char *context_end;            /* end of right context */
 738   char *word_start;             /* start of word */
 739   char *word_end;               /* end of word */
 740   char *next_context_start;     /* next start of left context */
 741
 742   const BLOCK *text_buffer = &text_buffers[file_index];
 743
 744   /* reference_length is always used within 'if (input_reference)'.
 745      However, GNU C diagnoses that it may be used uninitialized.  The
 746      following assignment is merely to shut it up.  */
 747
 748   reference_length = 0;
 749
 750   /* Tracking where lines start is helpful for reference processing.  In
 751      auto reference mode, this allows counting lines.  In input reference
 752      mode, this permits finding the beginning of the references.
 753
 754      The first line begins with the file, skip immediately this very first
 755      reference in input reference mode, to help further rejection any word
 756      found inside it.  Also, unconditionally assigning these variable has
 757      the happy effect of shutting up lint.  */
 758
 759   line_start = text_buffer->start;
 760   line_scan = line_start;
 761   if (input_reference)
 762     {
 763       SKIP_NON_WHITE (line_scan, text_buffer->end);
 764       reference_length = line_scan - line_start;
 765       SKIP_WHITE (line_scan, text_buffer->end);
 766     }
 767
 768   /* Process the whole buffer, one line or one sentence at a time.  */
 769
 770   for (cursor = text_buffer->start;
 771        cursor < text_buffer->end;
 772        cursor = next_context_start)
 773     {
 774
 775       /* 'context_start' gets initialized before the processing of each
 776          line, or once for the whole buffer if no end of line or sentence
 777          sequence separator.  */
 778
 779       context_start = cursor;
 780
 781       /* If an end of line or end of sentence sequence is defined and
 782          non-empty, 'next_context_start' will be recomputed to be the end of
 783          each line or sentence, before each one is processed.  If no such
 784          sequence, then 'next_context_start' is set at the end of the whole
 785          buffer, which is then considered to be a single line or sentence.
 786          This test also accounts for the case of an incomplete line or
 787          sentence at the end of the buffer.  */
 788
 789       next_context_start = text_buffer->end;
 790       if (context_regex.string)
 791         switch (re_search (&context_regex.pattern, cursor,
 792                            text_buffer->end - cursor,
 793                            0, text_buffer->end - cursor, &context_regs))
 794           {
 795           case -2:
 796             matcher_error ();
 797
 798           case -1:
 799             break;
 800
 801           case 0:
 802             error (EXIT_FAILURE, 0,
 803                    _("error: regular expression has a match of length zero:"
 804                      " %s"),
 805                    quote (context_regex.string));
 806
 807           default:
 808             next_context_start = cursor + context_regs.end[0];
 809             break;
 810           }
 811
 812       /* Include the separator into the right context, but not any suffix
 813          white space in this separator; this insures it will be seen in
 814          output and will not take more space than necessary.  */
 815
 816       context_end = next_context_start;
 817       SKIP_WHITE_BACKWARDS (context_end, context_start);
 818
 819       /* Read and process a single input line or sentence, one word at a
 820          time.  */
 821
 822       while (true)
 823         {
 824           if (word_regex.string)
 825
 826             /* If a word regexp has been compiled, use it to skip at the
 827                beginning of the next word.  If there is no such word, exit
 828                the loop.  */
 829
 830             {
 831               regoff_t r = re_search (&word_regex.pattern, cursor,
 832                                       context_end - cursor,
 833                                       0, context_end - cursor, &word_regs);
 834               if (r == -2)
 835                 matcher_error ();
 836               if (r == -1)
 837                 break;
 838               word_start = cursor + word_regs.start[0];
 839               word_end = cursor + word_regs.end[0];
 840             }
 841           else
 842
 843             /* Avoid re_search and use the fastmap to skip to the
 844                beginning of the next word.  If there is no more word in
 845                the buffer, exit the loop.  */
 846
 847             {
 848               scan = cursor;
 849               while (scan < context_end
 850                      && !word_fastmap[to_uchar (*scan)])
 851                 scan++;
 852
 853               if (scan == context_end)
 854                 break;
 855
 856               word_start = scan;
 857
 858               while (scan < context_end
 859                      && word_fastmap[to_uchar (*scan)])
 860                 scan++;
 861
 862               word_end = scan;
 863             }
 864
 865           /* Skip right to the beginning of the found word.  */
 866
 867           cursor = word_start;
 868
 869           /* Skip any zero length word.  Just advance a single position,
 870              then go fetch the next word.  */
 871
 872           if (word_end == word_start)
 873             {
 874               cursor++;
 875               continue;
 876             }
 877
 878           /* This is a genuine, non empty word, so save it as a possible
 879              key.  Then skip over it.  Also, maintain the maximum length of
 880              all words read so far.  It is mandatory to take the maximum
 881              length of all words in the file, without considering if they
 882              are actually kept or rejected, because backward jumps at output
 883              generation time may fall in *any* word.  */
 884
 885           possible_key.start = cursor;
 886           possible_key.size = word_end - word_start;
 887           cursor += possible_key.size;
 888
 889           if (possible_key.size > maximum_word_length)
 890             maximum_word_length = possible_key.size;
 891
 892           /* In input reference mode, update 'line_start' from its previous
 893              value.  Count the lines just in case auto reference mode is
 894              also selected. If it happens that the word just matched is
 895              indeed part of a reference; just ignore it.  */
 896
 897           if (input_reference)
 898             {
 899               while (line_scan < possible_key.start)
 900                 if (*line_scan == '\n')
 901                   {
 902                     total_line_count++;
 903                     line_scan++;
 904                     line_start = line_scan;
 905                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 906                     reference_length = line_scan - line_start;
 907                   }
 908                 else
 909                   line_scan++;
 910               if (line_scan > possible_key.start)
 911                 continue;
 912             }
 913
 914           /* Ignore the word if an 'Ignore words' table exists and if it is
 915              part of it.  Also ignore the word if an 'Only words' table and
 916              if it is *not* part of it.
 917
 918              It is allowed that both tables be used at once, even if this
 919              may look strange for now.  Just ignore a word that would appear
 920              in both.  If regexps are eventually implemented for these
 921              tables, the Ignore table could then reject words that would
 922              have been previously accepted by the Only table.  */
 923
 924           if (ignore_file && search_table (&possible_key, &ignore_table))
 925             continue;
 926           if (only_file && !search_table (&possible_key, &only_table))
 927             continue;
 928
 929           /* A non-empty word has been found.  First of all, insure
 930              proper allocation of the next OCCURS, and make a pointer to
 931              where it will be constructed.  */
 932
 933           if (number_of_occurs[0] == occurs_alloc[0])
 934             occurs_table[0] = xpalloc (occurs_table[0], &occurs_alloc[0],
 935                                        1, -1, sizeof *occurs_table[0]);
 936           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 937
 938           /* Define the reference field, if any.  */
 939
 940           if (auto_reference)
 941             {
 942
 943               /* While auto referencing, update 'line_start' from its
 944                  previous value, counting lines as we go.  If input
 945                  referencing at the same time, 'line_start' has been
 946                  advanced earlier, and the following loop is never really
 947                  executed.  */
 948
 949               while (line_scan < possible_key.start)
 950                 if (*line_scan == '\n')
 951                   {
 952                     total_line_count++;
 953                     line_scan++;
 954                     line_start = line_scan;
 955                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 956                   }
 957                 else
 958                   line_scan++;
 959
 960               occurs_cursor->reference = total_line_count;
 961             }
 962           else if (input_reference)
 963             {
 964
 965               /* If only input referencing, 'line_start' has been computed
 966                  earlier to detect the case the word matched would be part
 967                  of the reference.  The reference position is simply the
 968                  value of 'line_start'.  */
 969
 970               occurs_cursor->reference = line_start - possible_key.start;
 971               if (reference_length > reference_max_width)
 972                 reference_max_width = reference_length;
 973             }
 974
 975           /* Exclude the reference from the context in simple cases.  */
 976
 977           if (input_reference && line_start == context_start)
 978             {
 979               SKIP_NON_WHITE (context_start, context_end);
 980               SKIP_WHITE (context_start, context_end);
 981             }
 982
 983           /* Completes the OCCURS structure.  */
 984
 985           occurs_cursor->key = possible_key;
 986           occurs_cursor->left = context_start - possible_key.start;
 987           occurs_cursor->right = context_end - possible_key.start;
 988           occurs_cursor->file_index = file_index;
 989
 990           number_of_occurs[0]++;
 991         }
 992     }
 993 }
 994
 995 /* Formatting and actual output - service routines.  */
 996
 997 /*-----------------------------------------.
 998 | Prints some NUMBER of spaces on stdout.  |
 999 `-----------------------------------------*/
1000
1001 static void
1002 print_spaces (ptrdiff_t number)
1003 {
1004   for (ptrdiff_t counter = number; counter > 0; counter--)
1005     putchar (' ');
1006 }
1007
1008 /*-------------------------------------.
1009 | Prints the field provided by FIELD.  |
1010 `-------------------------------------*/
1011
1012 static void
1013 print_field (BLOCK field)
1014 {
1015   char *cursor;                 /* Cursor in field to print */
1016
1017   /* Whitespace is not really compressed.  Instead, each white space
1018      character (tab, vt, ht etc.) is printed as one single space.  */
1019
1020   for (cursor = field.start; cursor < field.end; cursor++)
1021     {
1022       unsigned char character = *cursor;
1023       if (edited_flag[character])
1024         {
1025           /* Handle cases which are specific to 'roff' or TeX.  All
1026              white space processing is done as the default case of
1027              this switch.  */
1028
1029           switch (character)
1030             {
1031             case '"':
1032               /* In roff output format, double any quote.  */
1033               putchar ('"');
1034               putchar ('"');
1035               break;
1036
1037             case '$':
1038             case '%':
1039             case '&':
1040             case '#':
1041             case '_':
1042               /* In TeX output format, precede these with a backslash.  */
1043               putchar ('\\');
1044               putchar (character);
1045               break;
1046
1047             case '{':
1048             case '}':
1049               /* In TeX output format, precede these with a backslash and
1050                  force mathematical mode.  */
1051               printf ("$\\%c$", character);
1052               break;
1053
1054             case '\\':
1055               /* In TeX output mode, request production of a backslash.  */
1056               fputs ("\\backslash{}", stdout);
1057               break;
1058
1059             default:
1060               /* Any other flagged character produces a single space.  */
1061               putchar (' ');
1062             }
1063         }
1064       else
1065         putchar (*cursor);
1066     }
1067 }
1068
1069 /* Formatting and actual output - planning routines.  */
1070
1071 /*--------------------------------------------------------------------.
1072 | From information collected from command line options and input file |
1073 | readings, compute and fix some output parameter values.             |
1074 `--------------------------------------------------------------------*/
1075
1076 static void
1077 fix_output_parameters (void)
1078 {
1079   /* In auto reference mode, the maximum width of this field is
1080      precomputed and subtracted from the overall line width.  Add one for
1081      the column which separate the file name from the line number.  */
1082
1083   if (auto_reference)
1084     {
1085       reference_max_width = 0;
1086       for (int file_index = 0; file_index < number_input_files; file_index++)
1087         {
1088           intmax_t line_ordinal = file_line_count[file_index] + 1;
1089           if (file_index > 0)
1090             line_ordinal -= file_line_count[file_index - 1];
1091           char ordinal_string[INT_BUFSIZE_BOUND (intmax_t)];
1092           idx_t reference_width = sprintf (ordinal_string, "%jd", line_ordinal);
1093           if (input_file_name[file_index])
1094             reference_width += strlen (input_file_name[file_index]);
1095           if (reference_width > reference_max_width)
1096             reference_max_width = reference_width;
1097         }
1098       reference_max_width++;
1099       reference.start = xmalloc (reference_max_width + 1);
1100     }
1101
1102   /* If the reference appears to the left of the output line, reserve some
1103      space for it right away, including one gap size.  */
1104
1105   if ((auto_reference || input_reference) && !right_reference)
1106     line_width = MAX (0, line_width - (reference_max_width + gap_size));
1107
1108   /* The output lines, minimally, will contain from left to right a left
1109      context, a gap, and a keyword followed by the right context with no
1110      special intervening gap.  Half of the line width is dedicated to the
1111      left context and the gap, the other half is dedicated to the keyword
1112      and the right context; these values are computed once and for all here.
1113      There also are tail and head wrap around fields, used when the keyword
1114      is near the beginning or the end of the line, or when some long word
1115      cannot fit in, but leave place from wrapped around shorter words.  The
1116      maximum width of these fields are recomputed separately for each line,
1117      on a case by case basis.  It is worth noting that it cannot happen that
1118      both the tail and head fields are used at once.  */
1119
1120   half_line_width = line_width >> 1;
1121   before_max_width = half_line_width - gap_size;
1122   keyafter_max_width = half_line_width;
1123
1124   /* If truncation_string is the empty string, make it null to speed up
1125      tests.  In this case, truncation_string_length will never get used, so
1126      there is no need to set it.  */
1127
1128   if (truncation_string && *truncation_string)
1129     truncation_string_length = strlen (truncation_string);
1130   else
1131     truncation_string = nullptr;
1132
1133   if (gnu_extensions)
1134     {
1135
1136       /* When flagging truncation at the left of the keyword, the
1137          truncation mark goes at the beginning of the before field,
1138          unless there is a head field, in which case the mark goes at the
1139          left of the head field.  When flagging truncation at the right
1140          of the keyword, the mark goes at the end of the keyafter field,
1141          unless there is a tail field, in which case the mark goes at the
1142          end of the tail field.  Only eight combination cases could arise
1143          for truncation marks:
1144
1145          . None.
1146          . One beginning the before field.
1147          . One beginning the head field.
1148          . One ending the keyafter field.
1149          . One ending the tail field.
1150          . One beginning the before field, another ending the keyafter field.
1151          . One ending the tail field, another beginning the before field.
1152          . One ending the keyafter field, another beginning the head field.
1153
1154          So, there is at most two truncation marks, which could appear both
1155          on the left side of the center of the output line, both on the
1156          right side, or one on either side.  */
1157
1158       before_max_width -= 2 * truncation_string_length;
1159       if (before_max_width < 0)
1160         before_max_width = 0;
1161       keyafter_max_width -= 2 * truncation_string_length;
1162     }
1163   else
1164     {
1165
1166       /* I never figured out exactly how UNIX' ptx plans the output width
1167          of its various fields.  If GNU extensions are disabled, do not
1168          try computing the field widths correctly; instead, use the
1169          following formula, which does not completely imitate UNIX' ptx,
1170          but almost.  */
1171
1172       keyafter_max_width -= 2 * truncation_string_length + 1;
1173     }
1174
1175   /* Compute which characters need special output processing.  Initialize
1176      by flagging any white space character.  Some systems do not consider
1177      form feed as a space character, but we do.  */
1178
1179   for (int character = 0; character < CHAR_SET_SIZE; character++)
1180     edited_flag[character] = !! isspace (character);
1181   edited_flag['\f'] = 1;
1182
1183   /* Complete the special character flagging according to selected output
1184      format.  */
1185
1186   switch (output_format)
1187     {
1188     case UNKNOWN_FORMAT:
1189       /* Should never happen.  */
1190
1191     case DUMB_FORMAT:
1192       break;
1193
1194     case ROFF_FORMAT:
1195
1196       /* 'Quote' characters should be doubled.  */
1197
1198       edited_flag['"'] = 1;
1199       break;
1200
1201     case TEX_FORMAT:
1202
1203       /* Various characters need special processing.  */
1204
1205       for (char const *cursor = "$%&#_{}\\"; *cursor; cursor++)
1206         edited_flag[to_uchar (*cursor)] = 1;
1207
1208       break;
1209     }
1210 }
1211
1212 /*------------------------------------------------------------------.
1213 | Compute the position and length of all the output fields, given a |
1214 | pointer to some OCCURS.                                           |
1215 `------------------------------------------------------------------*/
1216
1217 static void
1218 define_all_fields (OCCURS *occurs)
1219 {
1220   ptrdiff_t tail_max_width;     /* allowable width of tail field */
1221   ptrdiff_t head_max_width;     /* allowable width of head field */
1222   char *cursor;                 /* running cursor in source text */
1223   char *left_context_start;     /* start of left context */
1224   char *right_context_end;      /* end of right context */
1225   char *left_field_start;       /* conservative start for 'head'/'before' */
1226   char const *file_name;        /* file name for reference */
1227   intmax_t line_ordinal;        /* line ordinal for reference */
1228   char const *buffer_start;     /* start of buffered file for this occurs */
1229   char const *buffer_end;       /* end of buffered file for this occurs */
1230
1231   /* Define 'keyafter', start of left context and end of right context.
1232      'keyafter' starts at the saved position for keyword and extend to the
1233      right from the end of the keyword, eating separators or full words, but
1234      not beyond maximum allowed width for 'keyafter' field or limit for the
1235      right context.  Suffix spaces will be removed afterwards.  */
1236
1237   keyafter.start = occurs->key.start;
1238   keyafter.end = keyafter.start + occurs->key.size;
1239   left_context_start = keyafter.start + occurs->left;
1240   right_context_end = keyafter.start + occurs->right;
1241
1242   buffer_start = text_buffers[occurs->file_index].start;
1243   buffer_end = text_buffers[occurs->file_index].end;
1244
1245   cursor = keyafter.end;
1246   while (cursor < right_context_end
1247          && cursor <= keyafter.start + keyafter_max_width)
1248     {
1249       keyafter.end = cursor;
1250       SKIP_SOMETHING (cursor, right_context_end);
1251     }
1252   if (cursor <= keyafter.start + keyafter_max_width)
1253     keyafter.end = cursor;
1254
1255   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1256
1257   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1258
1259   /* When the left context is wide, it might take some time to catch up from
1260      the left context boundary to the beginning of the 'head' or 'before'
1261      fields.  So, in this case, to speed the catchup, we jump back from the
1262      keyword, using some secure distance, possibly falling in the middle of
1263      a word.  A secure backward jump would be at least half the maximum
1264      width of a line, plus the size of the longest word met in the whole
1265      input.  We conclude this backward jump by a skip forward of at least
1266      one word.  In this manner, we should not inadvertently accept only part
1267      of a word.  From the reached point, when it will be time to fix the
1268      beginning of 'head' or 'before' fields, we will skip forward words or
1269      delimiters until we get sufficiently near.  */
1270
1271   if (-occurs->left > half_line_width + maximum_word_length)
1272     {
1273       left_field_start
1274         = keyafter.start - (half_line_width + maximum_word_length);
1275       SKIP_SOMETHING (left_field_start, keyafter.start);
1276     }
1277   else
1278     left_field_start = keyafter.start + occurs->left;
1279
1280   /* 'before' certainly ends at the keyword, but not including separating
1281      spaces.  It starts after than the saved value for the left context, by
1282      advancing it until it falls inside the maximum allowed width for the
1283      before field.  There will be no prefix spaces either.  'before' only
1284      advances by skipping single separators or whole words. */
1285
1286   before.start = left_field_start;
1287   before.end = keyafter.start;
1288   SKIP_WHITE_BACKWARDS (before.end, before.start);
1289
1290   while (before.start + before_max_width < before.end)
1291     SKIP_SOMETHING (before.start, before.end);
1292
1293   if (truncation_string)
1294     {
1295       cursor = before.start;
1296       SKIP_WHITE_BACKWARDS (cursor, buffer_start);
1297       before_truncation = cursor > left_context_start;
1298     }
1299   else
1300     before_truncation = false;
1301
1302   SKIP_WHITE (before.start, buffer_end);
1303
1304   /* The tail could not take more columns than what has been left in the
1305      left context field, and a gap is mandatory.  It starts after the
1306      right context, and does not contain prefixed spaces.  It ends at
1307      the end of line, the end of buffer or when the tail field is full,
1308      whichever comes first.  It cannot contain only part of a word, and
1309      has no suffixed spaces.  */
1310
1311   tail_max_width
1312     = before_max_width - (before.end - before.start) - gap_size;
1313
1314   if (tail_max_width > 0)
1315     {
1316       tail.start = keyafter.end;
1317       SKIP_WHITE (tail.start, buffer_end);
1318
1319       tail.end = tail.start;
1320       cursor = tail.end;
1321       while (cursor < right_context_end
1322              && cursor < tail.start + tail_max_width)
1323         {
1324           tail.end = cursor;
1325           SKIP_SOMETHING (cursor, right_context_end);
1326         }
1327
1328       if (cursor < tail.start + tail_max_width)
1329         tail.end = cursor;
1330
1331       if (tail.end > tail.start)
1332         {
1333           keyafter_truncation = false;
1334           tail_truncation = truncation_string && tail.end < right_context_end;
1335         }
1336       else
1337         tail_truncation = false;
1338
1339       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1340     }
1341   else
1342     {
1343
1344       /* No place left for a tail field.  */
1345
1346       tail.start = nullptr;
1347       tail.end = nullptr;
1348       tail_truncation = false;
1349     }
1350
1351   /* 'head' could not take more columns than what has been left in the right
1352      context field, and a gap is mandatory.  It ends before the left
1353      context, and does not contain suffixed spaces.  Its pointer is advanced
1354      until the head field has shrunk to its allowed width.  It cannot
1355      contain only part of a word, and has no suffixed spaces.  */
1356
1357   head_max_width
1358     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1359
1360   if (head_max_width > 0)
1361     {
1362       head.end = before.start;
1363       SKIP_WHITE_BACKWARDS (head.end, buffer_start);
1364
1365       head.start = left_field_start;
1366       while (head.start + head_max_width < head.end)
1367         SKIP_SOMETHING (head.start, head.end);
1368
1369       if (head.end > head.start)
1370         {
1371           before_truncation = false;
1372           head_truncation = (truncation_string
1373                              && head.start > left_context_start);
1374         }
1375       else
1376         head_truncation = false;
1377
1378       SKIP_WHITE (head.start, head.end);
1379     }
1380   else
1381     {
1382
1383       /* No place left for a head field.  */
1384
1385       head.start = nullptr;
1386       head.end = nullptr;
1387       head_truncation = false;
1388     }
1389
1390   if (auto_reference)
1391     {
1392
1393       /* Construct the reference text in preallocated space from the file
1394          name and the line number.  Standard input yields an empty file name.
1395          Ensure line numbers are 1 based, even if they are computed 0 based.  */
1396
1397       file_name = input_file_name[occurs->file_index];
1398       if (!file_name)
1399         file_name = "";
1400
1401       line_ordinal = occurs->reference + 1;
1402       if (occurs->file_index > 0)
1403         line_ordinal -= file_line_count[occurs->file_index - 1];
1404
1405       char *file_end = stpcpy (reference.start, file_name);
1406       reference.end = file_end + sprintf (file_end, ":%jd", line_ordinal);
1407     }
1408   else if (input_reference)
1409     {
1410
1411       /* Reference starts at saved position for reference and extends right
1412          until some white space is met.  */
1413
1414       reference.start = keyafter.start + occurs->reference;
1415       reference.end = reference.start;
1416       SKIP_NON_WHITE (reference.end, right_context_end);
1417     }
1418 }
1419
1420 /* Formatting and actual output - control routines.  */
1421
1422 /*----------------------------------------------------------------------.
1423 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1424 `----------------------------------------------------------------------*/
1425
1426 static void
1427 output_one_roff_line (void)
1428 {
1429   /* Output the 'tail' field.  */
1430
1431   printf (".%s \"", macro_name);
1432   print_field (tail);
1433   if (tail_truncation)
1434     fputs (truncation_string, stdout);
1435   putchar ('"');
1436
1437   /* Output the 'before' field.  */
1438
1439   fputs (" \"", stdout);
1440   if (before_truncation)
1441     fputs (truncation_string, stdout);
1442   print_field (before);
1443   putchar ('"');
1444
1445   /* Output the 'keyafter' field.  */
1446
1447   fputs (" \"", stdout);
1448   print_field (keyafter);
1449   if (keyafter_truncation)
1450     fputs (truncation_string, stdout);
1451   putchar ('"');
1452
1453   /* Output the 'head' field.  */
1454
1455   fputs (" \"", stdout);
1456   if (head_truncation)
1457     fputs (truncation_string, stdout);
1458   print_field (head);
1459   putchar ('"');
1460
1461   /* Conditionally output the 'reference' field.  */
1462
1463   if (auto_reference || input_reference)
1464     {
1465       fputs (" \"", stdout);
1466       print_field (reference);
1467       putchar ('"');
1468     }
1469
1470   putchar ('\n');
1471 }
1472
1473 /*---------------------------------------------------------.
1474 | Output the current output fields as one line for 'TeX'.  |
1475 `---------------------------------------------------------*/
1476
1477 static void
1478 output_one_tex_line (void)
1479 {
1480   BLOCK key;                    /* key field, isolated */
1481   BLOCK after;                  /* after field, isolated */
1482   char *cursor;                 /* running cursor in source text */
1483
1484   printf ("\\%s ", macro_name);
1485   putchar ('{');
1486   print_field (tail);
1487   fputs ("}{", stdout);
1488   print_field (before);
1489   fputs ("}{", stdout);
1490   key.start = keyafter.start;
1491   after.end = keyafter.end;
1492   cursor = keyafter.start;
1493   SKIP_SOMETHING (cursor, keyafter.end);
1494   key.end = cursor;
1495   after.start = cursor;
1496   print_field (key);
1497   fputs ("}{", stdout);
1498   print_field (after);
1499   fputs ("}{", stdout);
1500   print_field (head);
1501   putchar ('}');
1502   if (auto_reference || input_reference)
1503     {
1504       putchar ('{');
1505       print_field (reference);
1506       putchar ('}');
1507     }
1508   putchar ('\n');
1509 }
1510
1511 /*-------------------------------------------------------------------.
1512 | Output the current output fields as one line for a dumb terminal.  |
1513 `-------------------------------------------------------------------*/
1514
1515 static void
1516 output_one_dumb_line (void)
1517 {
1518   if (!right_reference)
1519     {
1520       if (auto_reference)
1521         {
1522
1523           /* Output the 'reference' field, in such a way that GNU emacs
1524              next-error will handle it.  The ending colon is taken from the
1525              gap which follows.  */
1526
1527           print_field (reference);
1528           putchar (':');
1529           print_spaces (reference_max_width
1530                         + gap_size
1531                         - (reference.end - reference.start)
1532                         - 1);
1533         }
1534       else
1535         {
1536
1537           /* Output the 'reference' field and its following gap.  */
1538
1539           print_field (reference);
1540           print_spaces (reference_max_width
1541                         + gap_size
1542                         - (reference.end - reference.start));
1543         }
1544     }
1545
1546   if (tail.start < tail.end)
1547     {
1548       /* Output the 'tail' field.  */
1549
1550       print_field (tail);
1551       if (tail_truncation)
1552         fputs (truncation_string, stdout);
1553
1554       print_spaces (half_line_width - gap_size
1555                     - (before.end - before.start)
1556                     - (before_truncation ? truncation_string_length : 0)
1557                     - (tail.end - tail.start)
1558                     - (tail_truncation ? truncation_string_length : 0));
1559     }
1560   else
1561     print_spaces (half_line_width - gap_size
1562                   - (before.end - before.start)
1563                   - (before_truncation ? truncation_string_length : 0));
1564
1565   /* Output the 'before' field.  */
1566
1567   if (before_truncation)
1568     fputs (truncation_string, stdout);
1569   print_field (before);
1570
1571   print_spaces (gap_size);
1572
1573   /* Output the 'keyafter' field.  */
1574
1575   print_field (keyafter);
1576   if (keyafter_truncation)
1577     fputs (truncation_string, stdout);
1578
1579   if (head.start < head.end)
1580     {
1581       /* Output the 'head' field.  */
1582
1583       print_spaces (half_line_width
1584                     - (keyafter.end - keyafter.start)
1585                     - (keyafter_truncation ? truncation_string_length : 0)
1586                     - (head.end - head.start)
1587                     - (head_truncation ? truncation_string_length : 0));
1588       if (head_truncation)
1589         fputs (truncation_string, stdout);
1590       print_field (head);
1591     }
1592   else
1593
1594     if ((auto_reference || input_reference) && right_reference)
1595       print_spaces (half_line_width
1596                     - (keyafter.end - keyafter.start)
1597                     - (keyafter_truncation ? truncation_string_length : 0));
1598
1599   if ((auto_reference || input_reference) && right_reference)
1600     {
1601       /* Output the 'reference' field.  */
1602
1603       print_spaces (gap_size);
1604       print_field (reference);
1605     }
1606
1607   putchar ('\n');
1608 }
1609
1610 /*------------------------------------------------------------------------.
1611 | Scan the whole occurs table and, for each entry, output one line in the |
1612 | appropriate format.                                                     |
1613 `------------------------------------------------------------------------*/
1614
1615 static void
1616 generate_all_output (void)
1617 {
1618   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1619
1620   /* The following assignments are useful to provide default values in case
1621      line contexts or references are not used, in which case these variables
1622      would never be computed.  */
1623
1624   tail.start = nullptr;
1625   tail.end = nullptr;
1626   tail_truncation = false;
1627
1628   head.start = nullptr;
1629   head.end = nullptr;
1630   head_truncation = false;
1631
1632   /* Loop over all keyword occurrences.  */
1633
1634   occurs_cursor = occurs_table[0];
1635
1636   for (idx_t occurs_index = 0; occurs_index < number_of_occurs[0];
1637        occurs_index++)
1638     {
1639       /* Compute the exact size of every field and whenever truncation flags
1640          are present or not.  */
1641
1642       define_all_fields (occurs_cursor);
1643
1644       /* Produce one output line according to selected format.  */
1645
1646       switch (output_format)
1647         {
1648         case UNKNOWN_FORMAT:
1649           /* Should never happen.  */
1650
1651         case DUMB_FORMAT:
1652           output_one_dumb_line ();
1653           break;
1654
1655         case ROFF_FORMAT:
1656           output_one_roff_line ();
1657           break;
1658
1659         case TEX_FORMAT:
1660           output_one_tex_line ();
1661           break;
1662         }
1663
1664       /* Advance the cursor into the occurs table.  */
1665
1666       occurs_cursor++;
1667     }
1668 }
1669
1670 /* Option decoding and main program.  */
1671
1672 /*------------------------------------------------------.
1673 | Print program identification and options, then exit.  |
1674 `------------------------------------------------------*/
1675
1676 void
1677 usage (int status)
1678 {
1679   if (status != EXIT_SUCCESS)
1680     emit_try_help ();
1681   else
1682     {
1683       printf (_("\
1684 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1685   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1686               program_name, program_name);
1687       fputs (_("\
1688 Output a permuted index, including context, of the words in the input files.\n\
1689 "), stdout);
1690
1691       emit_stdin_note ();
1692       emit_mandatory_arg_note ();
1693
1694       fputs (_("\
1695   -A, --auto-reference           output automatically generated references\n\
1696   -G, --traditional              behave more like System V 'ptx'\n\
1697 "), stdout);
1698       fputs (_("\
1699   -F, --flag-truncation=STRING   use STRING for flagging line truncations.\n\
1700                                  The default is '/'\n\
1701 "), stdout);
1702       fputs (_("\
1703   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1704   -O, --format=roff              generate output as roff directives\n\
1705   -R, --right-side-refs          put references at right, not counted in -w\n\
1706   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1707   -T, --format=tex               generate output as TeX directives\n\
1708 "), stdout);
1709       fputs (_("\
1710   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1711   -b, --break-file=FILE          word break characters in this FILE\n\
1712   -f, --ignore-case              fold lower case to upper case for sorting\n\
1713   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1714   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1715   -o, --only-file=FILE           read only word list from this FILE\n\
1716 "), stdout);
1717       fputs (_("\
1718   -r, --references               first field of each line is a reference\n\
1719   -t, --typeset-mode               - not implemented -\n\
1720   -w, --width=NUMBER             output width in columns, reference excluded\n\
1721 "), stdout);
1722       fputs (HELP_OPTION_DESCRIPTION, stdout);
1723       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1724       emit_ancillary_info (PROGRAM_NAME);
1725     }
1726   exit (status);
1727 }
1728
1729 /*----------------------------------------------------------------------.
1730 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1731 | strings, then launch execution.                                       |
1732 `----------------------------------------------------------------------*/
1733
/* Long options equivalences.  This table is handed to getopt_long in
   main.  Each entry gives the long name, whether the option takes an
   argument, a null flag pointer, and the value getopt_long returns for
   it.  Every entry except "format" returns its short option letter, so
   the single switch in main serves both spellings.  */
static struct option const long_options[] =
{
  {"auto-reference", no_argument, nullptr, 'A'},
  {"break-file", required_argument, nullptr, 'b'},
  {"flag-truncation", required_argument, nullptr, 'F'},
  {"ignore-case", no_argument, nullptr, 'f'},
  {"gap-size", required_argument, nullptr, 'g'},
  {"ignore-file", required_argument, nullptr, 'i'},
  {"macro-name", required_argument, nullptr, 'M'},
  {"only-file", required_argument, nullptr, 'o'},
  {"references", no_argument, nullptr, 'r'},
  {"right-side-refs", no_argument, nullptr, 'R'},
  /* --format has no short letter of its own: it returns 10, which main
     handles in 'case 10:' by matching the argument with XARGMATCH.  */
  {"format", required_argument, nullptr, 10},
  {"sentence-regexp", required_argument, nullptr, 'S'},
  {"traditional", no_argument, nullptr, 'G'},
  {"typeset-mode", no_argument, nullptr, 't'},
  {"width", required_argument, nullptr, 'w'},
  {"word-regexp", required_argument, nullptr, 'W'},
  /* These two macros are not defined in this part of the file;
     presumably they supply the --help and --version entries that pair
     with case_GETOPT_HELP_CHAR and case_GETOPT_VERSION_CHAR in main.  */
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  /* All-zero entry terminating the table.  */
  {nullptr, 0, nullptr, 0},
};
1757
/* Argument spellings accepted by --format.  main passes this
   null-terminated list to XARGMATCH together with format_vals.  */
static char const *const format_args[] =
{
  "roff", "tex", nullptr
};
1762
/* Output formats selectable with --format.  main passes this array to
   XARGMATCH together with format_args.
   NOTE(review): the entries presumably pair up by position with
   format_args ("roff" -> ROFF_FORMAT, "tex" -> TEX_FORMAT), so keep the
   two arrays in the same order -- confirm against argmatch.h.  */
static enum Format const format_vals[] =
{
  ROFF_FORMAT, TEX_FORMAT
};
1767
1768 int
1769 main (int argc, char **argv)
1770 {
1771   int optchar;                  /* argument character */
1772   int file_index;               /* index in text input file arrays */
1773
1774   /* Decode program options.  */
1775
1776   initialize_main (&argc, &argv);
1777   set_program_name (argv[0]);
1778   setlocale (LC_ALL, "");
1779   bindtextdomain (PACKAGE, LOCALEDIR);
1780   textdomain (PACKAGE);
1781
1782   atexit (close_stdout);
1783
1784 #if HAVE_SETCHRCLASS
1785   setchrclass (nullptr);
1786 #endif
1787
1788   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1789                                 long_options, nullptr),
1790          optchar != EOF)
1791     {
1792       switch (optchar)
1793         {
1794         default:
1795           usage (EXIT_FAILURE);
1796
1797         case 'G':
1798           gnu_extensions = false;
1799           break;
1800
1801         case 'b':
1802           break_file = optarg;
1803           break;
1804
1805         case 'f':
1806           ignore_case = true;
1807           break;
1808
1809         case 'g':
1810           {
1811             intmax_t tmp;
1812             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1813                    && 0 < tmp && tmp <= IDX_MAX))
1814               error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1815                      quote (optarg));
1816             gap_size = tmp;
1817             break;
1818           }
1819
1820         case 'i':
1821           ignore_file = optarg;
1822           break;
1823
1824         case 'o':
1825           only_file = optarg;
1826           break;
1827
1828         case 'r':
1829           input_reference = true;
1830           break;
1831
1832         case 't':
1833           /* Yet to understand...  */
1834           break;
1835
1836         case 'w':
1837           {
1838             intmax_t tmp;
1839             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1840                    && 0 < tmp && tmp <= IDX_MAX))
1841               error (EXIT_FAILURE, 0, _("invalid line width: %s"),
1842                      quote (optarg));
1843             line_width = tmp;
1844             break;
1845           }
1846
1847         case 'A':
1848           auto_reference = true;
1849           break;
1850
1851         case 'F':
1852           truncation_string = optarg;
1853           unescape_string (optarg);
1854           break;
1855
1856         case 'M':
1857           macro_name = optarg;
1858           break;
1859
1860         case 'O':
1861           output_format = ROFF_FORMAT;
1862           break;
1863
1864         case 'R':
1865           right_reference = true;
1866           break;
1867
1868         case 'S':
1869           context_regex.string = optarg;
1870           unescape_string (optarg);
1871           break;
1872
1873         case 'T':
1874           output_format = TEX_FORMAT;
1875           break;
1876
1877         case 'W':
1878           word_regex.string = optarg;
1879           unescape_string (optarg);
1880           if (!*word_regex.string)
1881             word_regex.string = nullptr;
1882           break;
1883
1884         case 10:
1885           output_format = XARGMATCH ("--format", optarg,
1886                                      format_args, format_vals);
1887           break;
1888
1889         case_GETOPT_HELP_CHAR;
1890
1891         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1892         }
1893     }
1894
1895   /* Process remaining arguments.  If GNU extensions are enabled, process
1896      all arguments as input parameters.  If disabled, accept at most two
1897      arguments, the second of which is an output parameter.  */
1898
1899   if (optind == argc)
1900     {
1901
1902       /* No more argument simply means: read standard input.  */
1903
1904       input_file_name = xmalloc (sizeof *input_file_name);
1905       file_line_count = xmalloc (sizeof *file_line_count);
1906       text_buffers =    xmalloc (sizeof *text_buffers);
1907       number_input_files = 1;
1908       input_file_name[0] = nullptr;
1909     }
1910   else if (gnu_extensions)
1911     {
1912       number_input_files = argc - optind;
1913       input_file_name = xnmalloc (number_input_files, sizeof *input_file_name);
1914       file_line_count = xnmalloc (number_input_files, sizeof *file_line_count);
1915       text_buffers    = xnmalloc (number_input_files, sizeof *text_buffers);
1916
1917       for (file_index = 0; file_index < number_input_files; file_index++)
1918         {
1919           if (!*argv[optind] || STREQ (argv[optind], "-"))
1920             input_file_name[file_index] = nullptr;
1921           else
1922             input_file_name[file_index] = argv[optind];
1923           optind++;
1924         }
1925     }
1926   else
1927     {
1928
1929       /* There is one necessary input file.  */
1930
1931       number_input_files = 1;
1932       input_file_name = xmalloc (sizeof *input_file_name);
1933       file_line_count = xmalloc (sizeof *file_line_count);
1934       text_buffers    = xmalloc (sizeof *text_buffers);
1935       if (!*argv[optind] || STREQ (argv[optind], "-"))
1936         input_file_name[0] = nullptr;
1937       else
1938         input_file_name[0] = argv[optind];
1939       optind++;
1940
1941       /* Redirect standard output, only if requested.  */
1942
1943       if (optind < argc)
1944         {
1945           if (! freopen (argv[optind], "w", stdout))
1946             error (EXIT_FAILURE, errno, "%s", quotef (argv[optind]));
1947           optind++;
1948         }
1949
1950       /* Diagnose any other argument as an error.  */
1951
1952       if (optind < argc)
1953         {
1954           error (0, 0, _("extra operand %s"), quote (argv[optind]));
1955           usage (EXIT_FAILURE);
1956         }
1957     }
1958
1959   /* If the output format has not been explicitly selected, choose dumb
1960      terminal format if GNU extensions are enabled, else 'roff' format.  */
1961
1962   if (output_format == UNKNOWN_FORMAT)
1963     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
1964
1965   /* Initialize the main tables.  */
1966
1967   initialize_regex ();
1968
1969   /* Read 'Break character' file, if any.  */
1970
1971   if (break_file)
1972     digest_break_file (break_file);
1973
1974   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
1975      these files is empty, reset the name of the file to null, to avoid
1976      unnecessary calls to search_table. */
1977
1978   if (ignore_file)
1979     {
1980       digest_word_file (ignore_file, &ignore_table);
1981       if (ignore_table.length == 0)
1982         ignore_file = nullptr;
1983     }
1984
1985   if (only_file)
1986     {
1987       digest_word_file (only_file, &only_table);
1988       if (only_table.length == 0)
1989         only_file = nullptr;
1990     }
1991
1992   /* Prepare to study all the input files.  */
1993
1994   number_of_occurs[0] = 0;
1995   total_line_count = 0;
1996   maximum_word_length = 0;
1997   reference_max_width = 0;
1998
1999   for (file_index = 0; file_index < number_input_files; file_index++)
2000     {
2001       BLOCK *text_buffer = text_buffers + file_index;
2002
2003       /* Read the file contents into memory, then study it.  */
2004
2005       swallow_file_in_memory (input_file_name[file_index], text_buffer);
2006       find_occurs_in_text (file_index);
2007
2008       /* Maintain for each file how many lines has been read so far when its
2009          end is reached.  Incrementing the count first is a simple kludge to
2010          handle a possible incomplete line at end of file.  */
2011
2012       total_line_count++;
2013       file_line_count[file_index] = total_line_count;
2014     }
2015
2016   /* Do the output process phase.  */
2017
2018   sort_found_occurs ();
2019   fix_output_parameters ();
2020   generate_all_output ();
2021
2022   /* All done.  */
2023
2024   return EXIT_SUCCESS;
2025 }