src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2024 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25 #include "system.h"
  26 #include <regex.h>
  27 #include "argmatch.h"
  28 #include "c-ctype.h"
  29 #include "fadvise.h"
  30 #include "quote.h"
  31 #include "read-file.h"
  32 #include "stdio--.h"
  33 #include "xstrtol.h"
  34
  35 /* The official name of this program (e.g., no 'g' prefix).  */
  36 #define PROGRAM_NAME "ptx"
  37
  38 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
  39    if "ç" (c-with-cedilla) is available in the translation's character
  40    set and encoding.  */
  41 #define AUTHORS proper_name_lite ("F. Pinard", "Fran\xc3\xa7ois Pinard")
  42
  43 /* Number of possible characters in a byte.  */
  44 #define CHAR_SET_SIZE 256
  45
     /* Helpers for decoding numeric escapes.  ISODIGIT tests whether C is
        an octal digit.  HEXTOBIN and OCTTOBIN convert a digit character to
        its numeric value without validating it, so the caller must have
        checked C first.  ISODIGIT and HEXTOBIN evaluate C more than once;
        do not pass an argument with side effects.  */
  46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
  47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
  48                      : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
  49 #define OCTTOBIN(C) ((C) - '0')
  50
  51 /* Debugging the memory allocator.  */
  52
  53 #if WITH_DMALLOC
  54 # define MALLOC_FUNC_CHECK 1
  55 # include <dmalloc.h>
  56 #endif
  57
  58 /* Global definitions.  */
  59
  60 /* FIXME: There are many unchecked integer overflows in this file,
  61    and in theory they could cause this command to have undefined
  62    behavior given large inputs or options.  This command should
  63    diagnose any such overflow and exit.  */
  64
  65 /* Program options.  */
  66
     /* Output styles the program can generate; the selected one is kept in
        output_format below.  */
  67 enum Format
  68 {
  69   UNKNOWN_FORMAT,               /* output format still unknown */
  70   DUMB_FORMAT,                  /* output for a dumb terminal */
  71   ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  72   TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
  73 };
  74
  75 static bool gnu_extensions = true;      /* trigger all GNU extensions */
  76 static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
  77 static bool input_reference = false;    /* refs at beginning of input lines */
  78 static bool right_reference = false;    /* output refs after right context */
  79 static ptrdiff_t line_width = 72;       /* output line width in characters */
  80 static ptrdiff_t gap_size = 3;  /* number of spaces between output fields */
  81 static char const *truncation_string = "/";
  82                                 /* string used to mark line truncations */
  83 static char const *macro_name = "xx";   /* macro name for roff or TeX output */
  84 static enum Format output_format = UNKNOWN_FORMAT;
  85                                 /* output format; UNKNOWN_FORMAT until chosen */
  86
  87 static bool ignore_case = false;        /* fold lower to upper, for sorting
                                                and for regexp translation */
  88 static char const *break_file = nullptr; /* name of the 'Break chars' file */
  89 static char const *only_file = nullptr; /* name of the 'Only words' file */
  90 static char const *ignore_file = nullptr; /* name of the 'Ignore words' file */
  91
  92 /* Options that use regular expressions.  */
  93 struct regex_data
  94 {
  95   /* The original regular expression, as a string.  Null when no
          expression is in effect; the code tests this member to decide
          whether PATTERN may be used.  */
  96   char const *string;
  97
  98   /* The compiled regular expression, and its fastmap.  FASTMAP is the
          storage that PATTERN's fastmap pointer is set to when the
          expression is compiled.  */
  99   struct re_pattern_buffer pattern;
 100   char fastmap[UCHAR_MAX + 1];
 101 };
 102
 103 static struct regex_data context_regex; /* end of context (line or sentence) */
 104 static struct regex_data word_regex;    /* keyword */
 105
 106 /* A BLOCK delimits a region in memory of arbitrary size, like the copy of a
 107    whole file.  A WORD is similar, except it is intended for smaller regions.
 108    A WORD_TABLE may contain several WORDs.  */
 109
 110 typedef struct
 111   {
 112     char *start;                /* pointer to beginning of region */
 113     char *end;                  /* pointer one past the end of region */
 114   }
 115 BLOCK;
 116
 117 typedef struct
 118   {
 119     char *start;                /* pointer to beginning of region */
 120     ptrdiff_t size;             /* length of the region, in bytes */
 121   }
 122 WORD;
 123
 124 typedef struct
 125   {
 126     WORD *start;                /* array of WORDs */
 127     size_t alloc;               /* number of entries allocated */
 128     ptrdiff_t length;           /* number of used entries */
 129   }
 130 WORD_TABLE;
 131
 132 /* Pattern description tables.  */
 133
 134 /* For each character, provide its folded (upper case) equivalent.  The
        table is filled in only when ignore_case is set.  */
 135 static unsigned char folded_chars[CHAR_SET_SIZE];
 136
 137 /* End of context pattern register indices.  */
 138 static struct re_registers context_regs;
 139
 140 /* Keyword pattern register indices.  */
 141 static struct re_registers word_regs;
 142
 143 /* A word characters fastmap is used only when no word regexp has been
 144    provided.  A word is then made up of a sequence of one or more characters
 145    allowed by the fastmap.  Contains !0 if character allowed in word.  Not
 146    only is this faster in most cases, but it also simplifies the
 147    implementation of the Break files.  */
 148 static char word_fastmap[CHAR_SET_SIZE];
 149
 150 /* Maximum length of any word read.  */
 151 static ptrdiff_t maximum_word_length;
 152
 153 /* Maximum width of any reference used.  */
 154 static ptrdiff_t reference_max_width;
 155
 156 /* Ignore and Only word tables.  */
 157
 158 static WORD_TABLE ignore_table; /* table of words to ignore */
 159 static WORD_TABLE only_table;           /* table of words to select */
 160
 161 /* Source text table, and scanning macros.  */
 162
 163 static int number_input_files;  /* number of text input files */
 164 static intmax_t total_line_count;       /* total number of lines seen so far */
 165 static char const **input_file_name;    /* array of text input file names */
 166 static intmax_t *file_line_count;       /* array of line count values at end */
 167
 168 static BLOCK *text_buffers;     /* files to study */
 169
 170 /* SKIP_NON_WHITE used only for getting or skipping the reference.  */
 171
     /* All four macros below modify CURSOR in place, evaluate their
        arguments several times, and expand to a bare statement with no
        trailing semicolon and no do-while wrapper; use them only as full
        statements with simple lvalue arguments.  */

     /* Advance CURSOR to the first white space character, or to LIMIT.  */
 172 #define SKIP_NON_WHITE(cursor, limit) \
 173   while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
 174     cursor++
 175
     /* Advance CURSOR to the first non white space character, or to LIMIT.  */
 176 #define SKIP_WHITE(cursor, limit) \
 177   while (cursor < limit && isspace (to_uchar (*cursor)))                \
 178     cursor++
 179
     /* Move CURSOR backwards over white space, without going below START.  */
 180 #define SKIP_WHITE_BACKWARDS(cursor, start) \
 181   while (cursor > start && isspace (to_uchar (cursor[-1])))             \
 182     cursor--
 183
     /* Advance CURSOR past one item.  With a word regexp, advance by the
        length of the match anchored at CURSOR, or by one byte when nothing
        matches; a matcher failure (-2) is fatal.  Without a word regexp,
        advance over a run of word_fastmap characters, or over a single
        byte if the current one is not a word character.  The fastmap
        branch reads *CURSOR before comparing against LIMIT, so the caller
        must ensure CURSOR < LIMIT.
        NOTE(review): a regexp match of length zero leaves CURSOR
        unchanged -- confirm callers cannot loop on that.  */
 184 #define SKIP_SOMETHING(cursor, limit) \
 185   if (word_regex.string)                                                \
 186     {                                                                   \
 187       regoff_t count;                                                   \
 188       count = re_match (&word_regex.pattern, cursor, limit - cursor,    \
 189                         0, nullptr);                                    \
 190       if (count == -2)                                                  \
 191         matcher_error ();                                               \
 192       cursor += count == -1 ? 1 : count;                                \
 193     }                                                                   \
 194   else if (word_fastmap[to_uchar (*cursor)])                            \
 195     while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
 196       cursor++;                                                         \
 197   else                                                                  \
 198     cursor++
 199
 200 /* Occurrences table.
 201
 202    The 'key' member describes the central word, which is surrounded
 203    by a left context and a right context.  Its 'start' and 'size'
 204    fields allow full 8-bit character keys, even including NULs.  At other
 205    places in this program, the name 'keyafter' refers to the keyword
 206    followed by its right context.
 207
 208    The left context does not extend, towards the beginning of the file,
 209    further than a distance given by the 'left' value.  This value is
 210    relative to the keyword beginning; it is usually negative.  This
 211    ensures that, except for white space, we will never have to scan the
 212    source text backward when it is time to generate the final output
 213    lines.
 214
 215    The right context, indirectly attainable through the keyword end, does
 216    not extend, towards the end of the file, further than a distance given
 217    by the 'right' value.  This value is relative to the keyword
 218    beginning; it is usually positive.
 219
 220    When automatic references are used, the 'reference' value is the
 221    overall line number in all input files read so far; in this case, it
 222    is of type intmax_t.  When input references are used, the 'reference'
 223    value indicates the distance between the keyword beginning and the
 224    start of the reference field, and it fits in ptrdiff_t and is usually
 225    negative.  */
 226
 227 typedef struct
 228   {
 229     WORD key;                   /* description of the keyword */
 230     ptrdiff_t left;             /* distance to left context start */
 231     ptrdiff_t right;            /* distance to right context end */
 232     intmax_t reference;         /* reference descriptor */
 233     int file_index;             /* corresponding file  */
 234   }
 235 OCCURS;
 236
 237 /* The various OCCURS tables are indexed by the language.  But for the
 238    time being, there is no such multiple language support.  */
 239
 240 static OCCURS *occurs_table[1]; /* all words retained from the read text */
 241 static size_t occurs_alloc[1];  /* allocated size of occurs_table */
 242 static ptrdiff_t number_of_occurs[1]; /* number of used slots in occurs_table */
 243
 244
 245 /* Communication among output routines.  */
 246
 247 /* For each character, indicate whether special output processing is
        requested.  */
 248 static char edited_flag[CHAR_SET_SIZE];
 249
 250 /* Half of line width, reference excluded.  */
 251 static ptrdiff_t half_line_width;
 252
 253 /* Maximum width of before field.  */
 254 static ptrdiff_t before_max_width;
 255
 256 /* Maximum width of keyword-and-after field.  */
 257 static ptrdiff_t keyafter_max_width;
 258
 259 /* Length of the string that flags truncation.  */
 260 static ptrdiff_t truncation_string_length;
 261
 262 /* When context is limited by lines, wraparound may happen on final output:
 263    the 'head' block gives access to some supplementary left context which
 264    will be seen at the end of the output line; the 'tail' block gives
 265    access to some supplementary right context which will be seen at the
 266    beginning of the output line.  */
 267
 268 static BLOCK tail;              /* tail field */
 269 static bool tail_truncation;    /* flag truncation after the tail field */
 270
 271 static BLOCK before;            /* before field */
 272 static bool before_truncation;  /* flag truncation before the before field */
 273
 274 static BLOCK keyafter;          /* keyword-and-after field */
 275 static bool keyafter_truncation; /* flag truncation after the keyafter field */
 276
 277 static BLOCK head;              /* head field */
 278 static bool head_truncation;    /* flag truncation before the head field */
 279
 280 static BLOCK reference;         /* reference field for input reference mode */
 281
 282 /* Miscellaneous routines.  */
 283
 284 /* Diagnose an error in the regular expression matcher.  Then exit.  */
 285
 static void
 matcher_error (void)
 {
   /* Pick up errno first so that it is the value reported alongside the
      message.  The call passes EXIT_FAILURE as the status argument.  */
   int matcher_errno = errno;

   error (EXIT_FAILURE, matcher_errno,
          _("error in regular expression matcher"));
 }
 291
 292 /* Unescape STRING in-place.  */
 293
 static void
 unescape_string (char *string)
 {
   char const *in = string;      /* next byte to read */
   char *out = string;           /* next byte to write; never ahead of IN */

   while (*in != '\0')
     {
       /* Anything that is not a backslash is copied through unchanged.  */
       if (*in != '\\')
         {
           *out++ = *in++;
           continue;
         }

       /* Step over the backslash and decode the character after it.  */
       in++;
       switch (*in)
         {
         case 'x':               /* \xhhh: at most three hex digits */
           {
             int value = 0;
             int digits = 0;

             in++;
             while (digits < 3 && c_isxdigit (to_uchar (*in)))
               {
                 value = value * 16 + HEXTOBIN (*in);
                 in++;
                 digits++;
               }

             if (digits > 0)
               *out++ = value;
             else
               {
                 /* No digit followed: keep the two characters as they
                    were.  */
                 *out++ = '\\';
                 *out++ = 'x';
               }
           }
           break;

         case '0':               /* \0ooo: at most three octal digits */
           {
             int value = 0;
             int digits = 0;

             in++;
             while (digits < 3 && ISODIGIT (*in))
               {
                 value = value * 8 + OCTTOBIN (*in);
                 in++;
                 digits++;
               }

             *out++ = value;
           }
           break;

         case 'a':               /* alert */
           *out++ = '\a';
           in++;
           break;

         case 'b':               /* backspace */
           *out++ = '\b';
           in++;
           break;

         case 'c':               /* drop everything up to the end */
           while (*in != '\0')
             in++;
           break;

         case 'f':               /* form feed */
           *out++ = '\f';
           in++;
           break;

         case 'n':               /* new line */
           *out++ = '\n';
           in++;
           break;

         case 'r':               /* carriage return */
           *out++ = '\r';
           in++;
           break;

         case 't':               /* horizontal tab */
           *out++ = '\t';
           in++;
           break;

         case 'v':               /* vertical tab */
           *out++ = '\v';
           in++;
           break;

         case '\0':              /* backslash was the last character */
           /* Nothing is emitted for it; the outer loop then stops.  */
           break;

         default:                /* unknown escape: keep both characters */
           *out++ = '\\';
           *out++ = *in++;
           break;
         }
     }

   *out = '\0';
 }
 390
 391 /*--------------------------------------------------------------------------.
 392 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 393 `--------------------------------------------------------------------------*/
 394
 395 static void
 396 compile_regex (struct regex_data *regex)
 397 {
 398   struct re_pattern_buffer *pattern = &regex->pattern;
 399   char const *string = regex->string;
 400   char const *message;
 401
 402   pattern->buffer = nullptr;
 403   pattern->allocated = 0;
 404   pattern->fastmap = regex->fastmap;
 405   pattern->translate = ignore_case ? folded_chars : nullptr;
 406
 407   message = re_compile_pattern (string, strlen (string), pattern);
 408   if (message)
 409     error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 410
 411   /* The fastmap should be compiled before 're_match'.  The following
 412      call is not mandatory, because 're_search' is always called sooner,
 413      and it compiles the fastmap if this has not been done yet.  */
 414
 415   re_compile_fastmap (pattern);
 416 }
 417
 418 /*------------------------------------------------------------------------.
 419 | This will initialize various tables for pattern match and compiles some |
 420 | regexps.                                                                |
 421 `------------------------------------------------------------------------*/
 422
 423 static void
 424 initialize_regex (void)
 425 {
 426   int character;                /* character value */
 427
 428   /* Initialize the case folding table.  */
 429
 430   if (ignore_case)
 431     for (character = 0; character < CHAR_SET_SIZE; character++)
 432       folded_chars[character] = toupper (character);
 433
 434   /* Unless the user already provided a description of the end of line or
 435      end of sentence sequence, select an end of line sequence to compile.
 436      If the user provided an empty definition, thus disabling end of line
 437      or sentence feature, make it null to speed up tests.  If GNU
 438      extensions are enabled, use end of sentence like in GNU emacs.  If
 439      disabled, use end of lines.  */
 440
 441   if (context_regex.string)
 442     {
 443       if (!*context_regex.string)
 444         context_regex.string = nullptr;
 445     }
 446   else if (gnu_extensions && !input_reference)
 447     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 448   else
 449     context_regex.string = "\n";
 450
 451   if (context_regex.string)
 452     compile_regex (&context_regex);
 453
 454   /* If the user has already provided a non-empty regexp to describe
 455      words, compile it.  Else, unless this has already been done through
 456      a user provided Break character file, construct a fastmap of
 457      characters that may appear in a word.  If GNU extensions enabled,
 458      include only letters of the underlying character set.  If disabled,
 459      include almost everything, even punctuation; stop only on white
 460      space.  */
 461
 462   if (word_regex.string)
 463     compile_regex (&word_regex);
 464   else if (!break_file)
 465     {
 466       if (gnu_extensions)
 467         {
 468
 469           /* Simulate \w+.  */
 470
 471           for (character = 0; character < CHAR_SET_SIZE; character++)
 472             word_fastmap[character] = !! isalpha (character);
 473         }
 474       else
 475         {
 476
 477           /* Simulate [^ \t\n]+.  */
 478
 479           memset (word_fastmap, 1, CHAR_SET_SIZE);
 480           word_fastmap[' '] = 0;
 481           word_fastmap['\t'] = 0;
 482           word_fastmap['\n'] = 0;
 483         }
 484     }
 485 }
 486
 487 /*------------------------------------------------------------------------.
 488 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 489 | contiguous region of memory and return a description of it into BLOCK.  |
 490 | Standard input is assumed whenever FILE_NAME is null, empty or "-".     |
 491 |                                                                         |
 492 | Previously, in some cases, white space compression was attempted while  |
 493 | inputting text.  This was defeating some regexps like default end of    |
 494 | sentence, which checks for two consecutive spaces.  If white space      |
 495 | compression is ever reinstated, it should be in output routines.        |
 496 `------------------------------------------------------------------------*/
 497
 498 static void
 499 swallow_file_in_memory (char const *file_name, BLOCK *block)
 500 {
 501   size_t used_length;           /* used length in memory buffer */
 502
 503   /* As special cases, a file name which is null or "-" indicates standard
 504      input, which is already opened.  In all other cases, open the file from
 505      its name.  */
 506   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 507   if (using_stdin)
 508     block->start = fread_file (stdin, 0, &used_length);
 509   else
 510     block->start = read_file (file_name, 0, &used_length);
 511
 512   if (!block->start)
 513     error (EXIT_FAILURE, errno, "%s", quotef (using_stdin ? "-" : file_name));
 514
 515   if (using_stdin)
 516     clearerr (stdin);
 517
 518   block->end = block->start + used_length;
 519 }
 520
 521 /* Sort and search routines.  */
 522
 523 /*--------------------------------------------------------------------------.
 524 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 525 | Return less than 0 if the first word goes before the second; return       |
 526 | greater than 0 if the first word goes after the second.                   |
 527 |                                                                           |
 528 | If a word is indeed a prefix of the other, the shorter should go first.   |
 529 `--------------------------------------------------------------------------*/
 530
 531 static int
 532 compare_words (const void *void_first, const void *void_second)
 533 {
 534 #define first ((const WORD *) void_first)
 535 #define second ((const WORD *) void_second)
 536   ptrdiff_t length;             /* minimum of two lengths */
 537   ptrdiff_t counter;            /* cursor in words */
 538   int value;                    /* value of comparison */
 539
 540   length = first->size < second->size ? first->size : second->size;
 541
 542   if (ignore_case)
 543     {
 544       for (counter = 0; counter < length; counter++)
 545         {
 546           value = (folded_chars [to_uchar (first->start[counter])]
 547                    - folded_chars [to_uchar (second->start[counter])]);
 548           if (value != 0)
 549             return value;
 550         }
 551     }
 552   else
 553     {
 554       for (counter = 0; counter < length; counter++)
 555         {
 556           value = (to_uchar (first->start[counter])
 557                    - to_uchar (second->start[counter]));
 558           if (value != 0)
 559             return value;
 560         }
 561     }
 562
 563   return (first->size > second->size) - (first->size < second->size);
 564 #undef first
 565 #undef second
 566 }
 567
 568 /*-----------------------------------------------------------------------.
 569 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 570 | go first.  In case of a tie, preserve the original order through a     |
 571 | pointer comparison.                                                    |
 572 `-----------------------------------------------------------------------*/
 573
 574 static int
 575 compare_occurs (const void *void_first, const void *void_second)
 576 {
 577 #define first ((const OCCURS *) void_first)
 578 #define second ((const OCCURS *) void_second)
 579   int value;
 580
 581   value = compare_words (&first->key, &second->key);
 582   return (value ? value
 583           : ((first->key.start > second->key.start)
 584              - (first->key.start < second->key.start)));
 585 #undef first
 586 #undef second
 587 }
 588
 589 /* True if WORD appears in TABLE.  Uses a binary search.  */
 590
 591 ATTRIBUTE_PURE
 592 static bool
 593 search_table (WORD *word, WORD_TABLE *table)
 594 {
 595   ptrdiff_t lowest;             /* current lowest possible index */
 596   ptrdiff_t highest;            /* current highest possible index */
 597   ptrdiff_t middle;             /* current middle index */
 598   int value;                    /* value from last comparison */
 599
 600   lowest = 0;
 601   highest = table->length - 1;
 602   while (lowest <= highest)
 603     {
 604       middle = (lowest + highest) / 2;
 605       value = compare_words (word, table->start + middle);
 606       if (value < 0)
 607         highest = middle - 1;
 608       else if (value > 0)
 609         lowest = middle + 1;
 610       else
 611         return true;
 612     }
 613   return false;
 614 }
 615
 616 /*---------------------------------------------------------------------.
 617 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 618 | take intermediate copies of table elements, so the sort will be      |
 619 | stabilized throughout the comparison routine.                        |
 620 `---------------------------------------------------------------------*/
 621
 622 static void
 623 sort_found_occurs (void)
 624 {
 625
 626   /* Only one language for the time being.  */
 627   if (number_of_occurs[0])
 628     qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 629            compare_occurs);
 630 }
 631
 632 /* Parameter files reading routines.  */
 633
 634 /*----------------------------------------------------------------------.
 635 | Read a file named FILE_NAME, containing a set of break characters.    |
 636 | Build a content to the array word_fastmap in which all characters are |
 637 | allowed except those found in the file.  Characters may be repeated.  |
 638 `----------------------------------------------------------------------*/
 639
 640 static void
 641 digest_break_file (char const *file_name)
 642 {
 643   BLOCK file_contents;          /* to receive a copy of the file */
 644   char *cursor;                 /* cursor in file copy */
 645
 646   swallow_file_in_memory (file_name, &file_contents);
 647
 648   /* Make the fastmap and record the file contents in it.  */
 649
 650   memset (word_fastmap, 1, CHAR_SET_SIZE);
 651   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 652     word_fastmap[to_uchar (*cursor)] = 0;
 653
 654   if (!gnu_extensions)
 655     {
 656
 657       /* If GNU extensions are enabled, the only way to avoid newline as
 658          a break character is to write all the break characters in the
 659          file with no newline at all, not even at the end of the file.
 660          If disabled, spaces, tabs and newlines are always considered as
 661          break characters even if not included in the break file.  */
 662
 663       word_fastmap[' '] = 0;
 664       word_fastmap['\t'] = 0;
 665       word_fastmap['\n'] = 0;
 666     }
 667
 668   /* Return the space of the file, which is no more required.  */
 669
 670   free (file_contents.start);
 671 }
 672
 673 /*-----------------------------------------------------------------------.
 674 | Read a file named FILE_NAME, containing one word per line, then        |
 675 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 676 | swallows the whole file in memory; this is at the expense of space     |
 677 | needed for newlines, which are useless; however, the reading is fast.  |
 678 `-----------------------------------------------------------------------*/
 679
 680 static void
 681 digest_word_file (char const *file_name, WORD_TABLE *table)
 682 {
 683   BLOCK file_contents;          /* to receive a copy of the file */
 684   char *cursor;                 /* cursor in file copy */
 685   char *word_start;             /* start of the current word */
 686
 687   swallow_file_in_memory (file_name, &file_contents);
 688
 689   table->start = nullptr;
 690   table->alloc = 0;
 691   table->length = 0;
 692
 693   /* Read the whole file.  */
 694
 695   cursor = file_contents.start;
 696   while (cursor < file_contents.end)
 697     {
 698
 699       /* Read one line, and save the word in contains.  */
 700
 701       word_start = cursor;
 702       while (cursor < file_contents.end && *cursor != '\n')
 703         cursor++;
 704
 705       /* Record the word in table if it is not empty.  */
 706
 707       if (cursor > word_start)
 708         {
 709           if (table->length == table->alloc)
 710             table->start = x2nrealloc (table->start, &table->alloc,
 711                                        sizeof *table->start);
 712           table->start[table->length].start = word_start;
 713           table->start[table->length].size = cursor - word_start;
 714           table->length++;
 715         }
 716
 717       /* This test allows for an incomplete line at end of file.  */
 718
 719       if (cursor < file_contents.end)
 720         cursor++;
 721     }
 722
 723   /* Finally, sort all the words read.  */
 724
 725   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 726 }
 727
 728 /* Keyword recognition and selection.  */
 729
 730 /*----------------------------------------------------------------------.
 731 | For each keyword in the source text, constructs an OCCURS structure.  |
 732 `----------------------------------------------------------------------*/
 733
 734 static void
 735 find_occurs_in_text (int file_index)
 736 {
 737   char *cursor;                 /* for scanning the source text */
 738   char *scan;                   /* for scanning the source text also */
 739   char *line_start;             /* start of the current input line */
 740   char *line_scan;              /* newlines scanned until this point */
 741   ptrdiff_t reference_length;   /* length of reference in input mode */
 742   WORD possible_key;            /* possible key, to ease searches */
 743   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 744
 745   char *context_start;          /* start of left context */
 746   char *context_end;            /* end of right context */
 747   char *word_start;             /* start of word */
 748   char *word_end;               /* end of word */
 749   char *next_context_start;     /* next start of left context */
 750
 751   const BLOCK *text_buffer = &text_buffers[file_index];
 752
 753   /* reference_length is always used within 'if (input_reference)'.
 754      However, GNU C diagnoses that it may be used uninitialized.  The
 755      following assignment is merely to shut it up.  */
 756
 757   reference_length = 0;
 758
 759   /* Tracking where lines start is helpful for reference processing.  In
 760      auto reference mode, this allows counting lines.  In input reference
 761      mode, this permits finding the beginning of the references.
 762
 763      The first line begins with the file, skip immediately this very first
 764      reference in input reference mode, to help further rejection any word
 765      found inside it.  Also, unconditionally assigning these variable has
 766      the happy effect of shutting up lint.  */
 767
 768   line_start = text_buffer->start;
 769   line_scan = line_start;
 770   if (input_reference)
 771     {
 772       SKIP_NON_WHITE (line_scan, text_buffer->end);
 773       reference_length = line_scan - line_start;
 774       SKIP_WHITE (line_scan, text_buffer->end);
 775     }
 776
 777   /* Process the whole buffer, one line or one sentence at a time.  */
 778
 779   for (cursor = text_buffer->start;
 780        cursor < text_buffer->end;
 781        cursor = next_context_start)
 782     {
 783
 784       /* 'context_start' gets initialized before the processing of each
 785          line, or once for the whole buffer if no end of line or sentence
 786          sequence separator.  */
 787
 788       context_start = cursor;
 789
 790       /* If an end of line or end of sentence sequence is defined and
 791          non-empty, 'next_context_start' will be recomputed to be the end of
 792          each line or sentence, before each one is processed.  If no such
 793          sequence, then 'next_context_start' is set at the end of the whole
 794          buffer, which is then considered to be a single line or sentence.
 795          This test also accounts for the case of an incomplete line or
 796          sentence at the end of the buffer.  */
 797
 798       next_context_start = text_buffer->end;
 799       if (context_regex.string)
 800         switch (re_search (&context_regex.pattern, cursor,
 801                            text_buffer->end - cursor,
 802                            0, text_buffer->end - cursor, &context_regs))
 803           {
 804           case -2:
 805             matcher_error ();
 806
 807           case -1:
 808             break;
 809
 810           case 0:
 811             error (EXIT_FAILURE, 0,
 812                    _("error: regular expression has a match of length zero:"
 813                      " %s"),
 814                    quote (context_regex.string));
 815
 816           default:
 817             next_context_start = cursor + context_regs.end[0];
 818             break;
 819           }
 820
 821       /* Include the separator into the right context, but not any suffix
 822          white space in this separator; this insures it will be seen in
 823          output and will not take more space than necessary.  */
 824
 825       context_end = next_context_start;
 826       SKIP_WHITE_BACKWARDS (context_end, context_start);
 827
 828       /* Read and process a single input line or sentence, one word at a
 829          time.  */
 830
 831       while (true)
 832         {
 833           if (word_regex.string)
 834
 835             /* If a word regexp has been compiled, use it to skip at the
 836                beginning of the next word.  If there is no such word, exit
 837                the loop.  */
 838
 839             {
 840               regoff_t r = re_search (&word_regex.pattern, cursor,
 841                                       context_end - cursor,
 842                                       0, context_end - cursor, &word_regs);
 843               if (r == -2)
 844                 matcher_error ();
 845               if (r == -1)
 846                 break;
 847               word_start = cursor + word_regs.start[0];
 848               word_end = cursor + word_regs.end[0];
 849             }
 850           else
 851
 852             /* Avoid re_search and use the fastmap to skip to the
 853                beginning of the next word.  If there is no more word in
 854                the buffer, exit the loop.  */
 855
 856             {
 857               scan = cursor;
 858               while (scan < context_end
 859                      && !word_fastmap[to_uchar (*scan)])
 860                 scan++;
 861
 862               if (scan == context_end)
 863                 break;
 864
 865               word_start = scan;
 866
 867               while (scan < context_end
 868                      && word_fastmap[to_uchar (*scan)])
 869                 scan++;
 870
 871               word_end = scan;
 872             }
 873
 874           /* Skip right to the beginning of the found word.  */
 875
 876           cursor = word_start;
 877
 878           /* Skip any zero length word.  Just advance a single position,
 879              then go fetch the next word.  */
 880
 881           if (word_end == word_start)
 882             {
 883               cursor++;
 884               continue;
 885             }
 886
 887           /* This is a genuine, non empty word, so save it as a possible
 888              key.  Then skip over it.  Also, maintain the maximum length of
 889              all words read so far.  It is mandatory to take the maximum
 890              length of all words in the file, without considering if they
 891              are actually kept or rejected, because backward jumps at output
 892              generation time may fall in *any* word.  */
 893
 894           possible_key.start = cursor;
 895           possible_key.size = word_end - word_start;
 896           cursor += possible_key.size;
 897
 898           if (possible_key.size > maximum_word_length)
 899             maximum_word_length = possible_key.size;
 900
 901           /* In input reference mode, update 'line_start' from its previous
 902              value.  Count the lines just in case auto reference mode is
 903              also selected. If it happens that the word just matched is
 904              indeed part of a reference; just ignore it.  */
 905
 906           if (input_reference)
 907             {
 908               while (line_scan < possible_key.start)
 909                 if (*line_scan == '\n')
 910                   {
 911                     total_line_count++;
 912                     line_scan++;
 913                     line_start = line_scan;
 914                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 915                     reference_length = line_scan - line_start;
 916                   }
 917                 else
 918                   line_scan++;
 919               if (line_scan > possible_key.start)
 920                 continue;
 921             }
 922
 923           /* Ignore the word if an 'Ignore words' table exists and if it is
 924              part of it.  Also ignore the word if an 'Only words' table and
 925              if it is *not* part of it.
 926
 927              It is allowed that both tables be used at once, even if this
 928              may look strange for now.  Just ignore a word that would appear
 929              in both.  If regexps are eventually implemented for these
 930              tables, the Ignore table could then reject words that would
 931              have been previously accepted by the Only table.  */
 932
 933           if (ignore_file && search_table (&possible_key, &ignore_table))
 934             continue;
 935           if (only_file && !search_table (&possible_key, &only_table))
 936             continue;
 937
 938           /* A non-empty word has been found.  First of all, insure
 939              proper allocation of the next OCCURS, and make a pointer to
 940              where it will be constructed.  */
 941
 942           if (number_of_occurs[0] == occurs_alloc[0])
 943             occurs_table[0] = x2nrealloc (occurs_table[0],
 944                                           &occurs_alloc[0],
 945                                           sizeof *occurs_table[0]);
 946           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 947
 948           /* Define the reference field, if any.  */
 949
 950           if (auto_reference)
 951             {
 952
 953               /* While auto referencing, update 'line_start' from its
 954                  previous value, counting lines as we go.  If input
 955                  referencing at the same time, 'line_start' has been
 956                  advanced earlier, and the following loop is never really
 957                  executed.  */
 958
 959               while (line_scan < possible_key.start)
 960                 if (*line_scan == '\n')
 961                   {
 962                     total_line_count++;
 963                     line_scan++;
 964                     line_start = line_scan;
 965                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 966                   }
 967                 else
 968                   line_scan++;
 969
 970               occurs_cursor->reference = total_line_count;
 971             }
 972           else if (input_reference)
 973             {
 974
 975               /* If only input referencing, 'line_start' has been computed
 976                  earlier to detect the case the word matched would be part
 977                  of the reference.  The reference position is simply the
 978                  value of 'line_start'.  */
 979
 980               occurs_cursor->reference = line_start - possible_key.start;
 981               if (reference_length > reference_max_width)
 982                 reference_max_width = reference_length;
 983             }
 984
 985           /* Exclude the reference from the context in simple cases.  */
 986
 987           if (input_reference && line_start == context_start)
 988             {
 989               SKIP_NON_WHITE (context_start, context_end);
 990               SKIP_WHITE (context_start, context_end);
 991             }
 992
 993           /* Completes the OCCURS structure.  */
 994
 995           occurs_cursor->key = possible_key;
 996           occurs_cursor->left = context_start - possible_key.start;
 997           occurs_cursor->right = context_end - possible_key.start;
 998           occurs_cursor->file_index = file_index;
 999
1000           number_of_occurs[0]++;
1001         }
1002     }
1003 }
1004
1005 /* Formatting and actual output - service routines.  */
1006
/*------------------------------------------------------.
| Write NUMBER spaces to stdout (none if NUMBER <= 0).  |
`------------------------------------------------------*/

static void
print_spaces (ptrdiff_t number)
{
  ptrdiff_t remaining = number;

  /* Count down instead of up; a non-positive request never enters the
     loop.  */
  while (remaining > 0)
    {
      putchar (' ');
      remaining--;
    }
}
1017
1018 /*-------------------------------------.
1019 | Prints the field provided by FIELD.  |
1020 `-------------------------------------*/
1021
1022 static void
1023 print_field (BLOCK field)
1024 {
1025   char *cursor;                 /* Cursor in field to print */
1026
1027   /* Whitespace is not really compressed.  Instead, each white space
1028      character (tab, vt, ht etc.) is printed as one single space.  */
1029
1030   for (cursor = field.start; cursor < field.end; cursor++)
1031     {
1032       unsigned char character = *cursor;
1033       if (edited_flag[character])
1034         {
1035           /* Handle cases which are specific to 'roff' or TeX.  All
1036              white space processing is done as the default case of
1037              this switch.  */
1038
1039           switch (character)
1040             {
1041             case '"':
1042               /* In roff output format, double any quote.  */
1043               putchar ('"');
1044               putchar ('"');
1045               break;
1046
1047             case '$':
1048             case '%':
1049             case '&':
1050             case '#':
1051             case '_':
1052               /* In TeX output format, precede these with a backslash.  */
1053               putchar ('\\');
1054               putchar (character);
1055               break;
1056
1057             case '{':
1058             case '}':
1059               /* In TeX output format, precede these with a backslash and
1060                  force mathematical mode.  */
1061               printf ("$\\%c$", character);
1062               break;
1063
1064             case '\\':
1065               /* In TeX output mode, request production of a backslash.  */
1066               fputs ("\\backslash{}", stdout);
1067               break;
1068
1069             default:
1070               /* Any other flagged character produces a single space.  */
1071               putchar (' ');
1072             }
1073         }
1074       else
1075         putchar (*cursor);
1076     }
1077 }
1078
1079 /* Formatting and actual output - planning routines.  */
1080
1081 /*--------------------------------------------------------------------.
1082 | From information collected from command line options and input file |
1083 | readings, compute and fix some output parameter values.             |
1084 `--------------------------------------------------------------------*/
1085
1086 static void
1087 fix_output_parameters (void)
1088 {
1089   size_t file_index;            /* index in text input file arrays */
1090   intmax_t line_ordinal;        /* line ordinal value for reference */
1091   ptrdiff_t reference_width;    /* width for the whole reference */
1092   int character;                /* character ordinal */
1093   char const *cursor;           /* cursor in some constant strings */
1094
1095   /* In auto reference mode, the maximum width of this field is
1096      precomputed and subtracted from the overall line width.  Add one for
1097      the column which separate the file name from the line number.  */
1098
1099   if (auto_reference)
1100     {
1101       reference_max_width = 0;
1102       for (file_index = 0; file_index < number_input_files; file_index++)
1103         {
1104           line_ordinal = file_line_count[file_index] + 1;
1105           if (file_index > 0)
1106             line_ordinal -= file_line_count[file_index - 1];
1107           char ordinal_string[INT_BUFSIZE_BOUND (intmax_t)];
1108           reference_width = sprintf (ordinal_string, "%jd", line_ordinal);
1109           if (input_file_name[file_index])
1110             reference_width += strlen (input_file_name[file_index]);
1111           if (reference_width > reference_max_width)
1112             reference_max_width = reference_width;
1113         }
1114       reference_max_width++;
1115       reference.start = xmalloc (reference_max_width + 1);
1116     }
1117
1118   /* If the reference appears to the left of the output line, reserve some
1119      space for it right away, including one gap size.  */
1120
1121   if ((auto_reference || input_reference) && !right_reference)
1122     line_width -= reference_max_width + gap_size;
1123   if (line_width < 0)
1124     line_width = 0;
1125
1126   /* The output lines, minimally, will contain from left to right a left
1127      context, a gap, and a keyword followed by the right context with no
1128      special intervening gap.  Half of the line width is dedicated to the
1129      left context and the gap, the other half is dedicated to the keyword
1130      and the right context; these values are computed once and for all here.
1131      There also are tail and head wrap around fields, used when the keyword
1132      is near the beginning or the end of the line, or when some long word
1133      cannot fit in, but leave place from wrapped around shorter words.  The
1134      maximum width of these fields are recomputed separately for each line,
1135      on a case by case basis.  It is worth noting that it cannot happen that
1136      both the tail and head fields are used at once.  */
1137
1138   half_line_width = line_width / 2;
1139   before_max_width = half_line_width - gap_size;
1140   keyafter_max_width = half_line_width;
1141
1142   /* If truncation_string is the empty string, make it null to speed up
1143      tests.  In this case, truncation_string_length will never get used, so
1144      there is no need to set it.  */
1145
1146   if (truncation_string && *truncation_string)
1147     truncation_string_length = strlen (truncation_string);
1148   else
1149     truncation_string = nullptr;
1150
1151   if (gnu_extensions)
1152     {
1153
1154       /* When flagging truncation at the left of the keyword, the
1155          truncation mark goes at the beginning of the before field,
1156          unless there is a head field, in which case the mark goes at the
1157          left of the head field.  When flagging truncation at the right
1158          of the keyword, the mark goes at the end of the keyafter field,
1159          unless there is a tail field, in which case the mark goes at the
1160          end of the tail field.  Only eight combination cases could arise
1161          for truncation marks:
1162
1163          . None.
1164          . One beginning the before field.
1165          . One beginning the head field.
1166          . One ending the keyafter field.
1167          . One ending the tail field.
1168          . One beginning the before field, another ending the keyafter field.
1169          . One ending the tail field, another beginning the before field.
1170          . One ending the keyafter field, another beginning the head field.
1171
1172          So, there is at most two truncation marks, which could appear both
1173          on the left side of the center of the output line, both on the
1174          right side, or one on either side.  */
1175
1176       before_max_width -= 2 * truncation_string_length;
1177       if (before_max_width < 0)
1178         before_max_width = 0;
1179       keyafter_max_width -= 2 * truncation_string_length;
1180     }
1181   else
1182     {
1183
1184       /* I never figured out exactly how UNIX' ptx plans the output width
1185          of its various fields.  If GNU extensions are disabled, do not
1186          try computing the field widths correctly; instead, use the
1187          following formula, which does not completely imitate UNIX' ptx,
1188          but almost.  */
1189
1190       keyafter_max_width -= 2 * truncation_string_length + 1;
1191     }
1192
1193   /* Compute which characters need special output processing.  Initialize
1194      by flagging any white space character.  Some systems do not consider
1195      form feed as a space character, but we do.  */
1196
1197   for (character = 0; character < CHAR_SET_SIZE; character++)
1198     edited_flag[character] = !! isspace (character);
1199   edited_flag['\f'] = 1;
1200
1201   /* Complete the special character flagging according to selected output
1202      format.  */
1203
1204   switch (output_format)
1205     {
1206     case UNKNOWN_FORMAT:
1207       /* Should never happen.  */
1208
1209     case DUMB_FORMAT:
1210       break;
1211
1212     case ROFF_FORMAT:
1213
1214       /* 'Quote' characters should be doubled.  */
1215
1216       edited_flag['"'] = 1;
1217       break;
1218
1219     case TEX_FORMAT:
1220
1221       /* Various characters need special processing.  */
1222
1223       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1224         edited_flag[to_uchar (*cursor)] = 1;
1225
1226       break;
1227     }
1228 }
1229
1230 /*------------------------------------------------------------------.
1231 | Compute the position and length of all the output fields, given a |
1232 | pointer to some OCCURS.                                           |
1233 `------------------------------------------------------------------*/
1234
1235 static void
1236 define_all_fields (OCCURS *occurs)
1237 {
1238   ptrdiff_t tail_max_width;     /* allowable width of tail field */
1239   ptrdiff_t head_max_width;     /* allowable width of head field */
1240   char *cursor;                 /* running cursor in source text */
1241   char *left_context_start;     /* start of left context */
1242   char *right_context_end;      /* end of right context */
1243   char *left_field_start;       /* conservative start for 'head'/'before' */
1244   char const *file_name;        /* file name for reference */
1245   intmax_t line_ordinal;        /* line ordinal for reference */
1246   char const *buffer_start;     /* start of buffered file for this occurs */
1247   char const *buffer_end;       /* end of buffered file for this occurs */
1248
1249   /* Define 'keyafter', start of left context and end of right context.
1250      'keyafter' starts at the saved position for keyword and extend to the
1251      right from the end of the keyword, eating separators or full words, but
1252      not beyond maximum allowed width for 'keyafter' field or limit for the
1253      right context.  Suffix spaces will be removed afterwards.  */
1254
1255   keyafter.start = occurs->key.start;
1256   keyafter.end = keyafter.start + occurs->key.size;
1257   left_context_start = keyafter.start + occurs->left;
1258   right_context_end = keyafter.start + occurs->right;
1259
1260   buffer_start = text_buffers[occurs->file_index].start;
1261   buffer_end = text_buffers[occurs->file_index].end;
1262
1263   cursor = keyafter.end;
1264   while (cursor < right_context_end
1265          && cursor <= keyafter.start + keyafter_max_width)
1266     {
1267       keyafter.end = cursor;
1268       SKIP_SOMETHING (cursor, right_context_end);
1269     }
1270   if (cursor <= keyafter.start + keyafter_max_width)
1271     keyafter.end = cursor;
1272
1273   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1274
1275   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1276
1277   /* When the left context is wide, it might take some time to catch up from
1278      the left context boundary to the beginning of the 'head' or 'before'
1279      fields.  So, in this case, to speed the catchup, we jump back from the
1280      keyword, using some secure distance, possibly falling in the middle of
1281      a word.  A secure backward jump would be at least half the maximum
1282      width of a line, plus the size of the longest word met in the whole
1283      input.  We conclude this backward jump by a skip forward of at least
1284      one word.  In this manner, we should not inadvertently accept only part
1285      of a word.  From the reached point, when it will be time to fix the
1286      beginning of 'head' or 'before' fields, we will skip forward words or
1287      delimiters until we get sufficiently near.  */
1288
1289   if (-occurs->left > half_line_width + maximum_word_length)
1290     {
1291       left_field_start
1292         = keyafter.start - (half_line_width + maximum_word_length);
1293       SKIP_SOMETHING (left_field_start, keyafter.start);
1294     }
1295   else
1296     left_field_start = keyafter.start + occurs->left;
1297
1298   /* 'before' certainly ends at the keyword, but not including separating
1299      spaces.  It starts after than the saved value for the left context, by
1300      advancing it until it falls inside the maximum allowed width for the
1301      before field.  There will be no prefix spaces either.  'before' only
1302      advances by skipping single separators or whole words. */
1303
1304   before.start = left_field_start;
1305   before.end = keyafter.start;
1306   SKIP_WHITE_BACKWARDS (before.end, before.start);
1307
1308   while (before.start + before_max_width < before.end)
1309     SKIP_SOMETHING (before.start, before.end);
1310
1311   if (truncation_string)
1312     {
1313       cursor = before.start;
1314       SKIP_WHITE_BACKWARDS (cursor, buffer_start);
1315       before_truncation = cursor > left_context_start;
1316     }
1317   else
1318     before_truncation = false;
1319
1320   SKIP_WHITE (before.start, buffer_end);
1321
1322   /* The tail could not take more columns than what has been left in the
1323      left context field, and a gap is mandatory.  It starts after the
1324      right context, and does not contain prefixed spaces.  It ends at
1325      the end of line, the end of buffer or when the tail field is full,
1326      whichever comes first.  It cannot contain only part of a word, and
1327      has no suffixed spaces.  */
1328
1329   tail_max_width
1330     = before_max_width - (before.end - before.start) - gap_size;
1331
1332   if (tail_max_width > 0)
1333     {
1334       tail.start = keyafter.end;
1335       SKIP_WHITE (tail.start, buffer_end);
1336
1337       tail.end = tail.start;
1338       cursor = tail.end;
1339       while (cursor < right_context_end
1340              && cursor < tail.start + tail_max_width)
1341         {
1342           tail.end = cursor;
1343           SKIP_SOMETHING (cursor, right_context_end);
1344         }
1345
1346       if (cursor < tail.start + tail_max_width)
1347         tail.end = cursor;
1348
1349       if (tail.end > tail.start)
1350         {
1351           keyafter_truncation = false;
1352           tail_truncation = truncation_string && tail.end < right_context_end;
1353         }
1354       else
1355         tail_truncation = false;
1356
1357       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1358     }
1359   else
1360     {
1361
1362       /* No place left for a tail field.  */
1363
1364       tail.start = nullptr;
1365       tail.end = nullptr;
1366       tail_truncation = false;
1367     }
1368
1369   /* 'head' could not take more columns than what has been left in the right
1370      context field, and a gap is mandatory.  It ends before the left
1371      context, and does not contain suffixed spaces.  Its pointer is advanced
1372      until the head field has shrunk to its allowed width.  It cannot
1373      contain only part of a word, and has no suffixed spaces.  */
1374
1375   head_max_width
1376     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1377
1378   if (head_max_width > 0)
1379     {
1380       head.end = before.start;
1381       SKIP_WHITE_BACKWARDS (head.end, buffer_start);
1382
1383       head.start = left_field_start;
1384       while (head.start + head_max_width < head.end)
1385         SKIP_SOMETHING (head.start, head.end);
1386
1387       if (head.end > head.start)
1388         {
1389           before_truncation = false;
1390           head_truncation = (truncation_string
1391                              && head.start > left_context_start);
1392         }
1393       else
1394         head_truncation = false;
1395
1396       SKIP_WHITE (head.start, head.end);
1397     }
1398   else
1399     {
1400
1401       /* No place left for a head field.  */
1402
1403       head.start = nullptr;
1404       head.end = nullptr;
1405       head_truncation = false;
1406     }
1407
1408   if (auto_reference)
1409     {
1410
1411       /* Construct the reference text in preallocated space from the file
1412          name and the line number.  Standard input yields an empty file name.
1413          Ensure line numbers are 1 based, even if they are computed 0 based.  */
1414
1415       file_name = input_file_name[occurs->file_index];
1416       if (!file_name)
1417         file_name = "";
1418
1419       line_ordinal = occurs->reference + 1;
1420       if (occurs->file_index > 0)
1421         line_ordinal -= file_line_count[occurs->file_index - 1];
1422
1423       char *file_end = stpcpy (reference.start, file_name);
1424       reference.end = file_end + sprintf (file_end, ":%jd", line_ordinal);
1425     }
1426   else if (input_reference)
1427     {
1428
1429       /* Reference starts at saved position for reference and extends right
1430          until some white space is met.  */
1431
1432       reference.start = keyafter.start + occurs->reference;
1433       reference.end = reference.start;
1434       SKIP_NON_WHITE (reference.end, right_context_end);
1435     }
1436 }
1437
1438 /* Formatting and actual output - control routines.  */
1439
1440 /*----------------------------------------------------------------------.
1441 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1442 `----------------------------------------------------------------------*/
1443
1444 static void
1445 output_one_roff_line (void)
1446 {
1447   /* Output the 'tail' field.  */
1448
1449   printf (".%s \"", macro_name);
1450   print_field (tail);
1451   if (tail_truncation)
1452     fputs (truncation_string, stdout);
1453   putchar ('"');
1454
1455   /* Output the 'before' field.  */
1456
1457   fputs (" \"", stdout);
1458   if (before_truncation)
1459     fputs (truncation_string, stdout);
1460   print_field (before);
1461   putchar ('"');
1462
1463   /* Output the 'keyafter' field.  */
1464
1465   fputs (" \"", stdout);
1466   print_field (keyafter);
1467   if (keyafter_truncation)
1468     fputs (truncation_string, stdout);
1469   putchar ('"');
1470
1471   /* Output the 'head' field.  */
1472
1473   fputs (" \"", stdout);
1474   if (head_truncation)
1475     fputs (truncation_string, stdout);
1476   print_field (head);
1477   putchar ('"');
1478
1479   /* Conditionally output the 'reference' field.  */
1480
1481   if (auto_reference || input_reference)
1482     {
1483       fputs (" \"", stdout);
1484       print_field (reference);
1485       putchar ('"');
1486     }
1487
1488   putchar ('\n');
1489 }
1490
1491 /*---------------------------------------------------------.
1492 | Output the current output fields as one line for 'TeX'.  |
1493 `---------------------------------------------------------*/
1494
1495 static void
1496 output_one_tex_line (void)
1497 {
1498   BLOCK key;                    /* key field, isolated */
1499   BLOCK after;                  /* after field, isolated */
1500   char *cursor;                 /* running cursor in source text */
1501
1502   printf ("\\%s ", macro_name);
1503   putchar ('{');
1504   print_field (tail);
1505   fputs ("}{", stdout);
1506   print_field (before);
1507   fputs ("}{", stdout);
1508   key.start = keyafter.start;
1509   after.end = keyafter.end;
1510   cursor = keyafter.start;
1511   SKIP_SOMETHING (cursor, keyafter.end);
1512   key.end = cursor;
1513   after.start = cursor;
1514   print_field (key);
1515   fputs ("}{", stdout);
1516   print_field (after);
1517   fputs ("}{", stdout);
1518   print_field (head);
1519   putchar ('}');
1520   if (auto_reference || input_reference)
1521     {
1522       putchar ('{');
1523       print_field (reference);
1524       putchar ('}');
1525     }
1526   putchar ('\n');
1527 }
1528
1529 /*-------------------------------------------------------------------.
1530 | Output the current output fields as one line for a dumb terminal.  |
1531 `-------------------------------------------------------------------*/
1532
1533 static void
1534 output_one_dumb_line (void)
1535 {
1536   if (!right_reference)
1537     {
1538       if (auto_reference)
1539         {
1540
1541           /* Output the 'reference' field, in such a way that GNU emacs
1542              next-error will handle it.  The ending colon is taken from the
1543              gap which follows.  */
1544
1545           print_field (reference);
1546           putchar (':');
1547           print_spaces (reference_max_width
1548                         + gap_size
1549                         - (reference.end - reference.start)
1550                         - 1);
1551         }
1552       else
1553         {
1554
1555           /* Output the 'reference' field and its following gap.  */
1556
1557           print_field (reference);
1558           print_spaces (reference_max_width
1559                         + gap_size
1560                         - (reference.end - reference.start));
1561         }
1562     }
1563
1564   if (tail.start < tail.end)
1565     {
1566       /* Output the 'tail' field.  */
1567
1568       print_field (tail);
1569       if (tail_truncation)
1570         fputs (truncation_string, stdout);
1571
1572       print_spaces (half_line_width - gap_size
1573                     - (before.end - before.start)
1574                     - (before_truncation ? truncation_string_length : 0)
1575                     - (tail.end - tail.start)
1576                     - (tail_truncation ? truncation_string_length : 0));
1577     }
1578   else
1579     print_spaces (half_line_width - gap_size
1580                   - (before.end - before.start)
1581                   - (before_truncation ? truncation_string_length : 0));
1582
1583   /* Output the 'before' field.  */
1584
1585   if (before_truncation)
1586     fputs (truncation_string, stdout);
1587   print_field (before);
1588
1589   print_spaces (gap_size);
1590
1591   /* Output the 'keyafter' field.  */
1592
1593   print_field (keyafter);
1594   if (keyafter_truncation)
1595     fputs (truncation_string, stdout);
1596
1597   if (head.start < head.end)
1598     {
1599       /* Output the 'head' field.  */
1600
1601       print_spaces (half_line_width
1602                     - (keyafter.end - keyafter.start)
1603                     - (keyafter_truncation ? truncation_string_length : 0)
1604                     - (head.end - head.start)
1605                     - (head_truncation ? truncation_string_length : 0));
1606       if (head_truncation)
1607         fputs (truncation_string, stdout);
1608       print_field (head);
1609     }
1610   else
1611
1612     if ((auto_reference || input_reference) && right_reference)
1613       print_spaces (half_line_width
1614                     - (keyafter.end - keyafter.start)
1615                     - (keyafter_truncation ? truncation_string_length : 0));
1616
1617   if ((auto_reference || input_reference) && right_reference)
1618     {
1619       /* Output the 'reference' field.  */
1620
1621       print_spaces (gap_size);
1622       print_field (reference);
1623     }
1624
1625   putchar ('\n');
1626 }
1627
1628 /*------------------------------------------------------------------------.
1629 | Scan the whole occurs table and, for each entry, output one line in the |
1630 | appropriate format.                                                     |
1631 `------------------------------------------------------------------------*/
1632
1633 static void
1634 generate_all_output (void)
1635 {
1636   ptrdiff_t occurs_index;       /* index of keyword entry being processed */
1637   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1638
1639   /* The following assignments are useful to provide default values in case
1640      line contexts or references are not used, in which case these variables
1641      would never be computed.  */
1642
1643   tail.start = nullptr;
1644   tail.end = nullptr;
1645   tail_truncation = false;
1646
1647   head.start = nullptr;
1648   head.end = nullptr;
1649   head_truncation = false;
1650
1651   /* Loop over all keyword occurrences.  */
1652
1653   occurs_cursor = occurs_table[0];
1654
1655   for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1656     {
1657       /* Compute the exact size of every field and whenever truncation flags
1658          are present or not.  */
1659
1660       define_all_fields (occurs_cursor);
1661
1662       /* Produce one output line according to selected format.  */
1663
1664       switch (output_format)
1665         {
1666         case UNKNOWN_FORMAT:
1667           /* Should never happen.  */
1668
1669         case DUMB_FORMAT:
1670           output_one_dumb_line ();
1671           break;
1672
1673         case ROFF_FORMAT:
1674           output_one_roff_line ();
1675           break;
1676
1677         case TEX_FORMAT:
1678           output_one_tex_line ();
1679           break;
1680         }
1681
1682       /* Advance the cursor into the occurs table.  */
1683
1684       occurs_cursor++;
1685     }
1686 }
1687
1688 /* Option decoding and main program.  */
1689
1690 /*------------------------------------------------------.
1691 | Print program identification and options, then exit.  |
1692 `------------------------------------------------------*/
1693
1694 void
1695 usage (int status)
1696 {
1697   if (status != EXIT_SUCCESS)
1698     emit_try_help ();
1699   else
1700     {
1701       printf (_("\
1702 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1703   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1704               program_name, program_name);
1705       fputs (_("\
1706 Output a permuted index, including context, of the words in the input files.\n\
1707 "), stdout);
1708
1709       emit_stdin_note ();
1710       emit_mandatory_arg_note ();
1711
1712       fputs (_("\
1713   -A, --auto-reference           output automatically generated references\n\
1714   -G, --traditional              behave more like System V 'ptx'\n\
1715 "), stdout);
1716       fputs (_("\
1717   -F, --flag-truncation=STRING   use STRING for flagging line truncations.\n\
1718                                  The default is '/'\n\
1719 "), stdout);
1720       fputs (_("\
1721   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1722   -O, --format=roff              generate output as roff directives\n\
1723   -R, --right-side-refs          put references at right, not counted in -w\n\
1724   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1725   -T, --format=tex               generate output as TeX directives\n\
1726 "), stdout);
1727       fputs (_("\
1728   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1729   -b, --break-file=FILE          word break characters in this FILE\n\
1730   -f, --ignore-case              fold lower case to upper case for sorting\n\
1731   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1732   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1733   -o, --only-file=FILE           read only word list from this FILE\n\
1734 "), stdout);
1735       fputs (_("\
1736   -r, --references               first field of each line is a reference\n\
1737   -t, --typeset-mode               - not implemented -\n\
1738   -w, --width=NUMBER             output width in columns, reference excluded\n\
1739 "), stdout);
1740       fputs (HELP_OPTION_DESCRIPTION, stdout);
1741       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1742       emit_ancillary_info (PROGRAM_NAME);
1743     }
1744   exit (status);
1745 }
1746
1747 /*----------------------------------------------------------------------.
1748 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1749 | strings, then launch execution.                                       |
1750 `----------------------------------------------------------------------*/
1751
1752 /* Long options equivalences.  */
1753 static struct option const long_options[] =
1754 {
1755   {"auto-reference", no_argument, nullptr, 'A'},
1756   {"break-file", required_argument, nullptr, 'b'},
1757   {"flag-truncation", required_argument, nullptr, 'F'},
1758   {"ignore-case", no_argument, nullptr, 'f'},
1759   {"gap-size", required_argument, nullptr, 'g'},
1760   {"ignore-file", required_argument, nullptr, 'i'},
1761   {"macro-name", required_argument, nullptr, 'M'},
1762   {"only-file", required_argument, nullptr, 'o'},
1763   {"references", no_argument, nullptr, 'r'},
1764   {"right-side-refs", no_argument, nullptr, 'R'},
1765   {"format", required_argument, nullptr, 10},
1766   {"sentence-regexp", required_argument, nullptr, 'S'},
1767   {"traditional", no_argument, nullptr, 'G'},
1768   {"typeset-mode", no_argument, nullptr, 't'},
1769   {"width", required_argument, nullptr, 'w'},
1770   {"word-regexp", required_argument, nullptr, 'W'},
1771   {GETOPT_HELP_OPTION_DECL},
1772   {GETOPT_VERSION_OPTION_DECL},
1773   {nullptr, 0, nullptr, 0},
1774 };
1775
1776 static char const *const format_args[] =
1777 {
1778   "roff", "tex", nullptr
1779 };
1780
1781 static enum Format const format_vals[] =
1782 {
1783   ROFF_FORMAT, TEX_FORMAT
1784 };
1785
1786 int
1787 main (int argc, char **argv)
1788 {
1789   int optchar;                  /* argument character */
1790   int file_index;               /* index in text input file arrays */
1791
1792   /* Decode program options.  */
1793
1794   initialize_main (&argc, &argv);
1795   set_program_name (argv[0]);
1796   setlocale (LC_ALL, "");
1797   bindtextdomain (PACKAGE, LOCALEDIR);
1798   textdomain (PACKAGE);
1799
1800   atexit (close_stdout);
1801
1802 #if HAVE_SETCHRCLASS
1803   setchrclass (nullptr);
1804 #endif
1805
1806   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1807                                 long_options, nullptr),
1808          optchar != EOF)
1809     {
1810       switch (optchar)
1811         {
1812         default:
1813           usage (EXIT_FAILURE);
1814
1815         case 'G':
1816           gnu_extensions = false;
1817           break;
1818
1819         case 'b':
1820           break_file = optarg;
1821           break;
1822
1823         case 'f':
1824           ignore_case = true;
1825           break;
1826
1827         case 'g':
1828           {
1829             intmax_t tmp;
1830             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1831                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1832               error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1833                      quote (optarg));
1834             gap_size = tmp;
1835             break;
1836           }
1837
1838         case 'i':
1839           ignore_file = optarg;
1840           break;
1841
1842         case 'o':
1843           only_file = optarg;
1844           break;
1845
1846         case 'r':
1847           input_reference = true;
1848           break;
1849
1850         case 't':
1851           /* Yet to understand...  */
1852           break;
1853
1854         case 'w':
1855           {
1856             intmax_t tmp;
1857             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1858                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1859               error (EXIT_FAILURE, 0, _("invalid line width: %s"),
1860                      quote (optarg));
1861             line_width = tmp;
1862             break;
1863           }
1864
1865         case 'A':
1866           auto_reference = true;
1867           break;
1868
1869         case 'F':
1870           truncation_string = optarg;
1871           unescape_string (optarg);
1872           break;
1873
1874         case 'M':
1875           macro_name = optarg;
1876           break;
1877
1878         case 'O':
1879           output_format = ROFF_FORMAT;
1880           break;
1881
1882         case 'R':
1883           right_reference = true;
1884           break;
1885
1886         case 'S':
1887           context_regex.string = optarg;
1888           unescape_string (optarg);
1889           break;
1890
1891         case 'T':
1892           output_format = TEX_FORMAT;
1893           break;
1894
1895         case 'W':
1896           word_regex.string = optarg;
1897           unescape_string (optarg);
1898           if (!*word_regex.string)
1899             word_regex.string = nullptr;
1900           break;
1901
1902         case 10:
1903           output_format = XARGMATCH ("--format", optarg,
1904                                      format_args, format_vals);
1905           break;
1906
1907         case_GETOPT_HELP_CHAR;
1908
1909         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1910         }
1911     }
1912
1913   /* Process remaining arguments.  If GNU extensions are enabled, process
1914      all arguments as input parameters.  If disabled, accept at most two
1915      arguments, the second of which is an output parameter.  */
1916
1917   if (optind == argc)
1918     {
1919
1920       /* No more argument simply means: read standard input.  */
1921
1922       input_file_name = xmalloc (sizeof *input_file_name);
1923       file_line_count = xmalloc (sizeof *file_line_count);
1924       text_buffers =    xmalloc (sizeof *text_buffers);
1925       number_input_files = 1;
1926       input_file_name[0] = nullptr;
1927     }
1928   else if (gnu_extensions)
1929     {
1930       number_input_files = argc - optind;
1931       input_file_name = xnmalloc (number_input_files, sizeof *input_file_name);
1932       file_line_count = xnmalloc (number_input_files, sizeof *file_line_count);
1933       text_buffers    = xnmalloc (number_input_files, sizeof *text_buffers);
1934
1935       for (file_index = 0; file_index < number_input_files; file_index++)
1936         {
1937           if (!*argv[optind] || STREQ (argv[optind], "-"))
1938             input_file_name[file_index] = nullptr;
1939           else
1940             input_file_name[file_index] = argv[optind];
1941           optind++;
1942         }
1943     }
1944   else
1945     {
1946
1947       /* There is one necessary input file.  */
1948
1949       number_input_files = 1;
1950       input_file_name = xmalloc (sizeof *input_file_name);
1951       file_line_count = xmalloc (sizeof *file_line_count);
1952       text_buffers    = xmalloc (sizeof *text_buffers);
1953       if (!*argv[optind] || STREQ (argv[optind], "-"))
1954         input_file_name[0] = nullptr;
1955       else
1956         input_file_name[0] = argv[optind];
1957       optind++;
1958
1959       /* Redirect standard output, only if requested.  */
1960
1961       if (optind < argc)
1962         {
1963           if (! freopen (argv[optind], "w", stdout))
1964             error (EXIT_FAILURE, errno, "%s", quotef (argv[optind]));
1965           optind++;
1966         }
1967
1968       /* Diagnose any other argument as an error.  */
1969
1970       if (optind < argc)
1971         {
1972           error (0, 0, _("extra operand %s"), quote (argv[optind]));
1973           usage (EXIT_FAILURE);
1974         }
1975     }
1976
1977   /* If the output format has not been explicitly selected, choose dumb
1978      terminal format if GNU extensions are enabled, else 'roff' format.  */
1979
1980   if (output_format == UNKNOWN_FORMAT)
1981     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
1982
1983   /* Initialize the main tables.  */
1984
1985   initialize_regex ();
1986
1987   /* Read 'Break character' file, if any.  */
1988
1989   if (break_file)
1990     digest_break_file (break_file);
1991
1992   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
1993      these files is empty, reset the name of the file to null, to avoid
1994      unnecessary calls to search_table. */
1995
1996   if (ignore_file)
1997     {
1998       digest_word_file (ignore_file, &ignore_table);
1999       if (ignore_table.length == 0)
2000         ignore_file = nullptr;
2001     }
2002
2003   if (only_file)
2004     {
2005       digest_word_file (only_file, &only_table);
2006       if (only_table.length == 0)
2007         only_file = nullptr;
2008     }
2009
2010   /* Prepare to study all the input files.  */
2011
2012   number_of_occurs[0] = 0;
2013   total_line_count = 0;
2014   maximum_word_length = 0;
2015   reference_max_width = 0;
2016
2017   for (file_index = 0; file_index < number_input_files; file_index++)
2018     {
2019       BLOCK *text_buffer = text_buffers + file_index;
2020
2021       /* Read the file contents into memory, then study it.  */
2022
2023       swallow_file_in_memory (input_file_name[file_index], text_buffer);
2024       find_occurs_in_text (file_index);
2025
2026       /* Maintain for each file how many lines has been read so far when its
2027          end is reached.  Incrementing the count first is a simple kludge to
2028          handle a possible incomplete line at end of file.  */
2029
2030       total_line_count++;
2031       file_line_count[file_index] = total_line_count;
2032     }
2033
2034   /* Do the output process phase.  */
2035
2036   sort_found_occurs ();
2037   fix_output_parameters ();
2038   generate_all_output ();
2039
2040   /* All done.  */
2041
2042   return EXIT_SUCCESS;
2043 }