1 /* Permuted index for GNU, with keywords in their context.
2 Copyright (C) 1990-2015 Free Software Foundation, Inc.
3 François Pinard <pinard@iro.umontreal.ca>, 1988.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 François Pinard <pinard@iro.umontreal.ca> */
23 #include <sys/types.h>
31 #include "read-file.h"
35 /* The official name of this program (e.g., no 'g' prefix). */
36 #define PROGRAM_NAME "ptx"
38 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
39 if "ç" (c-with-cedilla) is available in the translation's character
41 #define AUTHORS proper_name_utf8 ("F. Pinard", "Fran\xc3\xa7ois Pinard")
43 /* Number of possible characters in a byte. */
44 #define CHAR_SET_SIZE 256
46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
48 : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
49 #define OCTTOBIN(C) ((C) - '0')
51 /* Debugging the memory allocator. */
54 # define MALLOC_FUNC_CHECK 1
58 /* Global definitions. */
60 /* FIXME: There are many unchecked integer overflows in this file,
61 that will cause this command to misbehave given large inputs or
62 options. Many of the "int" values below should be "size_t" or
63 something else like that. */
65 /* Program options. */
69 UNKNOWN_FORMAT
, /* output format still unknown */
70 DUMB_FORMAT
, /* output for a dumb terminal */
71 ROFF_FORMAT
, /* output for 'troff' or 'nroff' */
72 TEX_FORMAT
/* output for 'TeX' or 'LaTeX' */
75 static bool gnu_extensions
= true; /* trigger all GNU extensions */
76 static bool auto_reference
= false; /* refs are 'file_name:line_number:' */
77 static bool input_reference
= false; /* refs at beginning of input lines */
78 static bool right_reference
= false; /* output refs after right context */
79 static int line_width
= 72; /* output line width in characters */
80 static int gap_size
= 3; /* number of spaces between output fields */
81 static const char *truncation_string
= "/";
82 /* string used to mark line truncations */
83 static const char *macro_name
= "xx"; /* macro name for roff or TeX output */
84 static enum Format output_format
= UNKNOWN_FORMAT
;
87 static bool ignore_case
= false; /* fold lower to upper for sorting */
88 static const char *break_file
= NULL
; /* name of the 'Break chars' file */
89 static const char *only_file
= NULL
; /* name of the 'Only words' file */
90 static const char *ignore_file
= NULL
; /* name of the 'Ignore words' file */
92 /* Options that use regular expressions. */
95 /* The original regular expression, as a string. */
98 /* The compiled regular expression, and its fastmap. */
99 struct re_pattern_buffer pattern
;
100 char fastmap
[UCHAR_MAX
+ 1];
103 static struct regex_data context_regex
; /* end of context */
104 static struct regex_data word_regex
; /* keyword */
106 /* A BLOCK delimits a region in memory of arbitrary size, like the copy of a
107 whole file. A WORD is something smaller; its length should fit in a
108 short integer. A WORD_TABLE may contain several WORDs. */
112 char *start
; /* pointer to beginning of region */
113 char *end
; /* pointer to end + 1 of region */
119 char *start
; /* pointer to beginning of region */
120 short int size
; /* length of the region */
126 WORD
*start
; /* array of WORDs */
127 size_t alloc
; /* allocated length */
128 size_t length
; /* number of used entries */
132 /* Pattern description tables. */
134 /* For each character, provide its folded equivalent. */
135 static unsigned char folded_chars
[CHAR_SET_SIZE
];
137 /* End of context pattern register indices. */
138 static struct re_registers context_regs
;
140 /* Keyword pattern register indices. */
141 static struct re_registers word_regs
;
143 /* A word-character fastmap is used only when no word regexp has been
144 provided. A word is then made up of a sequence of one or more characters
145 allowed by the fastmap. An entry is nonzero if the character is allowed
146 in a word. Not only is this faster in most cases, but it also simplifies
147 the implementation of the Break files. */
148 static char word_fastmap
[CHAR_SET_SIZE
];
150 /* Maximum length of any word read. */
151 static int maximum_word_length
;
153 /* Maximum width of any reference used. */
154 static int reference_max_width
;
156 /* Ignore and Only word tables. */
158 static WORD_TABLE ignore_table
; /* table of words to ignore */
159 static WORD_TABLE only_table
; /* table of words to select */
161 /* Source text table, and scanning macros. */
163 static int number_input_files
; /* number of text input files */
164 static int total_line_count
; /* total number of lines seen so far */
165 static const char **input_file_name
; /* array of text input file names */
166 static int *file_line_count
; /* array of 'total_line_count' values at end */
168 static BLOCK
*text_buffers
; /* files to study */
170 /* SKIP_NON_WHITE used only for getting or skipping the reference. */
172 #define SKIP_NON_WHITE(cursor, limit) \
173 while (cursor < limit && ! isspace (to_uchar (*cursor))) \
176 #define SKIP_WHITE(cursor, limit) \
177 while (cursor < limit && isspace (to_uchar (*cursor))) \
180 #define SKIP_WHITE_BACKWARDS(cursor, start) \
181 while (cursor > start && isspace (to_uchar (cursor[-1]))) \
184 #define SKIP_SOMETHING(cursor, limit) \
185 if (word_regex.string) \
188 count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
191 cursor += count == -1 ? 1 : count; \
193 else if (word_fastmap[to_uchar (*cursor)]) \
194 while (cursor < limit && word_fastmap[to_uchar (*cursor)]) \
199 /* Occurrences table.
201 The 'keyword' pointer provides the central word, which is surrounded
202 by a left context and a right context. The 'keyword' and 'length'
203 field allow full 8-bit characters keys, even including NULs. At other
204 places in this program, the name 'keyafter' refers to the keyword
205 followed by its right context.
207 The left context does not extend, towards the beginning of the file,
208 further than a distance given by the 'left' value. This value is
209 relative to the keyword beginning; it is usually negative. This
210 ensures that, except for white space, we will never have to scan the
211 source text backward when it is time to generate the final output
214 The right context, indirectly attainable through the keyword end, does
215 not extend, towards the end of the file, further than a distance given
216 by the 'right' value. This value is relative to the keyword
217 beginning, it is usually positive.
219 When automatic references are used, the 'reference' value is the
220 overall line number in all input files read so far, in this case, it
221 is of type (int). When input references are used, the 'reference'
222 value indicates the distance between the keyword beginning and the
223 start of the reference field, it is of type (DELTA) and usually
226 typedef short int DELTA
; /* to hold displacement within one context */
230 WORD key
; /* description of the keyword */
231 DELTA left
; /* distance to left context start */
232 DELTA right
; /* distance to right context end */
233 int reference
; /* reference descriptor */
234 size_t file_index
; /* corresponding file */
238 /* The various OCCURS tables are indexed by the language. But for the
239 time being, there is no such multiple-language support. */
241 static OCCURS
*occurs_table
[1]; /* all words retained from the read text */
242 static size_t occurs_alloc
[1]; /* allocated size of occurs_table */
243 static size_t number_of_occurs
[1]; /* number of used slots in occurs_table */
246 /* Communication among output routines. */
248 /* Indicate if special output processing is requested for each character. */
249 static char edited_flag
[CHAR_SET_SIZE
];
251 static int half_line_width
; /* half of line width, reference excluded */
252 static int before_max_width
; /* maximum width of before field */
253 static int keyafter_max_width
; /* maximum width of keyword-and-after field */
254 static int truncation_string_length
;/* length of string that flags truncation */
256 /* When context is limited by lines, wraparound may happen on final output:
257 the 'head' pointer gives access to some supplementary left context which
258 will be seen at the end of the output line, the 'tail' pointer gives
259 access to some supplementary right context which will be seen at the
260 beginning of the output line. */
262 static BLOCK tail
; /* tail field */
263 static int tail_truncation
; /* flag truncation after the tail field */
265 static BLOCK before
; /* before field */
266 static int before_truncation
; /* flag truncation before the before field */
268 static BLOCK keyafter
; /* keyword-and-after field */
269 static int keyafter_truncation
; /* flag truncation after the keyafter field */
271 static BLOCK head
; /* head field */
272 static int head_truncation
; /* flag truncation before the head field */
274 static BLOCK reference
; /* reference field for input reference mode */
276 /* Miscellaneous routines. */
278 /* Diagnose an error in the regular expression matcher. Then exit. */
280 static void ATTRIBUTE_NORETURN
283 error (0, errno
, _("error in regular expression matcher"));
287 /*------------------------------------------------------.
288 | Duplicate string STRING, while evaluating \-escapes. |
289 `------------------------------------------------------*/
291 /* Loosely adapted from GNU sh-utils printf.c code. */
294 copy_unescaped_string (const char *string
)
296 char *result
; /* allocated result */
297 char *cursor
; /* cursor in result */
298 int value
; /* value of \nnn escape */
299 int length
; /* length of \nnn escape */
301 result
= xmalloc (strlen (string
) + 1);
311 case 'x': /* \xhhh escape, 3 chars maximum */
313 for (length
= 0, string
++;
314 length
< 3 && isxdigit (to_uchar (*string
));
316 value
= value
* 16 + HEXTOBIN (*string
);
326 case '0': /* \0ooo escape, 3 chars maximum */
328 for (length
= 0, string
++;
329 length
< 3 && ISODIGIT (*string
);
331 value
= value
* 8 + OCTTOBIN (*string
);
335 case 'a': /* alert */
344 case 'b': /* backspace */
349 case 'c': /* cancel the rest of the output */
354 case 'f': /* form feed */
359 case 'n': /* new line */
364 case 'r': /* carriage return */
369 case 't': /* horizontal tab */
374 case 'v': /* vertical tab */
383 case '\0': /* lone backslash at end of string */
389 *cursor
++ = *string
++;
394 *cursor
++ = *string
++;
401 /*--------------------------------------------------------------------------.
402 | Compile the regex represented by REGEX, diagnose and abort if any error. |
403 `--------------------------------------------------------------------------*/
406 compile_regex (struct regex_data
*regex
)
408 struct re_pattern_buffer
*pattern
= ®ex
->pattern
;
409 char const *string
= regex
->string
;
412 pattern
->buffer
= NULL
;
413 pattern
->allocated
= 0;
414 pattern
->fastmap
= regex
->fastmap
;
415 pattern
->translate
= ignore_case
? folded_chars
: NULL
;
417 message
= re_compile_pattern (string
, strlen (string
), pattern
);
419 error (EXIT_FAILURE
, 0, _("%s (for regexp %s)"), message
, quote (string
));
421 /* The fastmap should be compiled before 're_match'. The following
422 call is not mandatory, because 're_search' is always called sooner,
423 and it compiles the fastmap if this has not been done yet. */
425 re_compile_fastmap (pattern
);
428 /*------------------------------------------------------------------------.
429 | This will initialize various tables for pattern match and compiles some |
431 `------------------------------------------------------------------------*/
434 initialize_regex (void)
436 int character
; /* character value */
438 /* Initialize the case folding table. */
441 for (character
= 0; character
< CHAR_SET_SIZE
; character
++)
442 folded_chars
[character
] = toupper (character
);
444 /* Unless the user already provided a description of the end of line or
445 end of sentence sequence, select an end of line sequence to compile.
446 If the user provided an empty definition, thus disabling end of line
447 or sentence feature, make it NULL to speed up tests. If GNU
448 extensions are enabled, use end of sentence like in GNU emacs. If
449 disabled, use end of lines. */
451 if (context_regex
.string
)
453 if (!*context_regex
.string
)
454 context_regex
.string
= NULL
;
456 else if (gnu_extensions
&& !input_reference
)
457 context_regex
.string
= "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
459 context_regex
.string
= "\n";
461 if (context_regex
.string
)
462 compile_regex (&context_regex
);
464 /* If the user has already provided a non-empty regexp to describe
465 words, compile it. Else, unless this has already been done through
466 a user provided Break character file, construct a fastmap of
467 characters that may appear in a word. If GNU extensions enabled,
468 include only letters of the underlying character set. If disabled,
469 include almost everything, even punctuations; stop only on white
472 if (word_regex
.string
)
473 compile_regex (&word_regex
);
474 else if (!break_file
)
481 for (character
= 0; character
< CHAR_SET_SIZE
; character
++)
482 word_fastmap
[character
] = !! isalpha (character
);
487 /* Simulate [^ \t\n]+. */
489 memset (word_fastmap
, 1, CHAR_SET_SIZE
);
490 word_fastmap
[' '] = 0;
491 word_fastmap
['\t'] = 0;
492 word_fastmap
['\n'] = 0;
497 /*------------------------------------------------------------------------.
498 | This routine will attempt to swallow a whole file name FILE_NAME into a |
499 | contiguous region of memory and return a description of it into BLOCK. |
500 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
502 | Previously, in some cases, white space compression was attempted while |
503 | inputting text. This was defeating some regexps like default end of |
504 | sentence, which checks for two consecutive spaces. If white space |
505 | compression is ever reinstated, it should be in output routines. |
506 `------------------------------------------------------------------------*/
509 swallow_file_in_memory (const char *file_name
, BLOCK
*block
)
511 size_t used_length
; /* used length in memory buffer */
513 /* As special cases, a file name which is NULL or "-" indicates standard
514 input, which is already opened. In all other cases, open the file from
516 bool using_stdin
= !file_name
|| !*file_name
|| STREQ (file_name
, "-");
518 block
->start
= fread_file (stdin
, &used_length
);
520 block
->start
= read_file (file_name
, &used_length
);
523 error (EXIT_FAILURE
, errno
, "%s", quotef (using_stdin
? "-" : file_name
));
525 block
->end
= block
->start
+ used_length
;
528 /* Sort and search routines. */
530 /*--------------------------------------------------------------------------.
531 | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
532 | Return less than 0 if the first word goes before the second; return |
533 | greater than 0 if the first word goes after the second. |
535 | If a word is indeed a prefix of the other, the shorter should go first. |
536 `--------------------------------------------------------------------------*/
539 compare_words (const void *void_first
, const void *void_second
)
541 #define first ((const WORD *) void_first)
542 #define second ((const WORD *) void_second)
543 int length
; /* minimum of two lengths */
544 int counter
; /* cursor in words */
545 int value
; /* value of comparison */
547 length
= first
->size
< second
->size
? first
->size
: second
->size
;
551 for (counter
= 0; counter
< length
; counter
++)
553 value
= (folded_chars
[to_uchar (first
->start
[counter
])]
554 - folded_chars
[to_uchar (second
->start
[counter
])]);
561 for (counter
= 0; counter
< length
; counter
++)
563 value
= (to_uchar (first
->start
[counter
])
564 - to_uchar (second
->start
[counter
]));
570 return first
->size
- second
->size
;
575 /*-----------------------------------------------------------------------.
576 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
577 | go first. In case of a tie, preserve the original order through a |
578 | pointer comparison. |
579 `-----------------------------------------------------------------------*/
582 compare_occurs (const void *void_first
, const void *void_second
)
584 #define first ((const OCCURS *) void_first)
585 #define second ((const OCCURS *) void_second)
588 value
= compare_words (&first
->key
, &second
->key
);
589 return value
== 0 ? first
->key
.start
- second
->key
.start
: value
;
594 /*------------------------------------------------------------.
595 | Return !0 if WORD appears in TABLE. Uses a binary search. |
596 `------------------------------------------------------------*/
598 static int _GL_ATTRIBUTE_PURE
599 search_table (WORD
*word
, WORD_TABLE
*table
)
601 int lowest
; /* current lowest possible index */
602 int highest
; /* current highest possible index */
603 int middle
; /* current middle index */
604 int value
; /* value from last comparison */
607 highest
= table
->length
- 1;
608 while (lowest
<= highest
)
610 middle
= (lowest
+ highest
) / 2;
611 value
= compare_words (word
, table
->start
+ middle
);
613 highest
= middle
- 1;
622 /*---------------------------------------------------------------------.
623 | Sort the whole occurs table in memory. Presumably, 'qsort' does not |
624 | take intermediate copies of table elements, so the sort will be |
625 | stabilized throughout the comparison routine. |
626 `---------------------------------------------------------------------*/
629 sort_found_occurs (void)
632 /* Only one language for the time being. */
633 if (number_of_occurs
[0])
634 qsort (occurs_table
[0], number_of_occurs
[0], sizeof **occurs_table
,
638 /* Parameter files reading routines. */
640 /*----------------------------------------------------------------------.
641 | Read a file named FILE_NAME, containing a set of break characters. |
642 | Fill in the array word_fastmap so that all characters are |
643 | allowed except those found in the file. Characters may be repeated. |
644 `----------------------------------------------------------------------*/
647 digest_break_file (const char *file_name
)
649 BLOCK file_contents
; /* to receive a copy of the file */
650 char *cursor
; /* cursor in file copy */
652 swallow_file_in_memory (file_name
, &file_contents
);
654 /* Make the fastmap and record the file contents in it. */
656 memset (word_fastmap
, 1, CHAR_SET_SIZE
);
657 for (cursor
= file_contents
.start
; cursor
< file_contents
.end
; cursor
++)
658 word_fastmap
[to_uchar (*cursor
)] = 0;
663 /* If GNU extensions are enabled, the only way to avoid newline as
664 a break character is to write all the break characters in the
665 file with no newline at all, not even at the end of the file.
666 If disabled, spaces, tabs and newlines are always considered as
667 break characters even if not included in the break file. */
669 word_fastmap
[' '] = 0;
670 word_fastmap
['\t'] = 0;
671 word_fastmap
['\n'] = 0;
674 /* Return the space of the file, which is no more required. */
676 free (file_contents
.start
);
679 /*-----------------------------------------------------------------------.
680 | Read a file named FILE_NAME, containing one word per line, then |
681 | construct in TABLE a table of WORD descriptors for them. The routine |
682 | swallows the whole file in memory; this is at the expense of space |
683 | needed for newlines, which are useless; however, the reading is fast. |
684 `-----------------------------------------------------------------------*/
687 digest_word_file (const char *file_name
, WORD_TABLE
*table
)
689 BLOCK file_contents
; /* to receive a copy of the file */
690 char *cursor
; /* cursor in file copy */
691 char *word_start
; /* start of the current word */
693 swallow_file_in_memory (file_name
, &file_contents
);
699 /* Read the whole file. */
701 cursor
= file_contents
.start
;
702 while (cursor
< file_contents
.end
)
705 /* Read one line, and save the word it contains. */
708 while (cursor
< file_contents
.end
&& *cursor
!= '\n')
711 /* Record the word in table if it is not empty. */
713 if (cursor
> word_start
)
715 if (table
->length
== table
->alloc
)
717 if ((SIZE_MAX
/ sizeof *table
->start
- 1) / 2 < table
->alloc
)
719 table
->alloc
= table
->alloc
* 2 + 1;
720 table
->start
= xrealloc (table
->start
,
721 table
->alloc
* sizeof *table
->start
);
724 table
->start
[table
->length
].start
= word_start
;
725 table
->start
[table
->length
].size
= cursor
- word_start
;
729 /* This test allows for an incomplete line at end of file. */
731 if (cursor
< file_contents
.end
)
735 /* Finally, sort all the words read. */
737 qsort (table
->start
, table
->length
, sizeof table
->start
[0], compare_words
);
740 /* Keyword recognition and selection. */
742 /*----------------------------------------------------------------------.
743 | For each keyword in the source text, constructs an OCCURS structure. |
744 `----------------------------------------------------------------------*/
747 find_occurs_in_text (size_t file_index
)
749 char *cursor
; /* for scanning the source text */
750 char *scan
; /* for scanning the source text also */
751 char *line_start
; /* start of the current input line */
752 char *line_scan
; /* newlines scanned until this point */
753 int reference_length
; /* length of reference in input mode */
754 WORD possible_key
; /* possible key, to ease searches */
755 OCCURS
*occurs_cursor
; /* current OCCURS under construction */
757 char *context_start
; /* start of left context */
758 char *context_end
; /* end of right context */
759 char *word_start
; /* start of word */
760 char *word_end
; /* end of word */
761 char *next_context_start
; /* next start of left context */
763 const BLOCK
*text_buffer
= &text_buffers
[file_index
];
765 /* reference_length is always used within 'if (input_reference)'.
766 However, GNU C diagnoses that it may be used uninitialized. The
767 following assignment is merely to shut it up. */
769 reference_length
= 0;
771 /* Tracking where lines start is helpful for reference processing. In
772 auto reference mode, this allows counting lines. In input reference
773 mode, this permits finding the beginning of the references.
775 The first line begins with the file; immediately skip this very first
776 reference in input reference mode, to help later reject any word
777 found inside it. Also, unconditionally assigning these variables has
778 the happy effect of shutting up lint. */
780 line_start
= text_buffer
->start
;
781 line_scan
= line_start
;
784 SKIP_NON_WHITE (line_scan
, text_buffer
->end
);
785 reference_length
= line_scan
- line_start
;
786 SKIP_WHITE (line_scan
, text_buffer
->end
);
789 /* Process the whole buffer, one line or one sentence at a time. */
791 for (cursor
= text_buffer
->start
;
792 cursor
< text_buffer
->end
;
793 cursor
= next_context_start
)
796 /* 'context_start' gets initialized before the processing of each
797 line, or once for the whole buffer if no end of line or sentence
798 sequence separator. */
800 context_start
= cursor
;
802 /* If an end of line or end of sentence sequence is defined and
803 non-empty, 'next_context_start' will be recomputed to be the end of
804 each line or sentence, before each one is processed. If no such
805 sequence, then 'next_context_start' is set at the end of the whole
806 buffer, which is then considered to be a single line or sentence.
807 This test also accounts for the case of an incomplete line or
808 sentence at the end of the buffer. */
810 next_context_start
= text_buffer
->end
;
811 if (context_regex
.string
)
812 switch (re_search (&context_regex
.pattern
, cursor
,
813 text_buffer
->end
- cursor
,
814 0, text_buffer
->end
- cursor
, &context_regs
))
823 next_context_start
= cursor
+ context_regs
.end
[0];
827 /* Include the separator into the right context, but not any suffix
828 white space in this separator; this ensures it will be seen in
829 output and will not take more space than necessary. */
831 context_end
= next_context_start
;
832 SKIP_WHITE_BACKWARDS (context_end
, context_start
);
834 /* Read and process a single input line or sentence, one word at a
839 if (word_regex
.string
)
841 /* If a word regexp has been compiled, use it to skip at the
842 beginning of the next word. If there is no such word, exit
846 regoff_t r
= re_search (&word_regex
.pattern
, cursor
,
847 context_end
- cursor
,
848 0, context_end
- cursor
, &word_regs
);
853 word_start
= cursor
+ word_regs
.start
[0];
854 word_end
= cursor
+ word_regs
.end
[0];
858 /* Avoid re_search and use the fastmap to skip to the
859 beginning of the next word. If there is no more word in
860 the buffer, exit the loop. */
864 while (scan
< context_end
865 && !word_fastmap
[to_uchar (*scan
)])
868 if (scan
== context_end
)
873 while (scan
< context_end
874 && word_fastmap
[to_uchar (*scan
)])
880 /* Skip right to the beginning of the found word. */
884 /* Skip any zero length word. Just advance a single position,
885 then go fetch the next word. */
887 if (word_end
== word_start
)
893 /* This is a genuine, non empty word, so save it as a possible
894 key. Then skip over it. Also, maintain the maximum length of
895 all words read so far. It is mandatory to take the maximum
896 length of all words in the file, without considering if they
897 are actually kept or rejected, because backward jumps at output
898 generation time may fall in *any* word. */
900 possible_key
.start
= cursor
;
901 possible_key
.size
= word_end
- word_start
;
902 cursor
+= possible_key
.size
;
904 if (possible_key
.size
> maximum_word_length
)
905 maximum_word_length
= possible_key
.size
;
907 /* In input reference mode, update 'line_start' from its previous
908 value. Count the lines just in case auto reference mode is
909 also selected. If it happens that the word just matched is
910 indeed part of a reference, just ignore it. */
914 while (line_scan
< possible_key
.start
)
915 if (*line_scan
== '\n')
919 line_start
= line_scan
;
920 SKIP_NON_WHITE (line_scan
, text_buffer
->end
);
921 reference_length
= line_scan
- line_start
;
925 if (line_scan
> possible_key
.start
)
929 /* Ignore the word if an 'Ignore words' table exists and if it is
930 part of it. Also ignore the word if an 'Only words' table and
931 if it is *not* part of it.
933 It is allowed that both tables be used at once, even if this
934 may look strange for now. Just ignore a word that would appear
935 in both. If regexps are eventually implemented for these
936 tables, the Ignore table could then reject words that would
937 have been previously accepted by the Only table. */
939 if (ignore_file
&& search_table (&possible_key
, &ignore_table
))
941 if (only_file
&& !search_table (&possible_key
, &only_table
))
944 /* A non-empty word has been found. First of all, ensure
945 proper allocation of the next OCCURS, and make a pointer to
946 where it will be constructed. */
948 if (number_of_occurs
[0] == occurs_alloc
[0])
950 if ((SIZE_MAX
/ sizeof *occurs_table
[0] - 1) / 2
953 occurs_alloc
[0] = occurs_alloc
[0] * 2 + 1;
955 xrealloc (occurs_table
[0],
956 occurs_alloc
[0] * sizeof *occurs_table
[0]);
959 occurs_cursor
= occurs_table
[0] + number_of_occurs
[0];
961 /* Define the reference field, if any. */
966 /* While auto referencing, update 'line_start' from its
967 previous value, counting lines as we go. If input
968 referencing at the same time, 'line_start' has been
969 advanced earlier, and the following loop is never really
972 while (line_scan
< possible_key
.start
)
973 if (*line_scan
== '\n')
977 line_start
= line_scan
;
978 SKIP_NON_WHITE (line_scan
, text_buffer
->end
);
983 occurs_cursor
->reference
= total_line_count
;
985 else if (input_reference
)
988 /* If only input referencing, 'line_start' has been computed
989 earlier to detect the case where the matched word would be part
990 of the reference. The reference position is simply the
991 value of 'line_start'. */
993 occurs_cursor
->reference
994 = (DELTA
) (line_start
- possible_key
.start
);
995 if (reference_length
> reference_max_width
)
996 reference_max_width
= reference_length
;
999 /* Exclude the reference from the context in simple cases. */
1001 if (input_reference
&& line_start
== context_start
)
1003 SKIP_NON_WHITE (context_start
, context_end
);
1004 SKIP_WHITE (context_start
, context_end
);
1007 /* Completes the OCCURS structure. */
1009 occurs_cursor
->key
= possible_key
;
1010 occurs_cursor
->left
= context_start
- possible_key
.start
;
1011 occurs_cursor
->right
= context_end
- possible_key
.start
;
1012 occurs_cursor
->file_index
= file_index
;
1014 number_of_occurs
[0]++;
1019 /* Formatting and actual output - service routines. */
1021 /*-----------------------------------------.
1022 | Prints some NUMBER of spaces on stdout. |
1023 `-----------------------------------------*/
1026 print_spaces (int number
)
1030 for (counter
= number
; counter
> 0; counter
--)
1034 /*-------------------------------------.
1035 | Prints the field provided by FIELD. |
1036 `-------------------------------------*/
1039 print_field (BLOCK field
)
1041 char *cursor
; /* Cursor in field to print */
1042 int base
; /* Base character, without diacritic */
1043 int diacritic
; /* Diacritic code for the character */
1045 /* Whitespace is not really compressed. Instead, each white space
1046 character (tab, vt, ht etc.) is printed as one single space. */
1048 for (cursor
= field
.start
; cursor
< field
.end
; cursor
++)
1050 unsigned char character
= *cursor
;
1051 if (edited_flag
[character
])
1054 /* First check if this is a diacriticized character.
1056 This works only for TeX. I do not know how diacriticized
1057 letters work with 'roff'. Please someone explain it to me! */
1059 diacritic
= todiac (character
);
1060 if (diacritic
!= 0 && output_format
== TEX_FORMAT
)
1062 base
= tobase (character
);
1066 case 1: /* Latin diphthongs */
1070 fputs ("\\oe{}", stdout
);
1074 fputs ("\\OE{}", stdout
);
1078 fputs ("\\ae{}", stdout
);
1082 fputs ("\\AE{}", stdout
);
1090 case 2: /* Acute accent */
1091 printf ("\\'%s%c", (base
== 'i' ? "\\" : ""), base
);
1094 case 3: /* Grave accent */
1095 printf ("\\'%s%c", (base
== 'i' ? "\\" : ""), base
);
1098 case 4: /* Circumflex accent */
1099 printf ("\\^%s%c", (base
== 'i' ? "\\" : ""), base
);
1102 case 5: /* Diaeresis */
1103 printf ("\\\"%s%c", (base
== 'i' ? "\\" : ""), base
);
1106 case 6: /* Tilde accent */
1107 printf ("\\~%s%c", (base
== 'i' ? "\\" : ""), base
);
1110 case 7: /* Cedilla */
1111 printf ("\\c{%c}", base
);
1114 case 8: /* Small circle beneath */
1118 fputs ("\\aa{}", stdout
);
1122 fputs ("\\AA{}", stdout
);
1130 case 9: /* Strike through */
1134 fputs ("\\o{}", stdout
);
1138 fputs ("\\O{}", stdout
);
1149 /* This is not a diacritic character, so handle cases which are
1150 really specific to 'roff' or TeX. All white space processing
1151 is done as the default case of this switch. */
1156 /* In roff output format, double any quote. */
1166 /* In TeX output format, precede these with a backslash. */
1168 putchar (character
);
1173 /* In TeX output format, precede these with a backslash and
1174 force mathematical mode. */
1175 printf ("$\\%c$", character
);
1179 /* In TeX output mode, request production of a backslash. */
1180 fputs ("\\backslash{}", stdout
);
1184 /* Any other flagged character produces a single space. */
1193 /* Formatting and actual output - planning routines. */
1195 /*--------------------------------------------------------------------.
1196 | From information collected from command line options and input file |
1197 | readings, compute and fix some output parameter values. |
1198 `--------------------------------------------------------------------*/
1201 fix_output_parameters (void)
1203 int file_index
; /* index in text input file arrays */
1204 int line_ordinal
; /* line ordinal value for reference */
1205 char ordinal_string
[12]; /* edited line ordinal for reference */
1206 int reference_width
; /* width for the whole reference */
1207 int character
; /* character ordinal */
1208 const char *cursor
; /* cursor in some constant strings */
1210 /* In auto reference mode, the maximum width of this field is
1211 precomputed and subtracted from the overall line width. Add one for
1212 the column which separate the file name from the line number. */
1216 reference_max_width
= 0;
1217 for (file_index
= 0; file_index
< number_input_files
; file_index
++)
1219 line_ordinal
= file_line_count
[file_index
] + 1;
1221 line_ordinal
-= file_line_count
[file_index
- 1];
1222 sprintf (ordinal_string
, "%d", line_ordinal
);
1223 reference_width
= strlen (ordinal_string
);
1224 if (input_file_name
[file_index
])
1225 reference_width
+= strlen (input_file_name
[file_index
]);
1226 if (reference_width
> reference_max_width
)
1227 reference_max_width
= reference_width
;
1229 reference_max_width
++;
1230 reference
.start
= xmalloc ((size_t) reference_max_width
+ 1);
1233 /* If the reference appears to the left of the output line, reserve some
1234 space for it right away, including one gap size. */
1236 if ((auto_reference
|| input_reference
) && !right_reference
)
1237 line_width
-= reference_max_width
+ gap_size
;
1239 /* The output lines, minimally, will contain from left to right a left
1240 context, a gap, and a keyword followed by the right context with no
1241 special intervening gap. Half of the line width is dedicated to the
1242 left context and the gap, the other half is dedicated to the keyword
1243 and the right context; these values are computed once and for all here.
1244 There also are tail and head wrap around fields, used when the keyword
1245 is near the beginning or the end of the line, or when some long word
1246 cannot fit in, but leave place from wrapped around shorter words. The
1247 maximum width of these fields are recomputed separately for each line,
1248 on a case by case basis. It is worth noting that it cannot happen that
1249 both the tail and head fields are used at once. */
1251 half_line_width
= line_width
/ 2;
1252 before_max_width
= half_line_width
- gap_size
;
1253 keyafter_max_width
= half_line_width
;
1255 /* If truncation_string is the empty string, make it NULL to speed up
1256 tests. In this case, truncation_string_length will never get used, so
1257 there is no need to set it. */
1259 if (truncation_string
&& *truncation_string
)
1260 truncation_string_length
= strlen (truncation_string
);
1262 truncation_string
= NULL
;
1267 /* When flagging truncation at the left of the keyword, the
1268 truncation mark goes at the beginning of the before field,
1269 unless there is a head field, in which case the mark goes at the
1270 left of the head field. When flagging truncation at the right
1271 of the keyword, the mark goes at the end of the keyafter field,
1272 unless there is a tail field, in which case the mark goes at the
1273 end of the tail field. Only eight combination cases could arise
1274 for truncation marks:
1277 . One beginning the before field.
1278 . One beginning the head field.
1279 . One ending the keyafter field.
1280 . One ending the tail field.
1281 . One beginning the before field, another ending the keyafter field.
1282 . One ending the tail field, another beginning the before field.
1283 . One ending the keyafter field, another beginning the head field.
1285 So, there is at most two truncation marks, which could appear both
1286 on the left side of the center of the output line, both on the
1287 right side, or one on either side. */
1289 before_max_width
-= 2 * truncation_string_length
;
1290 if (before_max_width
< 0)
1291 before_max_width
= 0;
1292 keyafter_max_width
-= 2 * truncation_string_length
;
1297 /* I never figured out exactly how UNIX' ptx plans the output width
1298 of its various fields. If GNU extensions are disabled, do not
1299 try computing the field widths correctly; instead, use the
1300 following formula, which does not completely imitate UNIX' ptx,
1303 keyafter_max_width
-= 2 * truncation_string_length
+ 1;
1306 /* Compute which characters need special output processing. Initialize
1307 by flagging any white space character. Some systems do not consider
1308 form feed as a space character, but we do. */
1310 for (character
= 0; character
< CHAR_SET_SIZE
; character
++)
1311 edited_flag
[character
] = !! isspace (character
);
1312 edited_flag
['\f'] = 1;
1314 /* Complete the special character flagging according to selected output
1317 switch (output_format
)
1319 case UNKNOWN_FORMAT
:
1320 /* Should never happen. */
1327 /* 'Quote' characters should be doubled. */
1329 edited_flag
['"'] = 1;
1334 /* Various characters need special processing. */
1336 for (cursor
= "$%&#_{}\\"; *cursor
; cursor
++)
1337 edited_flag
[to_uchar (*cursor
)] = 1;
1339 /* Any character with 8th bit set will print to a single space, unless
1340 it is diacriticized. */
1342 for (character
= 0200; character
< CHAR_SET_SIZE
; character
++)
1343 edited_flag
[character
] = todiac (character
) != 0;
1348 /*------------------------------------------------------------------.
1349 | Compute the position and length of all the output fields, given a |
1350 | pointer to some OCCURS. |
1351 `------------------------------------------------------------------*/
1354 define_all_fields (OCCURS
*occurs
)
1356 int tail_max_width
; /* allowable width of tail field */
1357 int head_max_width
; /* allowable width of head field */
1358 char *cursor
; /* running cursor in source text */
1359 char *left_context_start
; /* start of left context */
1360 char *right_context_end
; /* end of right context */
1361 char *left_field_start
; /* conservative start for 'head'/'before' */
1362 const char *file_name
; /* file name for reference */
1363 int line_ordinal
; /* line ordinal for reference */
1364 const char *buffer_start
; /* start of buffered file for this occurs */
1365 const char *buffer_end
; /* end of buffered file for this occurs */
1367 /* Define 'keyafter', start of left context and end of right context.
1368 'keyafter' starts at the saved position for keyword and extend to the
1369 right from the end of the keyword, eating separators or full words, but
1370 not beyond maximum allowed width for 'keyafter' field or limit for the
1371 right context. Suffix spaces will be removed afterwards. */
1373 keyafter
.start
= occurs
->key
.start
;
1374 keyafter
.end
= keyafter
.start
+ occurs
->key
.size
;
1375 left_context_start
= keyafter
.start
+ occurs
->left
;
1376 right_context_end
= keyafter
.start
+ occurs
->right
;
1378 buffer_start
= text_buffers
[occurs
->file_index
].start
;
1379 buffer_end
= text_buffers
[occurs
->file_index
].end
;
1381 cursor
= keyafter
.end
;
1382 while (cursor
< right_context_end
1383 && cursor
<= keyafter
.start
+ keyafter_max_width
)
1385 keyafter
.end
= cursor
;
1386 SKIP_SOMETHING (cursor
, right_context_end
);
1388 if (cursor
<= keyafter
.start
+ keyafter_max_width
)
1389 keyafter
.end
= cursor
;
1391 keyafter_truncation
= truncation_string
&& keyafter
.end
< right_context_end
;
1393 SKIP_WHITE_BACKWARDS (keyafter
.end
, keyafter
.start
);
1395 /* When the left context is wide, it might take some time to catch up from
1396 the left context boundary to the beginning of the 'head' or 'before'
1397 fields. So, in this case, to speed the catchup, we jump back from the
1398 keyword, using some secure distance, possibly falling in the middle of
1399 a word. A secure backward jump would be at least half the maximum
1400 width of a line, plus the size of the longest word met in the whole
1401 input. We conclude this backward jump by a skip forward of at least
1402 one word. In this manner, we should not inadvertently accept only part
1403 of a word. From the reached point, when it will be time to fix the
1404 beginning of 'head' or 'before' fields, we will skip forward words or
1405 delimiters until we get sufficiently near. */
1407 if (-occurs
->left
> half_line_width
+ maximum_word_length
)
1410 = keyafter
.start
- (half_line_width
+ maximum_word_length
);
1411 SKIP_SOMETHING (left_field_start
, keyafter
.start
);
1414 left_field_start
= keyafter
.start
+ occurs
->left
;
1416 /* 'before' certainly ends at the keyword, but not including separating
1417 spaces. It starts after than the saved value for the left context, by
1418 advancing it until it falls inside the maximum allowed width for the
1419 before field. There will be no prefix spaces either. 'before' only
1420 advances by skipping single separators or whole words. */
1422 before
.start
= left_field_start
;
1423 before
.end
= keyafter
.start
;
1424 SKIP_WHITE_BACKWARDS (before
.end
, before
.start
);
1426 while (before
.start
+ before_max_width
< before
.end
)
1427 SKIP_SOMETHING (before
.start
, before
.end
);
1429 if (truncation_string
)
1431 cursor
= before
.start
;
1432 SKIP_WHITE_BACKWARDS (cursor
, buffer_start
);
1433 before_truncation
= cursor
> left_context_start
;
1436 before_truncation
= 0;
1438 SKIP_WHITE (before
.start
, buffer_end
);
1440 /* The tail could not take more columns than what has been left in the
1441 left context field, and a gap is mandatory. It starts after the
1442 right context, and does not contain prefixed spaces. It ends at
1443 the end of line, the end of buffer or when the tail field is full,
1444 whichever comes first. It cannot contain only part of a word, and
1445 has no suffixed spaces. */
1448 = before_max_width
- (before
.end
- before
.start
) - gap_size
;
1450 if (tail_max_width
> 0)
1452 tail
.start
= keyafter
.end
;
1453 SKIP_WHITE (tail
.start
, buffer_end
);
1455 tail
.end
= tail
.start
;
1457 while (cursor
< right_context_end
1458 && cursor
< tail
.start
+ tail_max_width
)
1461 SKIP_SOMETHING (cursor
, right_context_end
);
1464 if (cursor
< tail
.start
+ tail_max_width
)
1467 if (tail
.end
> tail
.start
)
1469 keyafter_truncation
= 0;
1470 tail_truncation
= truncation_string
&& tail
.end
< right_context_end
;
1473 tail_truncation
= 0;
1475 SKIP_WHITE_BACKWARDS (tail
.end
, tail
.start
);
1480 /* No place left for a tail field. */
1484 tail_truncation
= 0;
1487 /* 'head' could not take more columns than what has been left in the right
1488 context field, and a gap is mandatory. It ends before the left
1489 context, and does not contain suffixed spaces. Its pointer is advanced
1490 until the head field has shrunk to its allowed width. It cannot
1491 contain only part of a word, and has no suffixed spaces. */
1494 = keyafter_max_width
- (keyafter
.end
- keyafter
.start
) - gap_size
;
1496 if (head_max_width
> 0)
1498 head
.end
= before
.start
;
1499 SKIP_WHITE_BACKWARDS (head
.end
, buffer_start
);
1501 head
.start
= left_field_start
;
1502 while (head
.start
+ head_max_width
< head
.end
)
1503 SKIP_SOMETHING (head
.start
, head
.end
);
1505 if (head
.end
> head
.start
)
1507 before_truncation
= 0;
1508 head_truncation
= (truncation_string
1509 && head
.start
> left_context_start
);
1512 head_truncation
= 0;
1514 SKIP_WHITE (head
.start
, head
.end
);
1519 /* No place left for a head field. */
1523 head_truncation
= 0;
1529 /* Construct the reference text in preallocated space from the file
1530 name and the line number. Standard input yields an empty file name.
1531 Ensure line numbers are 1 based, even if they are computed 0 based. */
1533 file_name
= input_file_name
[occurs
->file_index
];
1537 line_ordinal
= occurs
->reference
+ 1;
1538 if (occurs
->file_index
> 0)
1539 line_ordinal
-= file_line_count
[occurs
->file_index
- 1];
1541 sprintf (reference
.start
, "%s:%d", file_name
, line_ordinal
);
1542 reference
.end
= reference
.start
+ strlen (reference
.start
);
1544 else if (input_reference
)
1547 /* Reference starts at saved position for reference and extends right
1548 until some white space is met. */
1550 reference
.start
= keyafter
.start
+ (DELTA
) occurs
->reference
;
1551 reference
.end
= reference
.start
;
1552 SKIP_NON_WHITE (reference
.end
, right_context_end
);
1556 /* Formatting and actual output - control routines. */
1558 /*----------------------------------------------------------------------.
1559 | Output the current output fields as one line for 'troff' or 'nroff'. |
1560 `----------------------------------------------------------------------*/
1563 output_one_roff_line (void)
1565 /* Output the 'tail' field. */
1567 printf (".%s \"", macro_name
);
1569 if (tail_truncation
)
1570 fputs (truncation_string
, stdout
);
1573 /* Output the 'before' field. */
1575 fputs (" \"", stdout
);
1576 if (before_truncation
)
1577 fputs (truncation_string
, stdout
);
1578 print_field (before
);
1581 /* Output the 'keyafter' field. */
1583 fputs (" \"", stdout
);
1584 print_field (keyafter
);
1585 if (keyafter_truncation
)
1586 fputs (truncation_string
, stdout
);
1589 /* Output the 'head' field. */
1591 fputs (" \"", stdout
);
1592 if (head_truncation
)
1593 fputs (truncation_string
, stdout
);
1597 /* Conditionally output the 'reference' field. */
1599 if (auto_reference
|| input_reference
)
1601 fputs (" \"", stdout
);
1602 print_field (reference
);
1609 /*---------------------------------------------------------.
1610 | Output the current output fields as one line for 'TeX'. |
1611 `---------------------------------------------------------*/
1614 output_one_tex_line (void)
1616 BLOCK key
; /* key field, isolated */
1617 BLOCK after
; /* after field, isolated */
1618 char *cursor
; /* running cursor in source text */
1620 printf ("\\%s ", macro_name
);
1623 fputs ("}{", stdout
);
1624 print_field (before
);
1625 fputs ("}{", stdout
);
1626 key
.start
= keyafter
.start
;
1627 after
.end
= keyafter
.end
;
1628 cursor
= keyafter
.start
;
1629 SKIP_SOMETHING (cursor
, keyafter
.end
);
1631 after
.start
= cursor
;
1633 fputs ("}{", stdout
);
1634 print_field (after
);
1635 fputs ("}{", stdout
);
1638 if (auto_reference
|| input_reference
)
1641 print_field (reference
);
1647 /*-------------------------------------------------------------------.
1648 | Output the current output fields as one line for a dumb terminal. |
1649 `-------------------------------------------------------------------*/
1652 output_one_dumb_line (void)
1654 if (!right_reference
)
1659 /* Output the 'reference' field, in such a way that GNU emacs
1660 next-error will handle it. The ending colon is taken from the
1661 gap which follows. */
1663 print_field (reference
);
1665 print_spaces (reference_max_width
1667 - (reference
.end
- reference
.start
)
1673 /* Output the 'reference' field and its following gap. */
1675 print_field (reference
);
1676 print_spaces (reference_max_width
1678 - (reference
.end
- reference
.start
));
1682 if (tail
.start
< tail
.end
)
1684 /* Output the 'tail' field. */
1687 if (tail_truncation
)
1688 fputs (truncation_string
, stdout
);
1690 print_spaces (half_line_width
- gap_size
1691 - (before
.end
- before
.start
)
1692 - (before_truncation
? truncation_string_length
: 0)
1693 - (tail
.end
- tail
.start
)
1694 - (tail_truncation
? truncation_string_length
: 0));
1697 print_spaces (half_line_width
- gap_size
1698 - (before
.end
- before
.start
)
1699 - (before_truncation
? truncation_string_length
: 0));
1701 /* Output the 'before' field. */
1703 if (before_truncation
)
1704 fputs (truncation_string
, stdout
);
1705 print_field (before
);
1707 print_spaces (gap_size
);
1709 /* Output the 'keyafter' field. */
1711 print_field (keyafter
);
1712 if (keyafter_truncation
)
1713 fputs (truncation_string
, stdout
);
1715 if (head
.start
< head
.end
)
1717 /* Output the 'head' field. */
1719 print_spaces (half_line_width
1720 - (keyafter
.end
- keyafter
.start
)
1721 - (keyafter_truncation
? truncation_string_length
: 0)
1722 - (head
.end
- head
.start
)
1723 - (head_truncation
? truncation_string_length
: 0));
1724 if (head_truncation
)
1725 fputs (truncation_string
, stdout
);
1730 if ((auto_reference
|| input_reference
) && right_reference
)
1731 print_spaces (half_line_width
1732 - (keyafter
.end
- keyafter
.start
)
1733 - (keyafter_truncation
? truncation_string_length
: 0));
1735 if ((auto_reference
|| input_reference
) && right_reference
)
1737 /* Output the 'reference' field. */
1739 print_spaces (gap_size
);
1740 print_field (reference
);
1746 /*------------------------------------------------------------------------.
1747 | Scan the whole occurs table and, for each entry, output one line in the |
1748 | appropriate format. |
1749 `------------------------------------------------------------------------*/
1752 generate_all_output (void)
1754 size_t occurs_index
; /* index of keyword entry being processed */
1755 OCCURS
*occurs_cursor
; /* current keyword entry being processed */
1757 /* The following assignments are useful to provide default values in case
1758 line contexts or references are not used, in which case these variables
1759 would never be computed. */
1763 tail_truncation
= 0;
1767 head_truncation
= 0;
1769 /* Loop over all keyword occurrences. */
1771 occurs_cursor
= occurs_table
[0];
1773 for (occurs_index
= 0; occurs_index
< number_of_occurs
[0]; occurs_index
++)
1775 /* Compute the exact size of every field and whenever truncation flags
1776 are present or not. */
1778 define_all_fields (occurs_cursor
);
1780 /* Produce one output line according to selected format. */
1782 switch (output_format
)
1784 case UNKNOWN_FORMAT
:
1785 /* Should never happen. */
1788 output_one_dumb_line ();
1792 output_one_roff_line ();
1796 output_one_tex_line ();
1800 /* Advance the cursor into the occurs table. */
1806 /* Option decoding and main program. */
1808 /*------------------------------------------------------.
1809 | Print program identification and options, then exit. |
1810 `------------------------------------------------------*/
1815 if (status
!= EXIT_SUCCESS
)
1820 Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1821 or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1822 program_name
, program_name
);
1824 Output a permuted index, including context, of the words in the input files.\n\
1828 emit_mandatory_arg_note ();
1831 -A, --auto-reference output automatically generated references\n\
1832 -G, --traditional behave more like System V 'ptx'\n\
1835 -F, --flag-truncation=STRING use STRING for flagging line truncations.\n\
1836 The default is '/'\n\
1839 -M, --macro-name=STRING macro name to use instead of 'xx'\n\
1840 -O, --format=roff generate output as roff directives\n\
1841 -R, --right-side-refs put references at right, not counted in -w\n\
1842 -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1843 -T, --format=tex generate output as TeX directives\n\
1846 -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1847 -b, --break-file=FILE word break characters in this FILE\n\
1848 -f, --ignore-case fold lower case to upper case for sorting\n\
1849 -g, --gap-size=NUMBER gap size in columns between output fields\n\
1850 -i, --ignore-file=FILE read ignore word list from FILE\n\
1851 -o, --only-file=FILE read only word list from this FILE\n\
1854 -r, --references first field of each line is a reference\n\
1855 -t, --typeset-mode - not implemented -\n\
1856 -w, --width=NUMBER output width in columns, reference excluded\n\
1858 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
1859 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
1860 emit_ancillary_info (PROGRAM_NAME
);
1865 /*----------------------------------------------------------------------.
1866 | Main program. Decode ARGC arguments passed through the ARGV array of |
1867 | strings, then launch execution. |
1868 `----------------------------------------------------------------------*/
1870 /* Long options equivalences. */
1871 static struct option
const long_options
[] =
1873 {"auto-reference", no_argument
, NULL
, 'A'},
1874 {"break-file", required_argument
, NULL
, 'b'},
1875 {"flag-truncation", required_argument
, NULL
, 'F'},
1876 {"ignore-case", no_argument
, NULL
, 'f'},
1877 {"gap-size", required_argument
, NULL
, 'g'},
1878 {"ignore-file", required_argument
, NULL
, 'i'},
1879 {"macro-name", required_argument
, NULL
, 'M'},
1880 {"only-file", required_argument
, NULL
, 'o'},
1881 {"references", no_argument
, NULL
, 'r'},
1882 {"right-side-refs", no_argument
, NULL
, 'R'},
1883 {"format", required_argument
, NULL
, 10},
1884 {"sentence-regexp", required_argument
, NULL
, 'S'},
1885 {"traditional", no_argument
, NULL
, 'G'},
1886 {"typeset-mode", no_argument
, NULL
, 't'},
1887 {"width", required_argument
, NULL
, 'w'},
1888 {"word-regexp", required_argument
, NULL
, 'W'},
1889 {GETOPT_HELP_OPTION_DECL
},
1890 {GETOPT_VERSION_OPTION_DECL
},
1894 static char const* const format_args
[] =
1899 static enum Format
const format_vals
[] =
1901 ROFF_FORMAT
, TEX_FORMAT
1905 main (int argc
, char **argv
)
1907 int optchar
; /* argument character */
1908 int file_index
; /* index in text input file arrays */
1910 /* Decode program options. */
1912 initialize_main (&argc
, &argv
);
1913 set_program_name (argv
[0]);
1914 setlocale (LC_ALL
, "");
1915 bindtextdomain (PACKAGE
, LOCALEDIR
);
1916 textdomain (PACKAGE
);
1918 atexit (close_stdout
);
1920 #if HAVE_SETCHRCLASS
1924 while (optchar
= getopt_long (argc
, argv
, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1925 long_options
, NULL
),
1931 usage (EXIT_FAILURE
);
1934 gnu_extensions
= false;
1938 break_file
= optarg
;
1947 unsigned long int tmp_ulong
;
1948 if (xstrtoul (optarg
, NULL
, 0, &tmp_ulong
, NULL
) != LONGINT_OK
1949 || ! (0 < tmp_ulong
&& tmp_ulong
<= INT_MAX
))
1950 error (EXIT_FAILURE
, 0, _("invalid gap width: %s"),
1952 gap_size
= tmp_ulong
;
1957 ignore_file
= optarg
;
1965 input_reference
= true;
1969 /* Yet to understand... */
1974 unsigned long int tmp_ulong
;
1975 if (xstrtoul (optarg
, NULL
, 0, &tmp_ulong
, NULL
) != LONGINT_OK
1976 || ! (0 < tmp_ulong
&& tmp_ulong
<= INT_MAX
))
1977 error (EXIT_FAILURE
, 0, _("invalid line width: %s"),
1979 line_width
= tmp_ulong
;
1984 auto_reference
= true;
1988 truncation_string
= copy_unescaped_string (optarg
);
1992 macro_name
= optarg
;
1996 output_format
= ROFF_FORMAT
;
2000 right_reference
= true;
2004 context_regex
.string
= copy_unescaped_string (optarg
);
2008 output_format
= TEX_FORMAT
;
2012 word_regex
.string
= copy_unescaped_string (optarg
);
2013 if (!*word_regex
.string
)
2014 word_regex
.string
= NULL
;
2018 output_format
= XARGMATCH ("--format", optarg
,
2019 format_args
, format_vals
);
2022 case_GETOPT_HELP_CHAR
;
2024 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2028 /* Process remaining arguments. If GNU extensions are enabled, process
2029 all arguments as input parameters. If disabled, accept at most two
2030 arguments, the second of which is an output parameter. */
2035 /* No more argument simply means: read standard input. */
2037 input_file_name
= xmalloc (sizeof *input_file_name
);
2038 file_line_count
= xmalloc (sizeof *file_line_count
);
2039 text_buffers
= xmalloc (sizeof *text_buffers
);
2040 number_input_files
= 1;
2041 input_file_name
[0] = NULL
;
2043 else if (gnu_extensions
)
2045 number_input_files
= argc
- optind
;
2046 input_file_name
= xmalloc (number_input_files
* sizeof *input_file_name
);
2047 file_line_count
= xmalloc (number_input_files
* sizeof *file_line_count
);
2048 text_buffers
= xmalloc (number_input_files
* sizeof *text_buffers
);
2050 for (file_index
= 0; file_index
< number_input_files
; file_index
++)
2052 if (!*argv
[optind
] || STREQ (argv
[optind
], "-"))
2053 input_file_name
[file_index
] = NULL
;
2055 input_file_name
[file_index
] = argv
[optind
];
2062 /* There is one necessary input file. */
2064 number_input_files
= 1;
2065 input_file_name
= xmalloc (sizeof *input_file_name
);
2066 file_line_count
= xmalloc (sizeof *file_line_count
);
2067 text_buffers
= xmalloc (sizeof *text_buffers
);
2068 if (!*argv
[optind
] || STREQ (argv
[optind
], "-"))
2069 input_file_name
[0] = NULL
;
2071 input_file_name
[0] = argv
[optind
];
2074 /* Redirect standard output, only if requested. */
2078 if (! freopen (argv
[optind
], "w", stdout
))
2079 error (EXIT_FAILURE
, errno
, "%s", quotef (argv
[optind
]));
2083 /* Diagnose any other argument as an error. */
2087 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
2088 usage (EXIT_FAILURE
);
2092 /* If the output format has not been explicitly selected, choose dumb
2093 terminal format if GNU extensions are enabled, else 'roff' format. */
2095 if (output_format
== UNKNOWN_FORMAT
)
2096 output_format
= gnu_extensions
? DUMB_FORMAT
: ROFF_FORMAT
;
2098 /* Initialize the main tables. */
2100 initialize_regex ();
2102 /* Read 'Break character' file, if any. */
2105 digest_break_file (break_file
);
2107 /* Read 'Ignore words' file and 'Only words' files, if any. If any of
2108 these files is empty, reset the name of the file to NULL, to avoid
2109 unnecessary calls to search_table. */
2113 digest_word_file (ignore_file
, &ignore_table
);
2114 if (ignore_table
.length
== 0)
2120 digest_word_file (only_file
, &only_table
);
2121 if (only_table
.length
== 0)
2125 /* Prepare to study all the input files. */
2127 number_of_occurs
[0] = 0;
2128 total_line_count
= 0;
2129 maximum_word_length
= 0;
2130 reference_max_width
= 0;
2132 for (file_index
= 0; file_index
< number_input_files
; file_index
++)
2134 BLOCK
*text_buffer
= text_buffers
+ file_index
;
2136 /* Read the file in core, then study it. */
2138 swallow_file_in_memory (input_file_name
[file_index
], text_buffer
);
2139 find_occurs_in_text (file_index
);
2141 /* Maintain for each file how many lines has been read so far when its
2142 end is reached. Incrementing the count first is a simple kludge to
2143 handle a possible incomplete line at end of file. */
2146 file_line_count
[file_index
] = total_line_count
;
2149 /* Do the output process phase. */
2151 sort_found_occurs ();
2152 fix_output_parameters ();
2153 generate_all_output ();
2157 return EXIT_SUCCESS
;