(recheck): Handle a race condition (including <dev,inode>
[coreutils.git] / src / ptx.c
blob20c3c67d20479e7000c1d0d645aa4ac58f8e49c9
1 /* Permuted index for GNU, with keywords in their context.
2 Copyright (C) 1990, 1991, 1993, 1998-1999 Free Software Foundation, Inc.
3 François Pinard <pinard@iro.umontreal.ca>, 1988.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 François Pinard <pinard@iro.umontreal.ca> */
21 #include <config.h>
23 #include <stdio.h>
24 #include <getopt.h>
25 #include <sys/types.h>
26 #include "system.h"
27 #include "argmatch.h"
28 #include "bumpalloc.h"
29 #include "diacrit.h"
30 #include "error.h"
31 #include "regex.h"
/* Official program name, without any installation prefix such as `g'.  */
#define PROGRAM_NAME "ptx"

#define AUTHORS "François Pinard"

/* How many distinct values a byte may take.  */
#define CHAR_SET_SIZE 256

/* Helpers for decoding numeric \-escapes.  ISODIGIT tells whether C is an
   octal digit.  HEXTOBIN and OCTTOBIN convert a digit character to its
   numeric value; neither of them validates its argument.  */
#define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
#define HEXTOBIN(C) \
  ((C) >= 'a' && (C) <= 'f' ? (C) - 'a' + 10 \
   : (C) >= 'A' && (C) <= 'F' ? (C) - 'A' + 10 \
   : (C) - '0')
#define OCTTOBIN(C) ((C) - '0')

/* Debugging the memory allocator: only active when WITH_DMALLOC is set.  */

#if WITH_DMALLOC
# define MALLOC_FUNC_CHECK 1
# include <dmalloc.h>
#endif

/* Global definitions.  */

/* Base two logarithm of the reallocation step used when swallowing non
   regular files (so the step itself is 1 << SWALLOW_REALLOC_LOG).  */
#define SWALLOW_REALLOC_LOG 12

/* Imported from "regex.c".  */
#define Sword 1

/* Name under which this program was run.  */
char *program_name;
65 /* Program options. */
/* Output formats the program knows about.  */
enum Format
{
  /* Output format still unknown.  */
  UNKNOWN_FORMAT,

  /* Output for a dumb terminal.  */
  DUMB_FORMAT,

  /* Output for `troff' or `nroff'.  */
  ROFF_FORMAT,

  /* Output for `TeX' or `LaTeX'.  */
  TEX_FORMAT
};
75 int gnu_extensions = 1; /* trigger all GNU extensions */
76 int auto_reference = 0; /* references are `file_name:line_number:' */
77 int input_reference = 0; /* references at beginning of input lines */
78 int right_reference = 0; /* output references after right context */
79 int line_width = 72; /* output line width in characters */
80 int gap_size = 3; /* number of spaces between output fields */
81 const char *truncation_string = "/";
82 /* string used to mark line truncations */
83 const char *macro_name = "xx"; /* macro name for roff or TeX output */
84 enum Format output_format = UNKNOWN_FORMAT;
85 /* output format */
87 int ignore_case = 0; /* fold lower to upper case for sorting */
88 const char *context_regex_string = NULL;
89 /* raw regex for end of context */
90 const char *word_regex_string = NULL;
91 /* raw regex for a keyword */
92 const char *break_file = NULL; /* name of the `Break characters' file */
93 const char *only_file = NULL; /* name of the `Only words' file */
94 const char *ignore_file = NULL; /* name of the `Ignore words' file */
/* A BLOCK delimits a region of memory of arbitrary size, such as the copy
   of a whole file.  A WORD is something smaller, whose length should fit
   in a short integer.  A WORD_TABLE may contain several WORDs.  */

typedef struct
{
  /* Pointer to the beginning of the region.  */
  char *start;

  /* Pointer to end + 1 of the region.  */
  char *end;
} BLOCK;
typedef struct
{
  /* Pointer to the beginning of the region.  */
  char *start;

  /* Length of the region.  */
  short size;
} WORD;
114 typedef struct
116 WORD *start; /* array of WORDs */
117 size_t length; /* number of entries */
119 WORD_TABLE;
121 /* Pattern description tables. */
123 /* For each character, provide its folded equivalent. */
124 unsigned char folded_chars[CHAR_SET_SIZE];
126 /* For each character, indicate if it is part of a word. */
127 char syntax_table[CHAR_SET_SIZE];
128 char *re_syntax_table = syntax_table;
130 /* Compiled regex for end of context. */
131 struct re_pattern_buffer *context_regex;
133 /* End of context pattern register indices. */
134 struct re_registers context_regs;
136 /* Compiled regex for a keyword. */
137 struct re_pattern_buffer *word_regex;
139 /* Keyword pattern register indices. */
140 struct re_registers word_regs;
142 /* A word characters fastmap is used only when no word regexp has been
143 provided. A word is then made up of a sequence of one or more characters
144 allowed by the fastmap. Contains !0 if character allowed in word. Not
145 only this is faster in most cases, but it simplifies the implementation
146 of the Break files. */
147 char word_fastmap[CHAR_SET_SIZE];
149 /* Maximum length of any word read. */
150 int maximum_word_length;
152 /* Maximum width of any reference used. */
153 int reference_max_width;
155 /* Ignore and Only word tables. */
157 WORD_TABLE ignore_table; /* table of words to ignore */
158 WORD_TABLE only_table; /* table of words to select */
160 #define ALLOC_NEW_WORD(table) \
161 BUMP_ALLOC ((table)->start, (table)->length, 8, WORD)
163 /* Source text table, and scanning macros. */
165 int number_input_files; /* number of text input files */
166 int total_line_count; /* total number of lines seen so far */
167 const char **input_file_name; /* array of text input file names */
168 int *file_line_count; /* array of `total_line_count' values at end */
170 BLOCK text_buffer; /* file to study */
171 char *text_buffer_maxend; /* allocated end of text_buffer */
/* Scanning macros.  Each one advances (or moves back) the pointer lvalue
   CURSOR in place and never moves it past LIMIT (or before START).  They
   expand to a single statement, so they are safe inside unbraced
   `if'/`else' bodies; the caller supplies the trailing semicolon.

   Characters are converted to `unsigned char' before being classified, so
   that bytes with the high bit set are never passed as negative values to
   the character class macros.  */

/* SKIP_NON_WHITE is used only for getting or skipping the reference.  */

#define SKIP_NON_WHITE(cursor, limit) \
  do                                                                    \
    {                                                                   \
      while ((cursor) < (limit)                                         \
             && !ISSPACE ((unsigned char) *(cursor)))                   \
        (cursor)++;                                                     \
    }                                                                   \
  while (0)

#define SKIP_WHITE(cursor, limit) \
  do                                                                    \
    {                                                                   \
      while ((cursor) < (limit)                                         \
             && ISSPACE ((unsigned char) *(cursor)))                    \
        (cursor)++;                                                     \
    }                                                                   \
  while (0)

#define SKIP_WHITE_BACKWARDS(cursor, start) \
  do                                                                    \
    {                                                                   \
      while ((cursor) > (start)                                         \
             && ISSPACE ((unsigned char) (cursor)[-1]))                 \
        (cursor)--;                                                     \
    }                                                                   \
  while (0)

/* Skip one word, or a single character when CURSOR is not on a word.
   The compiled `word_regex' is used when there is one; the test is made
   on the compiled pattern rather than on `word_regex_string', because an
   empty regex string is never compiled and would otherwise lead to
   calling re_match with a null pattern.  A failed or empty match still
   advances by one character so that callers always make progress.  */

#define SKIP_SOMETHING(cursor, limit) \
  do                                                                    \
    {                                                                   \
      if (word_regex)                                                   \
        {                                                               \
          int skip_count_ = re_match (word_regex, (cursor),             \
                                      (limit) - (cursor), 0, NULL);     \
          (cursor) += skip_count_ <= 0 ? 1 : skip_count_;               \
        }                                                               \
      else if (word_fastmap[(unsigned char) *(cursor)])                 \
        while ((cursor) < (limit)                                       \
               && word_fastmap[(unsigned char) *(cursor)])              \
          (cursor)++;                                                   \
      else                                                              \
        (cursor)++;                                                     \
    }                                                                   \
  while (0)
200 /* Occurrences table.
202 The `keyword' pointer provides the central word, which is surrounded
203 by a left context and a right context. The `keyword' and `length'
204 field allow full 8-bit characters keys, even including NULs. At other
205 places in this program, the name `keyafter' refers to the keyword
206 followed by its right context.
208 The left context does not extend, towards the beginning of the file,
209 further than a distance given by the `left' value. This value is
210 relative to the keyword beginning, it is usually negative. This
211 insures that, except for white space, we will never have to backward
212 scan the source text, when it is time to generate the final output
213 lines.
215 The right context, indirectly attainable through the keyword end, does
216 not extend, towards the end of the file, further than a distance given
217 by the `right' value. This value is relative to the keyword
218 beginning, it is usually positive.
220 When automatic references are used, the `reference' value is the
221 overall line number in all input files read so far, in this case, it
222 is of type (int). When input references are used, the `reference'
223 value indicates the distance between the keyword beginning and the
224 start of the reference field, it is of type (DELTA) and usually
225 negative. */
227 typedef short DELTA; /* to hold displacement within one context */
229 typedef struct
231 WORD key; /* description of the keyword */
232 DELTA left; /* distance to left context start */
233 DELTA right; /* distance to right context end */
234 int reference; /* reference descriptor */
236 OCCURS;
238 /* The various OCCURS tables are indexed by the language. But the time
239 being, there is no such multiple language support. */
241 OCCURS *occurs_table[1]; /* all words retained from the read text */
242 size_t number_of_occurs[1]; /* number of used slots in occurs_table */
244 #define ALLOC_NEW_OCCURS(language) \
245 BUMP_ALLOC (occurs_table[language], number_of_occurs[language], 9, OCCURS)
247 /* Communication among output routines. */
249 /* Indicate if special output processing is requested for each character. */
250 char edited_flag[CHAR_SET_SIZE];
252 int half_line_width; /* half of line width, reference excluded */
253 int before_max_width; /* maximum width of before field */
254 int keyafter_max_width; /* maximum width of keyword-and-after field */
255 int truncation_string_length; /* length of string used to flag truncation */
257 /* When context is limited by lines, wraparound may happen on final output:
258 the `head' pointer gives access to some supplementary left context which
259 will be seen at the end of the output line, the `tail' pointer gives
260 access to some supplementary right context which will be seen at the
261 beginning of the output line. */
263 BLOCK tail; /* tail field */
264 int tail_truncation; /* flag truncation after the tail field */
266 BLOCK before; /* before field */
267 int before_truncation; /* flag truncation before the before field */
269 BLOCK keyafter; /* keyword-and-after field */
270 int keyafter_truncation; /* flag truncation after the keyafter field */
272 BLOCK head; /* head field */
273 int head_truncation; /* flag truncation before the head field */
275 BLOCK reference; /* reference field for input reference mode */
277 /* Miscellaneous routines. */
/*------------------------------------------------------------------------.
| Return a newly allocated copy of STRING in which \-escapes have been    |
| replaced by the characters they stand for.  Recognized escapes are      |
| \xhhh, \0ooo, \a, \b, \c, \f, \n, \r, \t and \v.  Any other escape,    |
| including a lone backslash ending STRING, is copied unchanged.          |
`------------------------------------------------------------------------*/

/* Loosely adapted from GNU sh-utils printf.c code.  */

static char *
copy_unescaped_string (const char *string)
{
  char *result;                 /* allocated result */
  char *cursor;                 /* cursor in result */
  int value;                    /* value of \nnn escape */
  int length;                   /* length of \nnn escape */

  /* No escape produces more characters than it consumes, so the result
     never needs more room than the original string.  */

  result = xmalloc (strlen (string) + 1);
  cursor = result;

  while (*string)
    if (*string == '\\')
      {
        string++;
        switch (*string)
          {
          case '\0':            /* lone backslash at end of string */
            /* Keep the backslash, but do not step over the terminating
               NUL: doing so would make the scan continue past the end of
               STRING and overflow RESULT.  */
            *cursor++ = '\\';
            break;

          case 'x':             /* \xhhh escape, 3 chars maximum */
            value = 0;
            for (length = 0, string++;
                 length < 3 && ISXDIGIT ((unsigned char) *string);
                 length++, string++)
              value = value * 16 + HEXTOBIN (*string);
            if (length == 0)
              {
                /* No digit follows: keep `\x' as is.  */
                *cursor++ = '\\';
                *cursor++ = 'x';
              }
            else
              *cursor++ = value;
            break;

          case '0':             /* \0ooo escape, 3 chars maximum */
            value = 0;
            for (length = 0, string++;
                 length < 3 && ISODIGIT (*string);
                 length++, string++)
              value = value * 8 + OCTTOBIN (*string);
            *cursor++ = value;
            break;

          case 'a':             /* alert */
#if __STDC__
            *cursor++ = '\a';
#else
            *cursor++ = 7;
#endif
            string++;
            break;

          case 'b':             /* backspace */
            *cursor++ = '\b';
            string++;
            break;

          case 'c':             /* cancel the rest of the output */
            while (*string)
              string++;
            break;

          case 'f':             /* form feed */
            *cursor++ = '\f';
            string++;
            break;

          case 'n':             /* new line */
            *cursor++ = '\n';
            string++;
            break;

          case 'r':             /* carriage return */
            *cursor++ = '\r';
            string++;
            break;

          case 't':             /* horizontal tab */
            *cursor++ = '\t';
            string++;
            break;

          case 'v':             /* vertical tab */
#if __STDC__
            *cursor++ = '\v';
#else
            *cursor++ = 11;
#endif
            string++;
            break;

          default:              /* unknown escape, copied verbatim */
            *cursor++ = '\\';
            *cursor++ = *string++;
            break;
          }
      }
    else
      *cursor++ = *string++;

  *cursor = '\0';
  return result;
}
387 /*-------------------------------------------------------------------.
388 | Compile the regex represented by STRING, diagnose and abort if any |
389 | error. Returns the compiled regex structure. |
390 `-------------------------------------------------------------------*/
392 static struct re_pattern_buffer *
393 alloc_and_compile_regex (const char *string)
395 struct re_pattern_buffer *pattern; /* newly allocated structure */
396 const char *message; /* error message returned by regex.c */
398 pattern = (struct re_pattern_buffer *)
399 xmalloc (sizeof (struct re_pattern_buffer));
400 memset (pattern, 0, sizeof (struct re_pattern_buffer));
402 pattern->buffer = NULL;
403 pattern->allocated = 0;
404 pattern->translate = ignore_case ? (char *) folded_chars : NULL;
405 pattern->fastmap = (char *) xmalloc ((size_t) CHAR_SET_SIZE);
407 message = re_compile_pattern (string, (int) strlen (string), pattern);
408 if (message)
409 error (EXIT_FAILURE, 0, _("%s (for regexp `%s')"), message, string);
411 /* The fastmap should be compiled before `re_match'. The following
412 call is not mandatory, because `re_search' is always called sooner,
413 and it compiles the fastmap if this has not been done yet. */
415 re_compile_fastmap (pattern);
417 /* Do not waste extra allocated space. */
419 if (pattern->allocated > pattern->used)
421 pattern->buffer
422 = (unsigned char *) xrealloc (pattern->buffer, (size_t) pattern->used);
423 pattern->allocated = pattern->used;
426 return pattern;
429 /*------------------------------------------------------------------------.
430 | This will initialize various tables for pattern match and compiles some |
431 | regexps. |
432 `------------------------------------------------------------------------*/
434 static void
435 initialize_regex (void)
437 int character; /* character value */
439 /* Initialize the regex syntax table. */
441 for (character = 0; character < CHAR_SET_SIZE; character++)
442 syntax_table[character] = ISALPHA (character) ? Sword : 0;
444 /* Initialize the case folding table. */
446 if (ignore_case)
447 for (character = 0; character < CHAR_SET_SIZE; character++)
448 folded_chars[character] = TOUPPER (character);
450 /* Unless the user already provided a description of the end of line or
451 end of sentence sequence, select an end of line sequence to compile.
452 If the user provided an empty definition, thus disabling end of line
453 or sentence feature, make it NULL to speed up tests. If GNU
454 extensions are enabled, use end of sentence like in GNU emacs. If
455 disabled, use end of lines. */
457 if (context_regex_string)
459 if (!*context_regex_string)
460 context_regex_string = NULL;
462 else if (gnu_extensions && !input_reference)
463 context_regex_string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
464 else
465 context_regex_string = "\n";
467 if (context_regex_string)
468 context_regex = alloc_and_compile_regex (context_regex_string);
470 /* If the user has already provided a non-empty regexp to describe
471 words, compile it. Else, unless this has already been done through
472 a user provided Break character file, construct a fastmap of
473 characters that may appear in a word. If GNU extensions enabled,
474 include only letters of the underlying character set. If disabled,
475 include almost everything, even punctuations; stop only on white
476 space. */
478 if (word_regex_string && *word_regex_string)
479 word_regex = alloc_and_compile_regex (word_regex_string);
480 else if (!break_file)
482 if (gnu_extensions)
485 /* Simulate \w+. */
487 for (character = 0; character < CHAR_SET_SIZE; character++)
488 word_fastmap[character] = ISALPHA (character) ? 1 : 0;
490 else
493 /* Simulate [^ \t\n]+. */
495 memset (word_fastmap, 1, CHAR_SET_SIZE);
496 word_fastmap[' '] = 0;
497 word_fastmap['\t'] = 0;
498 word_fastmap['\n'] = 0;
503 /*------------------------------------------------------------------------.
504 | This routine will attempt to swallow a whole file name FILE_NAME into a |
505 | contiguous region of memory and return a description of it into BLOCK. |
506 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
508 | Previously, in some cases, white space compression was attempted while |
509 | inputting text. This was defeating some regexps like default end of |
510 | sentence, which checks for two consecutive spaces. If white space |
511 | compression is ever reinstated, it should be in output routines. |
512 `------------------------------------------------------------------------*/
514 static void
515 swallow_file_in_memory (const char *file_name, BLOCK *block)
517 int file_handle; /* file descriptor number */
518 struct stat stat_block; /* stat block for file */
519 size_t allocated_length; /* allocated length of memory buffer */
520 size_t used_length; /* used length in memory buffer */
521 int read_length; /* number of character gotten on last read */
523 /* As special cases, a file name which is NULL or "-" indicates standard
524 input, which is already opened. In all other cases, open the file from
525 its name. */
527 if (!file_name || !*file_name || strcmp (file_name, "-") == 0)
528 file_handle = fileno (stdin);
529 else
530 if ((file_handle = open (file_name, O_RDONLY)) < 0)
531 error (EXIT_FAILURE, errno, "%s", file_name);
533 /* If the file is a plain, regular file, allocate the memory buffer all at
534 once and swallow the file in one blow. In other cases, read the file
535 repeatedly in smaller chunks until we have it all, reallocating memory
536 once in a while, as we go. */
538 if (fstat (file_handle, &stat_block) < 0)
539 error (EXIT_FAILURE, errno, "%s", file_name);
541 if (S_ISREG (stat_block.st_mode))
543 size_t in_memory_size;
545 block->start = (char *) xmalloc ((size_t) stat_block.st_size);
547 if ((in_memory_size = read (file_handle,
548 block->start, (size_t) stat_block.st_size))
549 != stat_block.st_size)
551 #if MSDOS
552 /* On MSDOS, in memory size may be smaller than the file
553 size, because of end of line conversions. But it can
554 never be smaller than half the file size, because the
555 minimum is when all lines are empty and terminated by
556 CR+LF. */
557 if (in_memory_size != (size_t)-1
558 && in_memory_size >= stat_block.st_size / 2)
559 block->start = (char *) xrealloc (block->start, in_memory_size);
560 else
561 #endif /* not MSDOS */
563 error (EXIT_FAILURE, errno, "%s", file_name);
565 block->end = block->start + in_memory_size;
567 else
569 block->start = (char *) xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG);
570 used_length = 0;
571 allocated_length = (1 << SWALLOW_REALLOC_LOG);
573 while (read_length = read (file_handle,
574 block->start + used_length,
575 allocated_length - used_length),
576 read_length > 0)
578 used_length += read_length;
579 if (used_length == allocated_length)
581 allocated_length += (1 << SWALLOW_REALLOC_LOG);
582 block->start
583 = (char *) xrealloc (block->start, allocated_length);
587 if (read_length < 0)
588 error (EXIT_FAILURE, errno, "%s", file_name);
590 block->end = block->start + used_length;
593 /* Close the file, but only if it was not the standard input. */
595 if (file_handle != fileno (stdin))
596 close (file_handle);
599 /* Sort and search routines. */
601 /*--------------------------------------------------------------------------.
602 | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
603 | Return less than 0 if the first word goes before the second; return |
604 | greater than 0 if the first word goes after the second. |
606 | If a word is indeed a prefix of the other, the shorter should go first. |
607 `--------------------------------------------------------------------------*/
609 static int
610 compare_words (const void *void_first, const void *void_second)
612 #define first ((const WORD *) void_first)
613 #define second ((const WORD *) void_second)
614 int length; /* minimum of two lengths */
615 int counter; /* cursor in words */
616 int value; /* value of comparison */
618 length = first->size < second->size ? first->size : second->size;
620 if (ignore_case)
622 for (counter = 0; counter < length; counter++)
624 value = (folded_chars [(unsigned char) (first->start[counter])]
625 - folded_chars [(unsigned char) (second->start[counter])]);
626 if (value != 0)
627 return value;
630 else
632 for (counter = 0; counter < length; counter++)
634 value = ((unsigned char) first->start[counter]
635 - (unsigned char) second->start[counter]);
636 if (value != 0)
637 return value;
641 return first->size - second->size;
642 #undef first
643 #undef second
646 /*-----------------------------------------------------------------------.
647 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
648 | go first. In case of a tie, preserve the original order through a |
649 | pointer comparison. |
650 `-----------------------------------------------------------------------*/
652 static int
653 compare_occurs (const void *void_first, const void *void_second)
655 #define first ((const OCCURS *) void_first)
656 #define second ((const OCCURS *) void_second)
657 int value;
659 value = compare_words (&first->key, &second->key);
660 return value == 0 ? first->key.start - second->key.start : value;
661 #undef first
662 #undef second
665 /*------------------------------------------------------------.
666 | Return !0 if WORD appears in TABLE. Uses a binary search. |
667 `------------------------------------------------------------*/
669 static int
670 search_table (WORD *word, WORD_TABLE *table)
672 int lowest; /* current lowest possible index */
673 int highest; /* current highest possible index */
674 int middle; /* current middle index */
675 int value; /* value from last comparison */
677 lowest = 0;
678 highest = table->length - 1;
679 while (lowest <= highest)
681 middle = (lowest + highest) / 2;
682 value = compare_words (word, table->start + middle);
683 if (value < 0)
684 highest = middle - 1;
685 else if (value > 0)
686 lowest = middle + 1;
687 else
688 return 1;
690 return 0;
693 /*---------------------------------------------------------------------.
694 | Sort the whole occurs table in memory. Presumably, `qsort' does not |
695 | take intermediate copies or table elements, so the sort will be |
696 | stabilized throughout the comparison routine. |
697 `---------------------------------------------------------------------*/
699 static void
700 sort_found_occurs (void)
703 /* Only one language for the time being. */
705 qsort (occurs_table[0], number_of_occurs[0], sizeof (OCCURS),
706 compare_occurs);
709 /* Parameter files reading routines. */
711 /*----------------------------------------------------------------------.
712 | Read a file named FILE_NAME, containing a set of break characters. |
713 | Build a content to the array word_fastmap in which all characters are |
714 | allowed except those found in the file. Characters may be repeated. |
715 `----------------------------------------------------------------------*/
717 static void
718 digest_break_file (const char *file_name)
720 BLOCK file_contents; /* to receive a copy of the file */
721 char *cursor; /* cursor in file copy */
723 swallow_file_in_memory (file_name, &file_contents);
725 /* Make the fastmap and record the file contents in it. */
727 memset (word_fastmap, 1, CHAR_SET_SIZE);
728 for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
729 word_fastmap[(unsigned char) *cursor] = 0;
731 if (!gnu_extensions)
734 /* If GNU extensions are enabled, the only way to avoid newline as
735 a break character is to write all the break characters in the
736 file with no newline at all, not even at the end of the file.
737 If disabled, spaces, tabs and newlines are always considered as
738 break characters even if not included in the break file. */
740 word_fastmap[' '] = 0;
741 word_fastmap['\t'] = 0;
742 word_fastmap['\n'] = 0;
745 /* Return the space of the file, which is no more required. */
747 free (file_contents.start);
750 /*-----------------------------------------------------------------------.
751 | Read a file named FILE_NAME, containing one word per line, then |
752 | construct in TABLE a table of WORD descriptors for them. The routine |
753 | swallows the whole file in memory; this is at the expense of space |
754 | needed for newlines, which are useless; however, the reading is fast. |
755 `-----------------------------------------------------------------------*/
757 static void
758 digest_word_file (const char *file_name, WORD_TABLE *table)
760 BLOCK file_contents; /* to receive a copy of the file */
761 char *cursor; /* cursor in file copy */
762 char *word_start; /* start of the current word */
764 swallow_file_in_memory (file_name, &file_contents);
766 table->start = NULL;
767 table->length = 0;
769 /* Read the whole file. */
771 cursor = file_contents.start;
772 while (cursor < file_contents.end)
775 /* Read one line, and save the word in contains. */
777 word_start = cursor;
778 while (cursor < file_contents.end && *cursor != '\n')
779 cursor++;
781 /* Record the word in table if it is not empty. */
783 if (cursor > word_start)
785 ALLOC_NEW_WORD (table);
786 table->start[table->length].start = word_start;
787 table->start[table->length].size = cursor - word_start;
788 table->length++;
791 /* This test allows for an incomplete line at end of file. */
793 if (cursor < file_contents.end)
794 cursor++;
797 /* Finally, sort all the words read. */
799 qsort (table->start, table->length, (size_t) sizeof (WORD), compare_words);
802 /* Keyword recognition and selection. */
804 /*----------------------------------------------------------------------.
805 | For each keyword in the source text, constructs an OCCURS structure. |
806 `----------------------------------------------------------------------*/
808 static void
809 find_occurs_in_text (void)
811 char *cursor; /* for scanning the source text */
812 char *scan; /* for scanning the source text also */
813 char *line_start; /* start of the current input line */
814 char *line_scan; /* newlines scanned until this point */
815 int reference_length; /* length of reference in input mode */
816 WORD possible_key; /* possible key, to ease searches */
817 OCCURS *occurs_cursor; /* current OCCURS under construction */
819 char *context_start; /* start of left context */
820 char *context_end; /* end of right context */
821 char *word_start; /* start of word */
822 char *word_end; /* end of word */
823 char *next_context_start; /* next start of left context */
825 /* reference_length is always used within `if (input_reference)'.
826 However, GNU C diagnoses that it may be used uninitialized. The
827 following assignment is merely to shut it up. */
829 reference_length = 0;
831 /* Tracking where lines start is helpful for reference processing. In
832 auto reference mode, this allows counting lines. In input reference
833 mode, this permits finding the beginning of the references.
835 The first line begins with the file, skip immediately this very first
836 reference in input reference mode, to help further rejection any word
837 found inside it. Also, unconditionally assigning these variable has
838 the happy effect of shutting up lint. */
840 line_start = text_buffer.start;
841 line_scan = line_start;
842 if (input_reference)
844 SKIP_NON_WHITE (line_scan, text_buffer.end);
845 reference_length = line_scan - line_start;
846 SKIP_WHITE (line_scan, text_buffer.end);
849 /* Process the whole buffer, one line or one sentence at a time. */
851 for (cursor = text_buffer.start;
852 cursor < text_buffer.end;
853 cursor = next_context_start)
856 /* `context_start' gets initialized before the processing of each
857 line, or once for the whole buffer if no end of line or sentence
858 sequence separator. */
860 context_start = cursor;
862 /* If a end of line or end of sentence sequence is defined and
863 non-empty, `next_context_start' will be recomputed to be the end of
864 each line or sentence, before each one is processed. If no such
865 sequence, then `next_context_start' is set at the end of the whole
866 buffer, which is then considered to be a single line or sentence.
867 This test also accounts for the case of an incomplete line or
868 sentence at the end of the buffer. */
870 if (context_regex_string
871 && (re_search (context_regex, cursor, text_buffer.end - cursor,
872 0, text_buffer.end - cursor, &context_regs)
873 >= 0))
874 next_context_start = cursor + context_regs.end[0];
876 else
877 next_context_start = text_buffer.end;
879 /* Include the separator into the right context, but not any suffix
880 white space in this separator; this insures it will be seen in
881 output and will not take more space than necessary. */
883 context_end = next_context_start;
884 SKIP_WHITE_BACKWARDS (context_end, context_start);
886 /* Read and process a single input line or sentence, one word at a
887 time. */
889 while (1)
891 if (word_regex)
893 /* If a word regexp has been compiled, use it to skip at the
894 beginning of the next word. If there is no such word, exit
895 the loop. */
898 if (re_search (word_regex, cursor, context_end - cursor,
899 0, context_end - cursor, &word_regs)
900 < 0)
901 break;
902 word_start = cursor + word_regs.start[0];
903 word_end = cursor + word_regs.end[0];
905 else
907 /* Avoid re_search and use the fastmap to skip to the
908 beginning of the next word. If there is no more word in
909 the buffer, exit the loop. */
912 scan = cursor;
913 while (scan < context_end
914 && !word_fastmap[(unsigned char) *scan])
915 scan++;
917 if (scan == context_end)
918 break;
920 word_start = scan;
922 while (scan < context_end
923 && word_fastmap[(unsigned char) *scan])
924 scan++;
926 word_end = scan;
929 /* Skip right to the beginning of the found word. */
931 cursor = word_start;
933 /* Skip any zero length word. Just advance a single position,
934 then go fetch the next word. */
936 if (word_end == word_start)
938 cursor++;
939 continue;
942 /* This is a genuine, non empty word, so save it as a possible
943 key. Then skip over it. Also, maintain the maximum length of
944 all words read so far. It is mandatory to take the maximum
945 length of all words in the file, without considering if they
946 are actually kept or rejected, because backward jumps at output
947 generation time may fall in *any* word. */
949 possible_key.start = cursor;
950 possible_key.size = word_end - word_start;
951 cursor += possible_key.size;
953 if (possible_key.size > maximum_word_length)
954 maximum_word_length = possible_key.size;
956 /* In input reference mode, update `line_start' from its previous
957 value. Count the lines just in case auto reference mode is
958 also selected. If it happens that the word just matched is
959 indeed part of a reference; just ignore it. */
961 if (input_reference)
963 while (line_scan < possible_key.start)
964 if (*line_scan == '\n')
966 total_line_count++;
967 line_scan++;
968 line_start = line_scan;
969 SKIP_NON_WHITE (line_scan, text_buffer.end);
970 reference_length = line_scan - line_start;
972 else
973 line_scan++;
974 if (line_scan > possible_key.start)
975 continue;
978 /* Ignore the word if an `Ignore words' table exists and if it is
979 part of it. Also ignore the word if an `Only words' table and
980 if it is *not* part of it.
982 It is allowed that both tables be used at once, even if this
983 may look strange for now. Just ignore a word that would appear
984 in both. If regexps are eventually implemented for these
985 tables, the Ignore table could then reject words that would
986 have been previously accepted by the Only table. */
988 if (ignore_file && search_table (&possible_key, &ignore_table))
989 continue;
990 if (only_file && !search_table (&possible_key, &only_table))
991 continue;
993 /* A non-empty word has been found. First of all, insure
994 proper allocation of the next OCCURS, and make a pointer to
995 where it will be constructed. */
997 ALLOC_NEW_OCCURS (0);
998 occurs_cursor = occurs_table[0] + number_of_occurs[0];
1000 /* Define the reference field, if any. */
1002 if (auto_reference)
1005 /* While auto referencing, update `line_start' from its
1006 previous value, counting lines as we go. If input
1007 referencing at the same time, `line_start' has been
1008 advanced earlier, and the following loop is never really
1009 executed. */
1011 while (line_scan < possible_key.start)
1012 if (*line_scan == '\n')
1014 total_line_count++;
1015 line_scan++;
1016 line_start = line_scan;
1017 SKIP_NON_WHITE (line_scan, text_buffer.end);
1019 else
1020 line_scan++;
1022 occurs_cursor->reference = total_line_count;
1024 else if (input_reference)
1027 /* If only input referencing, `line_start' has been computed
1028 earlier to detect the case the word matched would be part
1029 of the reference. The reference position is simply the
1030 value of `line_start'. */
1032 occurs_cursor->reference
1033 = (DELTA) (line_start - possible_key.start);
1034 if (reference_length > reference_max_width)
1035 reference_max_width = reference_length;
1038 /* Exclude the reference from the context in simple cases. */
1040 if (input_reference && line_start == context_start)
1042 SKIP_NON_WHITE (context_start, context_end);
1043 SKIP_WHITE (context_start, context_end);
1046 /* Completes the OCCURS structure. */
1048 occurs_cursor->key = possible_key;
1049 occurs_cursor->left = context_start - possible_key.start;
1050 occurs_cursor->right = context_end - possible_key.start;
1052 number_of_occurs[0]++;
1057 /* Formatting and actual output - service routines. */
/*---------------------------------------------.
| Write NUMBER spaces on stdout; a NUMBER that |
| is zero or negative writes nothing.          |
`---------------------------------------------*/

static void
print_spaces (int number)
{
  int remaining = number;	/* spaces still to be written */

  while (remaining > 0)
    {
      putchar (' ');
      remaining--;
    }
}
1072 /*-------------------------------------.
1073 | Prints the field provided by FIELD. |
1074 `-------------------------------------*/
1076 static void
1077 print_field (BLOCK field)
1079 char *cursor; /* Cursor in field to print */
1080 int character; /* Current character */
1081 int base; /* Base character, without diacritic */
1082 int diacritic; /* Diacritic code for the character */
1084 /* Whitespace is not really compressed. Instead, each white space
1085 character (tab, vt, ht etc.) is printed as one single space. */
1087 for (cursor = field.start; cursor < field.end; cursor++)
1089 character = (unsigned char) *cursor;
1090 if (edited_flag[character])
1093 /* First check if this is a diacriticized character.
1095 This works only for TeX. I do not know how diacriticized
1096 letters work with `roff'. Please someone explain it to me! */
1098 diacritic = todiac (character);
1099 if (diacritic != 0 && output_format == TEX_FORMAT)
1101 base = tobase (character);
1102 switch (diacritic)
1105 case 1: /* Latin diphthongs */
1106 switch (base)
1108 case 'o':
1109 fputs ("\\oe{}", stdout);
1110 break;
1112 case 'O':
1113 fputs ("\\OE{}", stdout);
1114 break;
1116 case 'a':
1117 fputs ("\\ae{}", stdout);
1118 break;
1120 case 'A':
1121 fputs ("\\AE{}", stdout);
1122 break;
1124 default:
1125 putchar (' ');
1127 break;
1129 case 2: /* Acute accent */
1130 printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1131 break;
1133 case 3: /* Grave accent */
1134 printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
1135 break;
1137 case 4: /* Circumflex accent */
1138 printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1139 break;
1141 case 5: /* Diaeresis */
1142 printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1143 break;
1145 case 6: /* Tilde accent */
1146 printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1147 break;
1149 case 7: /* Cedilla */
1150 printf ("\\c{%c}", base);
1151 break;
1153 case 8: /* Small circle beneath */
1154 switch (base)
1156 case 'a':
1157 fputs ("\\aa{}", stdout);
1158 break;
1160 case 'A':
1161 fputs ("\\AA{}", stdout);
1162 break;
1164 default:
1165 putchar (' ');
1167 break;
1169 case 9: /* Strike through */
1170 switch (base)
1172 case 'o':
1173 fputs ("\\o{}", stdout);
1174 break;
1176 case 'O':
1177 fputs ("\\O{}", stdout);
1178 break;
1180 default:
1181 putchar (' ');
1183 break;
1186 else
1188 /* This is not a diacritic character, so handle cases which are
1189 really specific to `roff' or TeX. All white space processing
1190 is done as the default case of this switch. */
1192 switch (character)
1194 case '"':
1195 /* In roff output format, double any quote. */
1196 putchar ('"');
1197 putchar ('"');
1198 break;
1200 case '$':
1201 case '%':
1202 case '&':
1203 case '#':
1204 case '_':
1205 /* In TeX output format, precede these with a backslash. */
1206 putchar ('\\');
1207 putchar (character);
1208 break;
1210 case '{':
1211 case '}':
1212 /* In TeX output format, precede these with a backslash and
1213 force mathematical mode. */
1214 printf ("$\\%c$", character);
1215 break;
1217 case '\\':
1218 /* In TeX output mode, request production of a backslash. */
1219 fputs ("\\backslash{}", stdout);
1220 break;
1222 default:
1223 /* Any other flagged character produces a single space. */
1224 putchar (' ');
1227 else
1228 putchar (*cursor);
1232 /* Formatting and actual output - planning routines. */
1234 /*--------------------------------------------------------------------.
1235 | From information collected from command line options and input file |
1236 | readings, compute and fix some output parameter values. |
1237 `--------------------------------------------------------------------*/
1239 static void
1240 fix_output_parameters (void)
1242 int file_index; /* index in text input file arrays */
1243 int line_ordinal; /* line ordinal value for reference */
1244 char ordinal_string[12]; /* edited line ordinal for reference */
1245 int reference_width; /* width for the whole reference */
1246 int character; /* character ordinal */
1247 const char *cursor; /* cursor in some constant strings */
1249 /* In auto reference mode, the maximum width of this field is
1250 precomputed and subtracted from the overall line width. Add one for
1251 the column which separate the file name from the line number. */
1253 if (auto_reference)
1255 reference_max_width = 0;
1256 for (file_index = 0; file_index < number_input_files; file_index++)
1258 line_ordinal = file_line_count[file_index] + 1;
1259 if (file_index > 0)
1260 line_ordinal -= file_line_count[file_index - 1];
1261 sprintf (ordinal_string, "%d", line_ordinal);
1262 reference_width = strlen (ordinal_string);
1263 if (input_file_name[file_index])
1264 reference_width += strlen (input_file_name[file_index]);
1265 if (reference_width > reference_max_width)
1266 reference_max_width = reference_width;
1268 reference_max_width++;
1269 reference.start = (char *) xmalloc ((size_t) reference_max_width + 1);
1272 /* If the reference appears to the left of the output line, reserve some
1273 space for it right away, including one gap size. */
1275 if ((auto_reference || input_reference) && !right_reference)
1276 line_width -= reference_max_width + gap_size;
1278 /* The output lines, minimally, will contain from left to right a left
1279 context, a gap, and a keyword followed by the right context with no
1280 special intervening gap. Half of the line width is dedicated to the
1281 left context and the gap, the other half is dedicated to the keyword
1282 and the right context; these values are computed once and for all here.
1283 There also are tail and head wrap around fields, used when the keyword
1284 is near the beginning or the end of the line, or when some long word
1285 cannot fit in, but leave place from wrapped around shorter words. The
1286 maximum width of these fields are recomputed separately for each line,
1287 on a case by case basis. It is worth noting that it cannot happen that
1288 both the tail and head fields are used at once. */
1290 half_line_width = line_width / 2;
1291 before_max_width = half_line_width - gap_size;
1292 keyafter_max_width = half_line_width;
1294 /* If truncation_string is the empty string, make it NULL to speed up
1295 tests. In this case, truncation_string_length will never get used, so
1296 there is no need to set it. */
1298 if (truncation_string && *truncation_string)
1299 truncation_string_length = strlen (truncation_string);
1300 else
1301 truncation_string = NULL;
1303 if (gnu_extensions)
1306 /* When flagging truncation at the left of the keyword, the
1307 truncation mark goes at the beginning of the before field,
1308 unless there is a head field, in which case the mark goes at the
1309 left of the head field. When flagging truncation at the right
1310 of the keyword, the mark goes at the end of the keyafter field,
1311 unless there is a tail field, in which case the mark goes at the
1312 end of the tail field. Only eight combination cases could arise
1313 for truncation marks:
1315 . None.
1316 . One beginning the before field.
1317 . One beginning the head field.
1318 . One ending the keyafter field.
1319 . One ending the tail field.
1320 . One beginning the before field, another ending the keyafter field.
1321 . One ending the tail field, another beginning the before field.
1322 . One ending the keyafter field, another beginning the head field.
1324 So, there is at most two truncation marks, which could appear both
1325 on the left side of the center of the output line, both on the
1326 right side, or one on either side. */
1328 before_max_width -= 2 * truncation_string_length;
1329 keyafter_max_width -= 2 * truncation_string_length;
1331 else
1334 /* I never figured out exactly how UNIX' ptx plans the output width
1335 of its various fields. If GNU extensions are disabled, do not
1336 try computing the field widths correctly; instead, use the
1337 following formula, which does not completely imitate UNIX' ptx,
1338 but almost. */
1340 keyafter_max_width -= 2 * truncation_string_length + 1;
1343 /* Compute which characters need special output processing. Initialize
1344 by flagging any white space character. Some systems do not consider
1345 form feed as a space character, but we do. */
1347 for (character = 0; character < CHAR_SET_SIZE; character++)
1348 edited_flag[character] = ISSPACE (character) != 0;
1349 edited_flag['\f'] = 1;
1351 /* Complete the special character flagging according to selected output
1352 format. */
1354 switch (output_format)
1356 case UNKNOWN_FORMAT:
1357 /* Should never happen. */
1359 case DUMB_FORMAT:
1360 break;
1362 case ROFF_FORMAT:
1364 /* `Quote' characters should be doubled. */
1366 edited_flag['"'] = 1;
1367 break;
1369 case TEX_FORMAT:
1371 /* Various characters need special processing. */
1373 for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1374 edited_flag[(unsigned char) *cursor] = 1;
1376 /* Any character with 8th bit set will print to a single space, unless
1377 it is diacriticized. */
1379 for (character = 0200; character < CHAR_SET_SIZE; character++)
1380 edited_flag[character] = todiac (character) != 0;
1381 break;
1385 /*------------------------------------------------------------------.
1386 | Compute the position and length of all the output fields, given a |
1387 | pointer to some OCCURS. |
1388 `------------------------------------------------------------------*/
1390 static void
1391 define_all_fields (OCCURS *occurs)
1393 int tail_max_width; /* allowable width of tail field */
1394 int head_max_width; /* allowable width of head field */
1395 char *cursor; /* running cursor in source text */
1396 char *left_context_start; /* start of left context */
1397 char *right_context_end; /* end of right context */
1398 char *left_field_start; /* conservative start for `head'/`before' */
1399 int file_index; /* index in text input file arrays */
1400 const char *file_name; /* file name for reference */
1401 int line_ordinal; /* line ordinal for reference */
1403 /* Define `keyafter', start of left context and end of right context.
1404 `keyafter' starts at the saved position for keyword and extend to the
1405 right from the end of the keyword, eating separators or full words, but
1406 not beyond maximum allowed width for `keyafter' field or limit for the
1407 right context. Suffix spaces will be removed afterwards. */
1409 keyafter.start = occurs->key.start;
1410 keyafter.end = keyafter.start + occurs->key.size;
1411 left_context_start = keyafter.start + occurs->left;
1412 right_context_end = keyafter.start + occurs->right;
1414 cursor = keyafter.end;
1415 while (cursor < right_context_end
1416 && cursor <= keyafter.start + keyafter_max_width)
1418 keyafter.end = cursor;
1419 SKIP_SOMETHING (cursor, right_context_end);
1421 if (cursor <= keyafter.start + keyafter_max_width)
1422 keyafter.end = cursor;
1424 keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1426 SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1428 /* When the left context is wide, it might take some time to catch up from
1429 the left context boundary to the beginning of the `head' or `before'
1430 fields. So, in this case, to speed the catchup, we jump back from the
1431 keyword, using some secure distance, possibly falling in the middle of
1432 a word. A secure backward jump would be at least half the maximum
1433 width of a line, plus the size of the longest word met in the whole
1434 input. We conclude this backward jump by a skip forward of at least
1435 one word. In this manner, we should not inadvertently accept only part
1436 of a word. From the reached point, when it will be time to fix the
1437 beginning of `head' or `before' fields, we will skip forward words or
1438 delimiters until we get sufficiently near. */
1440 if (-occurs->left > half_line_width + maximum_word_length)
1442 left_field_start
1443 = keyafter.start - (half_line_width + maximum_word_length);
1444 SKIP_SOMETHING (left_field_start, keyafter.start);
1446 else
1447 left_field_start = keyafter.start + occurs->left;
1449 /* `before' certainly ends at the keyword, but not including separating
1450 spaces. It starts after than the saved value for the left context, by
1451 advancing it until it falls inside the maximum allowed width for the
1452 before field. There will be no prefix spaces either. `before' only
1453 advances by skipping single separators or whole words. */
1455 before.start = left_field_start;
1456 before.end = keyafter.start;
1457 SKIP_WHITE_BACKWARDS (before.end, before.start);
1459 while (before.start + before_max_width < before.end)
1460 SKIP_SOMETHING (before.start, before.end);
1462 if (truncation_string)
1464 cursor = before.start;
1465 SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1466 before_truncation = cursor > left_context_start;
1468 else
1469 before_truncation = 0;
1471 SKIP_WHITE (before.start, text_buffer.end);
1473 /* The tail could not take more columns than what has been left in the
1474 left context field, and a gap is mandatory. It starts after the
1475 right context, and does not contain prefixed spaces. It ends at
1476 the end of line, the end of buffer or when the tail field is full,
1477 whichever comes first. It cannot contain only part of a word, and
1478 has no suffixed spaces. */
1480 tail_max_width
1481 = before_max_width - (before.end - before.start) - gap_size;
1483 if (tail_max_width > 0)
1485 tail.start = keyafter.end;
1486 SKIP_WHITE (tail.start, text_buffer.end);
1488 tail.end = tail.start;
1489 cursor = tail.end;
1490 while (cursor < right_context_end
1491 && cursor < tail.start + tail_max_width)
1493 tail.end = cursor;
1494 SKIP_SOMETHING (cursor, right_context_end);
1497 if (cursor < tail.start + tail_max_width)
1498 tail.end = cursor;
1500 if (tail.end > tail.start)
1502 keyafter_truncation = 0;
1503 tail_truncation = truncation_string && tail.end < right_context_end;
1505 else
1506 tail_truncation = 0;
1508 SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1510 else
1513 /* No place left for a tail field. */
1515 tail.start = NULL;
1516 tail.end = NULL;
1517 tail_truncation = 0;
1520 /* `head' could not take more columns than what has been left in the right
1521 context field, and a gap is mandatory. It ends before the left
1522 context, and does not contain suffixed spaces. Its pointer is advanced
1523 until the head field has shrunk to its allowed width. It cannot
1524 contain only part of a word, and has no suffixed spaces. */
1526 head_max_width
1527 = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1529 if (head_max_width > 0)
1531 head.end = before.start;
1532 SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1534 head.start = left_field_start;
1535 while (head.start + head_max_width < head.end)
1536 SKIP_SOMETHING (head.start, head.end);
1538 if (head.end > head.start)
1540 before_truncation = 0;
1541 head_truncation = (truncation_string
1542 && head.start > left_context_start);
1544 else
1545 head_truncation = 0;
1547 SKIP_WHITE (head.start, head.end);
1549 else
1552 /* No place left for a head field. */
1554 head.start = NULL;
1555 head.end = NULL;
1556 head_truncation = 0;
1559 if (auto_reference)
1562 /* Construct the reference text in preallocated space from the file
1563 name and the line number. Find out in which file the reference
1564 occurred. Standard input yields an empty file name. Insure line
1565 numbers are one based, even if they are computed zero based. */
1567 file_index = 0;
1568 while (file_line_count[file_index] < occurs->reference)
1569 file_index++;
1571 file_name = input_file_name[file_index];
1572 if (!file_name)
1573 file_name = "";
1575 line_ordinal = occurs->reference + 1;
1576 if (file_index > 0)
1577 line_ordinal -= file_line_count[file_index - 1];
1579 sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1580 reference.end = reference.start + strlen (reference.start);
1582 else if (input_reference)
1585 /* Reference starts at saved position for reference and extends right
1586 until some white space is met. */
1588 reference.start = keyafter.start + (DELTA) occurs->reference;
1589 reference.end = reference.start;
1590 SKIP_NON_WHITE (reference.end, right_context_end);
1594 /* Formatting and actual output - control routines. */
1596 /*----------------------------------------------------------------------.
1597 | Output the current output fields as one line for `troff' or `nroff'. |
1598 `----------------------------------------------------------------------*/
1600 static void
1601 output_one_roff_line (void)
1603 /* Output the `tail' field. */
1605 printf (".%s \"", macro_name);
1606 print_field (tail);
1607 if (tail_truncation)
1608 fputs (truncation_string, stdout);
1609 putchar ('"');
1611 /* Output the `before' field. */
1613 fputs (" \"", stdout);
1614 if (before_truncation)
1615 fputs (truncation_string, stdout);
1616 print_field (before);
1617 putchar ('"');
1619 /* Output the `keyafter' field. */
1621 fputs (" \"", stdout);
1622 print_field (keyafter);
1623 if (keyafter_truncation)
1624 fputs (truncation_string, stdout);
1625 putchar ('"');
1627 /* Output the `head' field. */
1629 fputs (" \"", stdout);
1630 if (head_truncation)
1631 fputs (truncation_string, stdout);
1632 print_field (head);
1633 putchar ('"');
1635 /* Conditionally output the `reference' field. */
1637 if (auto_reference || input_reference)
1639 fputs (" \"", stdout);
1640 print_field (reference);
1641 putchar ('"');
1644 putchar ('\n');
1647 /*---------------------------------------------------------.
1648 | Output the current output fields as one line for `TeX'. |
1649 `---------------------------------------------------------*/
1651 static void
1652 output_one_tex_line (void)
1654 BLOCK key; /* key field, isolated */
1655 BLOCK after; /* after field, isolated */
1656 char *cursor; /* running cursor in source text */
1658 printf ("\\%s ", macro_name);
1659 fputs ("{", stdout);
1660 print_field (tail);
1661 fputs ("}{", stdout);
1662 print_field (before);
1663 fputs ("}{", stdout);
1664 key.start = keyafter.start;
1665 after.end = keyafter.end;
1666 cursor = keyafter.start;
1667 SKIP_SOMETHING (cursor, keyafter.end);
1668 key.end = cursor;
1669 after.start = cursor;
1670 print_field (key);
1671 fputs ("}{", stdout);
1672 print_field (after);
1673 fputs ("}{", stdout);
1674 print_field (head);
1675 fputs ("}", stdout);
1676 if (auto_reference || input_reference)
1678 fputs ("{", stdout);
1679 print_field (reference);
1680 fputs ("}", stdout);
1682 fputs ("\n", stdout);
1685 /*-------------------------------------------------------------------.
1686 | Output the current output fields as one line for a dumb terminal. |
1687 `-------------------------------------------------------------------*/
1689 static void
1690 output_one_dumb_line (void)
1692 if (!right_reference)
1694 if (auto_reference)
1697 /* Output the `reference' field, in such a way that GNU emacs
1698 next-error will handle it. The ending colon is taken from the
1699 gap which follows. */
1701 print_field (reference);
1702 putchar (':');
1703 print_spaces (reference_max_width
1704 + gap_size
1705 - (reference.end - reference.start)
1706 - 1);
1708 else
1711 /* Output the `reference' field and its following gap. */
1713 print_field (reference);
1714 print_spaces (reference_max_width
1715 + gap_size
1716 - (reference.end - reference.start));
1720 if (tail.start < tail.end)
1722 /* Output the `tail' field. */
1724 print_field (tail);
1725 if (tail_truncation)
1726 fputs (truncation_string, stdout);
1728 print_spaces (half_line_width - gap_size
1729 - (before.end - before.start)
1730 - (before_truncation ? truncation_string_length : 0)
1731 - (tail.end - tail.start)
1732 - (tail_truncation ? truncation_string_length : 0));
1734 else
1735 print_spaces (half_line_width - gap_size
1736 - (before.end - before.start)
1737 - (before_truncation ? truncation_string_length : 0));
1739 /* Output the `before' field. */
1741 if (before_truncation)
1742 fputs (truncation_string, stdout);
1743 print_field (before);
1745 print_spaces (gap_size);
1747 /* Output the `keyafter' field. */
1749 print_field (keyafter);
1750 if (keyafter_truncation)
1751 fputs (truncation_string, stdout);
1753 if (head.start < head.end)
1755 /* Output the `head' field. */
1757 print_spaces (half_line_width
1758 - (keyafter.end - keyafter.start)
1759 - (keyafter_truncation ? truncation_string_length : 0)
1760 - (head.end - head.start)
1761 - (head_truncation ? truncation_string_length : 0));
1762 if (head_truncation)
1763 fputs (truncation_string, stdout);
1764 print_field (head);
1766 else
1768 if ((auto_reference || input_reference) && right_reference)
1769 print_spaces (half_line_width
1770 - (keyafter.end - keyafter.start)
1771 - (keyafter_truncation ? truncation_string_length : 0));
1773 if ((auto_reference || input_reference) && right_reference)
1775 /* Output the `reference' field. */
1777 print_spaces (gap_size);
1778 print_field (reference);
1781 fputs ("\n", stdout);
1784 /*------------------------------------------------------------------------.
1785 | Scan the whole occurs table and, for each entry, output one line in the |
1786 | appropriate format. |
1787 `------------------------------------------------------------------------*/
1789 static void
1790 generate_all_output (void)
1792 int occurs_index; /* index of keyword entry being processed */
1793 OCCURS *occurs_cursor; /* current keyword entry being processed */
1795 /* The following assignments are useful to provide default values in case
1796 line contexts or references are not used, in which case these variables
1797 would never be computed. */
1799 tail.start = NULL;
1800 tail.end = NULL;
1801 tail_truncation = 0;
1803 head.start = NULL;
1804 head.end = NULL;
1805 head_truncation = 0;
1807 /* Loop over all keyword occurrences. */
1809 occurs_cursor = occurs_table[0];
1811 for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1813 /* Compute the exact size of every field and whenever truncation flags
1814 are present or not. */
1816 define_all_fields (occurs_cursor);
1818 /* Produce one output line according to selected format. */
1820 switch (output_format)
1822 case UNKNOWN_FORMAT:
1823 /* Should never happen. */
1825 case DUMB_FORMAT:
1826 output_one_dumb_line ();
1827 break;
1829 case ROFF_FORMAT:
1830 output_one_roff_line ();
1831 break;
1833 case TEX_FORMAT:
1834 output_one_tex_line ();
1835 break;
1838 /* Advance the cursor into the occurs table. */
1840 occurs_cursor++;
1844 /* Option decoding and main program. */
1846 /*------------------------------------------------------.
1847 | Print program identification and options, then exit. |
1848 `------------------------------------------------------*/
1850 void
1851 usage (int status)
1853 if (status != EXIT_SUCCESS)
1854 fprintf (stderr, _("Try `%s --help' for more information.\n"),
1855 program_name);
1856 else
1858 printf (_("\
1859 Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1860 or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1861 program_name, program_name);
1862 fputs (_("\
1863 Mandatory arguments to long options are mandatory for short options too.\n\
1865 -A, --auto-reference output automatically generated references\n\
1866 -C, --copyright display Copyright and copying conditions\n\
1867 -G, --traditional behave more like System V `ptx'\n\
1868 -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
1869 -M, --macro-name=STRING macro name to use instead of `xx'\n\
1870 -O, --format=roff generate output as roff directives\n\
1871 -R, --right-side-refs put references at right, not counted in -w\n\
1872 -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1873 -T, --format=tex generate output as TeX directives\n\
1874 -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1875 -b, --break-file=FILE word break characters in this FILE\n\
1876 -f, --ignore-case fold lower case to upper case for sorting\n\
1877 -g, --gap-size=NUMBER gap size in columns between output fields\n\
1878 -i, --ignore-file=FILE read ignore word list from FILE\n\
1879 -o, --only-file=FILE read only word list from this FILE\n\
1880 -r, --references first field of each line is a reference\n\
1881 -t, --typeset-mode - not implemented -\n\
1882 -w, --width=NUMBER output width in columns, reference excluded\n\
1883 --help display this help and exit\n\
1884 --version output version information and exit\n\
1886 With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n"),
1887 stdout);
1889 exit (status);
1892 /*----------------------------------------------------------------------.
1893 | Main program. Decode ARGC arguments passed through the ARGV array of |
1894 | strings, then launch execution. |
1895 `----------------------------------------------------------------------*/
1897 /* Long options equivalences. */
1898 static const struct option long_options[] =
1900 {"auto-reference", no_argument, NULL, 'A'},
1901 {"break-file", required_argument, NULL, 'b'},
1902 {"copyright", no_argument, NULL, 'C'},
1903 {"flag-truncation", required_argument, NULL, 'F'},
1904 {"ignore-case", no_argument, NULL, 'f'},
1905 {"gap-size", required_argument, NULL, 'g'},
1906 {"ignore-file", required_argument, NULL, 'i'},
1907 {"macro-name", required_argument, NULL, 'M'},
1908 {"only-file", required_argument, NULL, 'o'},
1909 {"references", no_argument, NULL, 'r'},
1910 {"right-side-refs", no_argument, NULL, 'R'},
1911 {"format", required_argument, NULL, 10},
1912 {"sentence-regexp", required_argument, NULL, 'S'},
1913 {"traditional", no_argument, NULL, 'G'},
1914 {"typeset-mode", no_argument, NULL, 't'},
1915 {"width", required_argument, NULL, 'w'},
1916 {"word-regexp", required_argument, NULL, 'W'},
1917 {GETOPT_HELP_OPTION_DECL},
1918 {GETOPT_VERSION_OPTION_DECL},
1919 {0, 0, 0, 0},
/* Names accepted as argument of `--format', ended by a null pointer.  The
   order has to stay the same as in format_vals.  */
static char const *const format_args[] =
{
  "roff",
  "tex",
  0
};
1927 static enum Format const format_vals[] =
1929 ROFF_FORMAT, TEX_FORMAT
1933 main (int argc, char **argv)
1935 int optchar; /* argument character */
1936 int file_index; /* index in text input file arrays */
1938 /* Decode program options. */
1940 program_name = argv[0];
1941 setlocale (LC_ALL, "");
1942 bindtextdomain (PACKAGE, LOCALEDIR);
1943 textdomain (PACKAGE);
1945 #if HAVE_SETCHRCLASS
1946 setchrclass (NULL);
1947 #endif
1949 while (optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
1950 long_options, NULL),
1951 optchar != EOF)
1953 switch (optchar)
1955 default:
1956 usage (EXIT_FAILURE);
1958 case 0:
1959 break;
1961 case 'C':
1962 fputs (_("\
1963 This program is free software; you can redistribute it and/or modify\n\
1964 it under the terms of the GNU General Public License as published by\n\
1965 the Free Software Foundation; either version 2, or (at your option)\n\
1966 any later version.\n\
1968 This program is distributed in the hope that it will be useful,\n\
1969 but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
1970 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
1971 GNU General Public License for more details.\n\
1973 You should have received a copy of the GNU General Public License\n\
1974 along with this program; if not, write to the Free Software Foundation,\n\
1975 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.\n"),
1976 stdout);
1978 exit (EXIT_SUCCESS);
1980 case 'G':
1981 gnu_extensions = 0;
1982 break;
1984 case 'b':
1985 break_file = optarg;
1986 break;
1988 case 'f':
1989 ignore_case = 1;
1990 break;
1992 case 'g':
1993 gap_size = atoi (optarg);
1994 break;
1996 case 'i':
1997 ignore_file = optarg;
1998 break;
2000 case 'o':
2001 only_file = optarg;
2002 break;
2004 case 'r':
2005 input_reference = 1;
2006 break;
2008 case 't':
2009 /* Yet to understand... */
2010 break;
2012 case 'w':
2013 line_width = atoi (optarg);
2014 break;
2016 case 'A':
2017 auto_reference = 1;
2018 break;
2020 case 'F':
2021 truncation_string = copy_unescaped_string (optarg);
2022 break;
2024 case 'M':
2025 macro_name = optarg;
2026 break;
2028 case 'O':
2029 output_format = ROFF_FORMAT;
2030 break;
2032 case 'R':
2033 right_reference = 1;
2034 break;
2036 case 'S':
2037 context_regex_string = copy_unescaped_string (optarg);
2038 break;
2040 case 'T':
2041 output_format = TEX_FORMAT;
2042 break;
2044 case 'W':
2045 word_regex_string = copy_unescaped_string (optarg);
2046 break;
2048 case 10:
2049 output_format = XARGMATCH ("--format", optarg,
2050 format_args, format_vals);
2051 case_GETOPT_HELP_CHAR;
2053 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2057 /* Change the default Ignore file if one is defined. */
2059 #ifdef DEFAULT_IGNORE_FILE
2060 if (!ignore_file)
2061 ignore_file = DEFAULT_IGNORE_FILE;
2062 #endif
2064 /* Process remaining arguments. If GNU extensions are enabled, process
2065 all arguments as input parameters. If disabled, accept at most two
2066 arguments, the second of which is an output parameter. */
2068 if (optind == argc)
2071 /* No more argument simply means: read standard input. */
2073 input_file_name = (const char **) xmalloc (sizeof (const char *));
2074 file_line_count = (int *) xmalloc (sizeof (int));
2075 number_input_files = 1;
2076 input_file_name[0] = NULL;
2078 else if (gnu_extensions)
2080 number_input_files = argc - optind;
2081 input_file_name
2082 = (const char **) xmalloc (number_input_files * sizeof (const char *));
2083 file_line_count
2084 = (int *) xmalloc (number_input_files * sizeof (int));
2086 for (file_index = 0; file_index < number_input_files; file_index++)
2088 input_file_name[file_index] = argv[optind];
2089 if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
2090 input_file_name[0] = NULL;
2091 else
2092 input_file_name[0] = argv[optind];
2093 optind++;
2096 else
2099 /* There is one necessary input file. */
2101 number_input_files = 1;
2102 input_file_name = (const char **) xmalloc (sizeof (const char *));
2103 file_line_count = (int *) xmalloc (sizeof (int));
2104 if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
2105 input_file_name[0] = NULL;
2106 else
2107 input_file_name[0] = argv[optind];
2108 optind++;
2110 /* Redirect standard output, only if requested. */
2112 if (optind < argc)
2114 fclose (stdout);
2115 if (fopen (argv[optind], "w") == NULL)
2116 error (EXIT_FAILURE, errno, "%s", argv[optind]);
2117 optind++;
2120 /* Diagnose any other argument as an error. */
2122 if (optind < argc)
2123 usage (EXIT_FAILURE);
2126 /* If the output format has not been explicitly selected, choose dumb
2127 terminal format if GNU extensions are enabled, else `roff' format. */
2129 if (output_format == UNKNOWN_FORMAT)
2130 output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2132 /* Initialize the main tables. */
2134 initialize_regex ();
2136 /* Read `Break character' file, if any. */
2138 if (break_file)
2139 digest_break_file (break_file);
2141 /* Read `Ignore words' file and `Only words' files, if any. If any of
2142 these files is empty, reset the name of the file to NULL, to avoid
2143 unnecessary calls to search_table. */
2145 if (ignore_file)
2147 digest_word_file (ignore_file, &ignore_table);
2148 if (ignore_table.length == 0)
2149 ignore_file = NULL;
2152 if (only_file)
2154 digest_word_file (only_file, &only_table);
2155 if (only_table.length == 0)
2156 only_file = NULL;
2159 /* Prepare to study all the input files. */
2161 number_of_occurs[0] = 0;
2162 total_line_count = 0;
2163 maximum_word_length = 0;
2164 reference_max_width = 0;
2166 for (file_index = 0; file_index < number_input_files; file_index++)
2169 /* Read the file in core, then study it. */
2171 swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2172 find_occurs_in_text ();
2174 /* Maintain for each file how many lines have been read so far when its
2175 end is reached. Incrementing the count first is a simple kludge to
2176 handle a possible incomplete line at end of file. */
2178 total_line_count++;
2179 file_line_count[file_index] = total_line_count;
2182 /* Do the output process phase. */
2184 sort_found_occurs ();
2185 fix_output_parameters ();
2186 generate_all_output ();
2188 /* All done. */
2190 exit (EXIT_SUCCESS);