No empty .Rs/.Re
[netbsd-mini2440.git] / gnu / dist / groff / src / utils / indxbib / indxbib.cpp
blob14d27e417ef5610d8e5db8bf8ff6e7a930c551bc
1 /* $NetBSD$ */
3 // -*- C++ -*-
4 /* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004
5 Free Software Foundation, Inc.
6 Written by James Clark (jjc@jclark.com)
8 This file is part of groff.
10 groff is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free
12 Software Foundation; either version 2, or (at your option) any later
13 version.
15 groff is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 for more details.
20 You should have received a copy of the GNU General Public License along
21 with groff; see the file COPYING. If not, write to the Free Software
22 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
24 #include "lib.h"
26 #include <stdlib.h>
27 #include <assert.h>
28 #include <errno.h>
30 #include "posix.h"
31 #include "errarg.h"
32 #include "error.h"
33 #include "stringclass.h"
34 #include "cset.h"
35 #include "cmap.h"
37 #include "defs.h"
38 #include "index.h"
40 #include "nonposix.h"
42 extern "C" const char *Version_string;
44 #define DEFAULT_HASH_TABLE_SIZE 997
45 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
47 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
49 #define MALLOC_OVERHEAD 16
51 #ifdef BLOCK_SIZE
52 #undef BLOCK_SIZE
53 #endif
55 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
56 - sizeof(int)) / sizeof(int));
57 struct block {
58 block *next;
59 int used;
60 int v[BLOCK_SIZE];
62 block(block *p = 0) : next(p), used(0) { }
65 struct block;
67 union table_entry {
68 block *ptr;
69 int count;
72 struct word_list {
73 word_list *next;
74 char *str;
75 int len;
76 word_list(const char *, int, word_list *);
79 table_entry *hash_table;
80 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
81 // We make this the same size as hash_table so we only have to do one
82 // mod per key.
83 static word_list **common_words_table = 0;
84 char *key_buffer;
86 FILE *indxfp;
87 int ntags = 0;
88 string filenames;
89 char *temp_index_file = 0;
91 const char *ignore_fields = "XYZ";
92 const char *common_words_file = COMMON_WORDS_FILE;
93 int n_ignore_words = 100;
94 int truncate_len = 6;
95 int shortest_len = 3;
96 int max_keys_per_item = 100;
98 static void usage(FILE *stream);
99 static void write_hash_table();
100 static void init_hash_table();
101 static void read_common_words_file();
102 static int store_key(char *s, int len);
103 static void possibly_store_key(char *s, int len);
104 static int do_whole_file(const char *filename);
105 static int do_file(const char *filename);
106 static void store_reference(int filename_index, int pos, int len);
107 static void check_integer_arg(char opt, const char *arg, int min, int *res);
108 static void store_filename(const char *);
109 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
110 static char *get_cwd();
112 extern "C" {
113 void cleanup();
114 void catch_fatal_signals();
115 void ignore_fatal_signals();
118 int main(int argc, char **argv)
120 program_name = argv[0];
121 static char stderr_buf[BUFSIZ];
122 setbuf(stderr, stderr_buf);
124 const char *base_name = 0;
125 typedef int (*parser_t)(const char *);
126 parser_t parser = do_file;
127 const char *directory = 0;
128 const char *foption = 0;
129 int opt;
130 static const struct option long_options[] = {
131 { "help", no_argument, 0, CHAR_MAX + 1 },
132 { "version", no_argument, 0, 'v' },
133 { NULL, 0, 0, 0 }
135 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
136 long_options, NULL))
137 != EOF)
138 switch (opt) {
139 case 'c':
140 common_words_file = optarg;
141 break;
142 case 'd':
143 directory = optarg;
144 break;
145 case 'f':
146 foption = optarg;
147 break;
148 case 'h':
149 check_integer_arg('h', optarg, 1, &hash_table_size);
150 if (!is_prime(hash_table_size)) {
151 while (!is_prime(++hash_table_size))
153 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
155 break;
156 case 'i':
157 ignore_fields = optarg;
158 break;
159 case 'k':
160 check_integer_arg('k', optarg, 1, &max_keys_per_item);
161 break;
162 case 'l':
163 check_integer_arg('l', optarg, 0, &shortest_len);
164 break;
165 case 'n':
166 check_integer_arg('n', optarg, 0, &n_ignore_words);
167 break;
168 case 'o':
169 base_name = optarg;
170 break;
171 case 't':
172 check_integer_arg('t', optarg, 1, &truncate_len);
173 break;
174 case 'w':
175 parser = do_whole_file;
176 break;
177 case 'v':
178 printf("GNU indxbib (groff) version %s\n", Version_string);
179 exit(0);
180 break;
181 case CHAR_MAX + 1: // --help
182 usage(stdout);
183 exit(0);
184 break;
185 case '?':
186 usage(stderr);
187 exit(1);
188 break;
189 default:
190 assert(0);
191 break;
193 if (optind >= argc && foption == 0)
194 fatal("no files and no -f option");
195 if (!directory) {
196 char *path = get_cwd();
197 store_filename(path);
198 a_delete path;
200 else
201 store_filename(directory);
202 init_hash_table();
203 store_filename(common_words_file);
204 store_filename(ignore_fields);
205 key_buffer = new char[truncate_len];
206 read_common_words_file();
207 if (!base_name)
208 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
209 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
210 const char *sep = &DIR_SEPS[1];
211 while (*sep) {
212 p1 = strrchr(base_name, *sep);
213 if (p1 && (!p || p1 > p))
214 p = p1;
215 sep++;
217 size_t name_max;
218 if (p) {
219 char *dir = strsave(base_name);
220 dir[p - base_name] = '\0';
221 name_max = file_name_max(dir);
222 a_delete dir;
224 else
225 name_max = file_name_max(".");
226 const char *filename = p ? p + 1 : base_name;
227 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
228 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
229 if (p) {
230 p++;
231 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
232 memcpy(temp_index_file, base_name, p - base_name);
233 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
235 else {
236 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
238 catch_fatal_signals();
239 int fd = mkstemp(temp_index_file);
240 if (fd < 0)
241 fatal("can't create temporary index file: %1", strerror(errno));
242 indxfp = fdopen(fd, FOPEN_WB);
243 if (indxfp == 0)
244 fatal("fdopen failed");
245 if (fseek(indxfp, sizeof(index_header), 0) < 0)
246 fatal("can't seek past index header: %1", strerror(errno));
247 int failed = 0;
248 if (foption) {
249 FILE *fp = stdin;
250 if (strcmp(foption, "-") != 0) {
251 errno = 0;
252 fp = fopen(foption, "r");
253 if (!fp)
254 fatal("can't open `%1': %2", foption, strerror(errno));
256 string path;
257 int lineno = 1;
258 for (;;) {
259 int c;
260 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
261 if (c == '\0')
262 error_with_file_and_line(foption, lineno,
263 "nul character in pathname ignored");
264 else
265 path += c;
267 if (path.length() > 0) {
268 path += '\0';
269 if (!(*parser)(path.contents()))
270 failed = 1;
271 path.clear();
273 if (c == EOF)
274 break;
275 lineno++;
277 if (fp != stdin)
278 fclose(fp);
280 for (int i = optind; i < argc; i++)
281 if (!(*parser)(argv[i]))
282 failed = 1;
283 write_hash_table();
284 if (fclose(indxfp) < 0)
285 fatal("error closing temporary index file: %1", strerror(errno));
286 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
287 strcpy(index_file, base_name);
288 strcat(index_file, INDEX_SUFFIX);
289 #ifdef HAVE_RENAME
290 #ifdef __EMX__
291 if (access(index_file, R_OK) == 0)
292 unlink(index_file);
293 #endif /* __EMX__ */
294 if (rename(temp_index_file, index_file) < 0) {
295 #ifdef __MSDOS__
296 // RENAME could fail on plain MSDOS filesystems because
297 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
298 char *fname = p ? index_file + (p - base_name) : 0;
299 char *dot = 0;
301 // Replace the dot with an underscore and try again.
302 if (fname
303 && (dot = strchr(fname, '.')) != 0
304 && strcmp(dot, INDEX_SUFFIX) != 0)
305 *dot = '_';
306 if (rename(temp_index_file, index_file) < 0)
307 #endif
308 fatal("can't rename temporary index file: %1", strerror(errno));
310 #else /* not HAVE_RENAME */
311 ignore_fatal_signals();
312 if (unlink(index_file) < 0) {
313 if (errno != ENOENT)
314 fatal("can't unlink `%1': %2", index_file, strerror(errno));
316 if (link(temp_index_file, index_file) < 0)
317 fatal("can't link temporary index file: %1", strerror(errno));
318 if (unlink(temp_index_file) < 0)
319 fatal("can't unlink temporary index file: %1", strerror(errno));
320 #endif /* not HAVE_RENAME */
321 temp_index_file = 0;
322 return failed;
325 static void usage(FILE *stream)
327 fprintf(stream,
328 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
329 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
330 program_name);
333 static void check_integer_arg(char opt, const char *arg, int min, int *res)
335 char *ptr;
336 long n = strtol(arg, &ptr, 10);
337 if (n == 0 && ptr == arg)
338 error("argument to -%1 not an integer", opt);
339 else if (n < min)
340 error("argument to -%1 must not be less than %2", opt, min);
341 else {
342 if (n > INT_MAX)
343 error("argument to -%1 greater than maximum integer", opt);
344 else if (*ptr != '\0')
345 error("junk after integer argument to -%1", opt);
346 *res = int(n);
350 static char *get_cwd()
352 char *buf;
353 int size = 12;
355 for (;;) {
356 buf = new char[size];
357 if (getcwd(buf, size))
358 break;
359 if (errno != ERANGE)
360 fatal("cannot get current working directory: %1", strerror(errno));
361 a_delete buf;
362 if (size == INT_MAX)
363 fatal("current working directory longer than INT_MAX");
364 if (size > INT_MAX/2)
365 size = INT_MAX;
366 else
367 size *= 2;
369 return buf;
372 word_list::word_list(const char *s, int n, word_list *p)
373 : next(p), len(n)
375 str = new char[n];
376 memcpy(str, s, n);
379 static void read_common_words_file()
381 if (n_ignore_words <= 0)
382 return;
383 errno = 0;
384 FILE *fp = fopen(common_words_file, "r");
385 if (!fp)
386 fatal("can't open `%1': %2", common_words_file, strerror(errno));
387 common_words_table = new word_list * [hash_table_size];
388 for (int i = 0; i < hash_table_size; i++)
389 common_words_table[i] = 0;
390 int count = 0;
391 int key_len = 0;
392 for (;;) {
393 int c = getc(fp);
394 while (c != EOF && !csalnum(c))
395 c = getc(fp);
396 if (c == EOF)
397 break;
398 do {
399 if (key_len < truncate_len)
400 key_buffer[key_len++] = cmlower(c);
401 c = getc(fp);
402 } while (c != EOF && csalnum(c));
403 if (key_len >= shortest_len) {
404 int h = hash(key_buffer, key_len) % hash_table_size;
405 common_words_table[h] = new word_list(key_buffer, key_len,
406 common_words_table[h]);
408 if (++count >= n_ignore_words)
409 break;
410 key_len = 0;
411 if (c == EOF)
412 break;
414 n_ignore_words = count;
415 fclose(fp);
418 static int do_whole_file(const char *filename)
420 errno = 0;
421 FILE *fp = fopen(filename, "r");
422 if (!fp) {
423 error("can't open `%1': %2", filename, strerror(errno));
424 return 0;
426 int count = 0;
427 int key_len = 0;
428 int c;
429 while ((c = getc(fp)) != EOF) {
430 if (csalnum(c)) {
431 key_len = 1;
432 key_buffer[0] = c;
433 while ((c = getc(fp)) != EOF) {
434 if (!csalnum(c))
435 break;
436 if (key_len < truncate_len)
437 key_buffer[key_len++] = c;
439 if (store_key(key_buffer, key_len)) {
440 if (++count >= max_keys_per_item)
441 break;
443 if (c == EOF)
444 break;
447 store_reference(filenames.length(), 0, 0);
448 store_filename(filename);
449 fclose(fp);
450 return 1;
453 static int do_file(const char *filename)
455 errno = 0;
456 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
457 // byte counts to be consistent with fseek.
458 FILE *fp = fopen(filename, FOPEN_RB);
459 if (fp == 0) {
460 error("can't open `%1': %2", filename, strerror(errno));
461 return 0;
463 int filename_index = filenames.length();
464 store_filename(filename);
466 enum {
467 START, // at the start of the file; also in between references
468 BOL, // in the middle of a reference, at the beginning of the line
469 PERCENT, // seen a percent at the beginning of the line
470 IGNORE, // ignoring a field
471 IGNORE_BOL, // at the beginning of a line ignoring a field
472 KEY, // in the middle of a key
473 DISCARD, // after truncate_len bytes of a key
474 MIDDLE // in between keys
475 } state = START;
477 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
478 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
479 // MIDDLE space_count must be 0.
480 int space_count = 0;
481 int byte_count = 0; // bytes read
482 int key_len = 0;
483 int ref_start = -1; // position of start of current reference
484 for (;;) {
485 int c = getc(fp);
486 if (c == EOF)
487 break;
488 // We opened the file in binary mode, so we need to skip
489 // every CR character before a Newline.
490 if (c == '\r') {
491 int peek = getc(fp);
492 if (peek == '\n') {
493 byte_count++;
494 c = peek;
496 else
497 ungetc(peek, fp);
499 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
500 else if (c == 0x1a) // ^Z means EOF in text files
501 break;
502 #endif
503 byte_count++;
504 switch (state) {
505 case START:
506 if (c == ' ' || c == '\t') {
507 space_count++;
508 break;
510 if (c == '\n') {
511 space_count = 0;
512 break;
514 ref_start = byte_count - space_count - 1;
515 space_count = 0;
516 if (c == '%')
517 state = PERCENT;
518 else if (csalnum(c)) {
519 state = KEY;
520 key_buffer[0] = c;
521 key_len = 1;
523 else
524 state = MIDDLE;
525 break;
526 case BOL:
527 switch (c) {
528 case '%':
529 if (space_count > 0) {
530 space_count = 0;
531 state = MIDDLE;
533 else
534 state = PERCENT;
535 break;
536 case ' ':
537 case '\t':
538 space_count++;
539 break;
540 case '\n':
541 store_reference(filename_index, ref_start,
542 byte_count - 1 - space_count - ref_start);
543 state = START;
544 space_count = 0;
545 break;
546 default:
547 space_count = 0;
548 if (csalnum(c)) {
549 state = KEY;
550 key_buffer[0] = c;
551 key_len = 1;
553 else
554 state = MIDDLE;
556 break;
557 case PERCENT:
558 if (strchr(ignore_fields, c) != 0)
559 state = IGNORE;
560 else if (c == '\n')
561 state = BOL;
562 else
563 state = MIDDLE;
564 break;
565 case IGNORE:
566 if (c == '\n')
567 state = IGNORE_BOL;
568 break;
569 case IGNORE_BOL:
570 switch (c) {
571 case '%':
572 if (space_count > 0) {
573 state = IGNORE;
574 space_count = 0;
576 else
577 state = PERCENT;
578 break;
579 case ' ':
580 case '\t':
581 space_count++;
582 break;
583 case '\n':
584 store_reference(filename_index, ref_start,
585 byte_count - 1 - space_count - ref_start);
586 state = START;
587 space_count = 0;
588 break;
589 default:
590 space_count = 0;
591 state = IGNORE;
593 break;
594 case KEY:
595 if (csalnum(c)) {
596 if (key_len < truncate_len)
597 key_buffer[key_len++] = c;
598 else
599 state = DISCARD;
601 else {
602 possibly_store_key(key_buffer, key_len);
603 key_len = 0;
604 if (c == '\n')
605 state = BOL;
606 else
607 state = MIDDLE;
609 break;
610 case DISCARD:
611 if (!csalnum(c)) {
612 possibly_store_key(key_buffer, key_len);
613 key_len = 0;
614 if (c == '\n')
615 state = BOL;
616 else
617 state = MIDDLE;
619 break;
620 case MIDDLE:
621 if (csalnum(c)) {
622 state = KEY;
623 key_buffer[0] = c;
624 key_len = 1;
626 else if (c == '\n')
627 state = BOL;
628 break;
629 default:
630 assert(0);
633 switch (state) {
634 case START:
635 break;
636 case DISCARD:
637 case KEY:
638 possibly_store_key(key_buffer, key_len);
639 // fall through
640 case BOL:
641 case PERCENT:
642 case IGNORE_BOL:
643 case IGNORE:
644 case MIDDLE:
645 store_reference(filename_index, ref_start,
646 byte_count - ref_start - space_count);
647 break;
648 default:
649 assert(0);
651 fclose(fp);
652 return 1;
655 static void store_reference(int filename_index, int pos, int len)
657 tag t;
658 t.filename_index = filename_index;
659 t.start = pos;
660 t.length = len;
661 fwrite_or_die(&t, sizeof(t), 1, indxfp);
662 ntags++;
665 static void store_filename(const char *fn)
667 filenames += fn;
668 filenames += '\0';
671 static void init_hash_table()
673 hash_table = new table_entry[hash_table_size];
674 for (int i = 0; i < hash_table_size; i++)
675 hash_table[i].ptr = 0;
678 static void possibly_store_key(char *s, int len)
680 static int last_tagno = -1;
681 static int key_count;
682 if (last_tagno != ntags) {
683 last_tagno = ntags;
684 key_count = 0;
686 if (key_count < max_keys_per_item) {
687 if (store_key(s, len))
688 key_count++;
692 static int store_key(char *s, int len)
694 if (len < shortest_len)
695 return 0;
696 int is_number = 1;
697 for (int i = 0; i < len; i++)
698 if (!csdigit(s[i])) {
699 is_number = 0;
700 s[i] = cmlower(s[i]);
702 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
703 return 0;
704 int h = hash(s, len) % hash_table_size;
705 if (common_words_table) {
706 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
707 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
708 return 0;
710 table_entry *pp = hash_table + h;
711 if (!pp->ptr)
712 pp->ptr = new block;
713 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
714 return 1;
715 else if (pp->ptr->used >= BLOCK_SIZE)
716 pp->ptr = new block(pp->ptr);
717 pp->ptr->v[(pp->ptr->used)++] = ntags;
718 return 1;
721 static void write_hash_table()
723 const int minus_one = -1;
724 int li = 0;
725 for (int i = 0; i < hash_table_size; i++) {
726 block *ptr = hash_table[i].ptr;
727 if (!ptr)
728 hash_table[i].count = -1;
729 else {
730 hash_table[i].count = li;
731 block *rev = 0;
732 while (ptr) {
733 block *tem = ptr;
734 ptr = ptr->next;
735 tem->next = rev;
736 rev = tem;
738 while (rev) {
739 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
740 li += rev->used;
741 block *tem = rev;
742 rev = rev->next;
743 delete tem;
745 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
746 li += 1;
749 if (sizeof(table_entry) == sizeof(int))
750 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
751 else {
752 // write it out word by word
753 for (int i = 0; i < hash_table_size; i++)
754 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
756 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
757 if (fseek(indxfp, 0, 0) < 0)
758 fatal("error seeking on index file: %1", strerror(errno));
759 index_header h;
760 h.magic = INDEX_MAGIC;
761 h.version = INDEX_VERSION;
762 h.tags_size = ntags;
763 h.lists_size = li;
764 h.table_size = hash_table_size;
765 h.strings_size = filenames.length();
766 h.truncate = truncate_len;
767 h.shortest = shortest_len;
768 h.common = n_ignore_words;
769 fwrite_or_die(&h, sizeof(h), 1, indxfp);
772 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
774 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
775 fatal("fwrite failed: %1", strerror(errno));
778 void fatal_error_exit()
780 cleanup();
781 exit(3);
784 extern "C" {
786 void cleanup()
788 if (temp_index_file)
789 unlink(temp_index_file);