doc/info/build_index.pl

   1 #!/usr/bin/env perl
   2 # -----------------------------
   3 #  Perl 5.8 or later required
   4 # -----------------------------
   5 require 5.008;
   6
   # The only command-line argument read here is the name of the main info file.
   ($main_info) = @ARGV;

   # PerlIO layer applied to STDOUT and appended to the mode of every
   # info file opened by this script.
   $infofile_encoding = ":utf8";
   binmode(STDOUT, $infofile_encoding);

   # Character searched for to find the start of each node in an info file.
   $unit_separator = "\x1f";

   # Number of index items and of sections emitted; both are checked at the
   # end of the script to warn about an empty index.
   $item_cnt = $section_cnt = 0;
  16
  17 # ------------------------------------------------------------------
  18 # PART 1. BUILD INDEX FOR @DEFFN AND @DEFVR ITEMS
  19 # ------------------------------------------------------------------
  20
  21 # (1.1)  Build index tables.
  22
  23 # (1.1a) Scan the *.info-* files for unit separator characters;
  24 #        those mark the start of each texinfo node.
  25 #        Build a hash table which associates the node name with the filename
  26 #        and byte offset (NOT character offset) of the unit separator.
  27 #
  28 #        Do NOT use the indirect table + tag table (generated by makeinfo),
  29 #        because those tables give character offsets; we want byte offsets.
  30 #        It is easier to construct a byte offset table by hand,
  31 #        rather than attempting to fix up the character offsets.
  32 #        (Which are strange anyway.)
  33
  # Slurp the main info file.  The handle carries the $infofile_encoding
  # layer, so $stuff holds decoded characters and every pos() taken on it
  # counts CHARACTERS.
  if (open(my $main_fh, "<" . $infofile_encoding, $main_info)) {
      read($main_fh, $stuff, -s $main_fh);
      close $main_fh;
  }
  else {
      # Keep going (the script is best-effort and warns about an empty
      # index at the end), but say why nothing will be found.
      warn "build_index.pl: cannot open \"$main_info\": $!\n";
  }

  # check which version of makeinfo produced $main_info
  # for purposes of bug workaround
  ($makeinfo_major_version, $makeinfo_minor_version) =
      $stuff =~ /makeinfo version (\d+)\.(\d+)/;
  # print STDERR "makeinfo version $makeinfo_major_version . $makeinfo_minor_version\n";

  $filename = $main_info;
  push @info_filenames, $filename;
  _scan_node_offsets($filename, $stuff);

  # Lines of the form "<main info>-<n>: <number>" in the main file name the
  # subsidiary files.  The helper above scanned a copy of $stuff, so
  # pos($stuff) is still unset and this search starts at the beginning.
  # \Q...\E makes the file name match literally ('.', '+', ... are not
  # regex operators here).
  while ($stuff =~ m/^(\Q$main_info\E-\d+): (\d+)/cgsm) {
      $filename = $1;
      push @info_filenames, $filename;

      my $subsidiary_text;
      if (open(my $sub_fh, "<" . $infofile_encoding, $filename)) {
          read($sub_fh, $subsidiary_text, -s $sub_fh);
          close $sub_fh;
      }
      else {
          warn "build_index.pl: cannot open \"$filename\": $!\n";
          next;
      }

      _scan_node_offsets($filename, $subsidiary_text);
  }

  # _scan_node_offsets($file, $text)
  #
  # $text is the decoded content of $file.  For each place where a newline
  # is followed by the unit separator, store [$file, offset] in %node_offset
  # under the node name taken from the next "File: ... Node: <name>," header.
  # The offset is the character position of that newline.  When no header
  # follows, the previously seen node name is reused (and its entry is
  # overwritten), exactly as the original inline loops did.
  # Also updates the globals $node_name and $last_node_name.
  sub _scan_node_offsets {
      my ($file, $text) = @_;

      while ($text =~ m/\G.*?(?=\n$unit_separator)/cgsm) {
          my $offset = pos $text;

          if ($text =~ m/^File:.*?Node: (.*?),/csgm) {
              $node_name = $1;
              $last_node_name = $node_name;
          }

          # print ";; IN SEC 1.1a, NODE NAME=$node_name, FILENAME=$file, OFFSET=$offset\n";
          $node_offset{$node_name} = [($file, int($offset))];
      }
  }
  86
  87 # (1.1b) Read the info index, which gives the node name and number of lines offset
  88 #        for each indexed item.
  89
  90 # ASSUME THAT THE INFO INDEX IS THE LAST NODE.
  91 # (GETTING THE NODE NAME FROM THE COMMAND LINE IS PROBLEMATIC.)
  $index_node_name = $last_node_name;

  # Only the file name is needed below; $index_node_offset is fetched but
  # not used in this section.
  ($index_filename, $index_node_offset) = @{$node_offset{$index_node_name}};
  # print ";; IN SEC 1.1b, INDEX NODE NAME=$index_node_name, INDEX FILENAME=$index_filename, INDEX NODE OFFSET=$index_node_offset\n";

  # Slurp the file that holds the index node.
  my $index_text;
  if (open(my $index_fh, "<" . $infofile_encoding, $index_filename)) {
      read($index_fh, $index_text, -s $index_fh);
      close $index_fh;
  }
  else {
      # Best-effort: report the problem and fall through with no index text.
      warn "build_index.pl: cannot open \"$index_filename\": $!\n";
  }

  # Find the header of the index node, then collect every menu entry
  #     * <topic>:   <node name>.   (line <n>)
  # that follows it.  The node name is literal text, so it is quoted with
  # \Q...\E; otherwise a name containing '(', '+', '.', ... would be
  # interpreted as a pattern.
  while ($index_text =~ m/^File:.*?Node: \Q$index_node_name\E/icgsm) {
      while ($index_text =~ m/\G.*?^\* (?!Menu)(\S+|[^:]+):\s+(.*?)\.\s+\(line\s+(\d+)\)/cgsm) {
          my ($topic_name, $topic_node, $lines_offset) = ($1, $2, $3);
          # print ";; IN SEC 1.1b, TOPIC NAME=$topic_name, NODE NAME=$topic_node, LINES OFFSET=$lines_offset\n";
          $topic_locator{$topic_name} = [($topic_node, $lines_offset)];
      }
  }
 111
 112 # (1.2)  Translate node name and number of lines offset into file name and byte offset
 113 #        for each indexed item.
 114 #        Also find the length of each item.
 115
 for my $topic (sort keys %topic_locator) {
     # What the index told us: the node holding the item, and the line
     # offset of the item.
     my ($node_name, $lines_offset) = @{ $topic_locator{$topic} };

     # Where that node begins: file name and offset recorded in section 1.1a.
     my ($filename, $character_offset) = @{ $node_offset{$node_name} };

     my $byte_offset = seek_lines($filename, $character_offset, $lines_offset);

     # Read from the item's first byte up to the end of the file.
     open(FH, "<" . $infofile_encoding, $filename);
     seek(FH, $byte_offset, 0);
     read(FH, $stuff, -s FH);
     close(FH);

     # The item ends at the first of: a blank line followed by " -- ",
     # a newline followed by a digit, or a unit separator.
     my $text_length;
     if ($stuff =~ m/(.*?)(?:\n\n(?= -- )|\n(?=[0-9])|(?=$unit_separator))/cgsm) {
         $text_length = length($1);
     }
     else {
         # None of those follow: the item runs to the end of the file.
         $stuff =~ m/(.*)/cgsm;
         $text_length = length($1);
     }

     # print ";; IN SEC 1.2, KEY=$topic, NODE NAME=$node_name, FILENAME=$filename, BYTE OFFSET=$byte_offset, TEXT LENGTH=$text_length\n";
     $topic_locator{$topic} = [$node_name, $filename, $byte_offset, $text_length];
 }
 137
 138 # (1.3)  Generate Lisp code. The functions in info.lisp expect this stuff.
 139
 print "(in-package :cl-info)\n";

 #        Pairs of the form (<index topic> . (<filename> <byte offset> <length> <node name>))

 print "(let (\n";
 print "(deffn-defvr-pairs '(\n";
 print "; CONTENT: (<INDEX TOPIC> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS> <NODE NAME>))\n";

 foreach my $key (sort keys %topic_locator) {
     $item_cnt++;

     # Entry layout written by section 1.2:
     # [node name, file name, byte offset, length].
     my ($node_name, $file_name, $byte_offset, $nchars) = @{ $topic_locator{$key} };

     # Inside a Lisp string literal both '"' and '\' must be preceded by a
     # backslash.  Escaping only '"' would let the reader swallow a
     # backslash in the text (or end the string in the wrong place).
     (my $sanitized_key  = $key)       =~ s/(["\\])/\\$1/g;
     (my $sanitized_file = $file_name) =~ s/(["\\])/\\$1/g;
     (my $sanitized_node = $node_name) =~ s/(["\\])/\\$1/g;

     # Suspicious entries are reported but still emitted.
     if ($sanitized_key eq '' or $file_name eq '' or $byte_offset < 0 or $nchars < 0 or $node_name eq '') {
         print STDERR "build_index.pl: something seems wrong for key=\"$sanitized_key\"; emit it anyway.\n";
         print STDERR "build_index.pl: sanitized_key=\"$sanitized_key\", file_name=\"$file_name\", byte_offset=$byte_offset, nchars=$nchars, node_name=\"$node_name\"\n";
         print ";; build_index.pl: something seems wrong for this next item\n";
     }

     print "(\"$sanitized_key\" . (\"$sanitized_file\" $byte_offset $nchars \"$sanitized_node\"))\n";
 }

 print "))\n";
 165
 166 # ------------------------------------------------------------------
 167 # PART 2. BUILD INDEX FOR @NODE ITEMS
 168 # ------------------------------------------------------------------
 169
 170 # (2.1)  Search for 'mmm.nnn' at the start of a line,
 171 #        and take each one of those to be the start of a node.
 172 #
 173 #        We could use the node table ($node_offset here), but we don't.
 174
 175 #        (a) The node table indexes nodes which contain only menus.
 176 #            We don't want those because they have no useful text.
 177 #
 178 #        (b) The offset stated in the node table tells the location
 179 #            of the "File: ..." header. We would have to cut off that stuff.
 180 #
 181 #        (c) Offsets computed by makeinfo are character offsets,
 182 #            so we would have to convert those to byte offsets.
 183 #            (But we have to do that anyway, so I guess there's no
 184 #            advantage either way on that point.)
 185
 # Title and running length of the section currently being measured.
 # Declared outside the loops so the values carry over between iterations.
 my ($section_title, $section_length);

 for my $info_file (@info_filenames) {

     open(FH, "<" . $infofile_encoding, $info_file);
     read(FH, $stuff, -s FH);

     # Move to just before the next line that starts with 'mmm.nnn '.
     while ($stuff =~ m/\G(.*?)(?=^\d+\.\d+ .*?\n)/cgsm) {

         # FH was opened with $infofile_encoding, so pos counts
         # CHARACTERS, not bytes.
         my $section_start = pos($stuff);

         # Consume the heading line: the text after the number is the
         # title, and the whole line counts towards the length.
         if ($stuff =~ m/((^\d+\.\d+) (.*?)\n)/cgsm) {
             ($section_title, $section_length) = ($3, length($1));
         }

         # The section text runs up to the next unit separator,
         # or to the end of the file when there is none.
         if ($stuff =~ m/\G(.*?)($unit_separator)/cgsm) {
             $section_length += length($1);
         }
         else {
             $stuff =~ m/\G(.*)/csgm;
             $section_length += length($1);
         }

         $node_locator{$section_title} = [$info_file, $section_start, $section_length];
     }

     close(FH);
 }

 # Translate character offsets to byte offsets: read that many characters
 # through the encoding layer and ask the handle where it now stands.

 for my $title (sort keys %node_locator) {
     my ($info_file, $char_offset) = @{ $node_locator{$title} };

     open(FH, "<" . $infofile_encoding, $info_file);
     read(FH, $stuff, $char_offset);
     $node_locator{$title}[1] = tell(FH);
     close(FH);
 }
 230
 231 # (2.2)  Generate Lisp code.
 232 #
 233 #        Pairs of the form (<node name> . (<filename> <byte offset> <length>))
 234
 print "(section-pairs '(\n";
 print "; CONTENT: (<NODE NAME> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS>))\n";

 foreach my $title (sort keys %node_locator) {
     $section_cnt++;

     # Entry layout: [file name, byte offset, length].
     my ($file, $begin_node_offset, $length) = @{ $node_locator{$title} };

     # Inside a Lisp string literal both '"' and '\' must be preceded by a
     # backslash; escaping only '"' would corrupt text containing '\'.
     (my $sanitized_title = $title) =~ s/(["\\])/\\$1/g;
     (my $sanitized_file  = $file)  =~ s/(["\\])/\\$1/g;

     # Suspicious entries are reported but still emitted.
     if ($sanitized_title eq '' or $file eq '' or $begin_node_offset < 0 or $length < 0) {
         print STDERR "build_index.pl: something seems wrong for title=\"$sanitized_title\"; emit it anyway.\n";
         print STDERR "build_index.pl: sanitized_title=\"$sanitized_title\", filename=\"$file\", begin_node_offset=$begin_node_offset, length=$length\n";
         print ";; build_index.pl: something seems wrong for this next item\n";
     }

     print "(\"$sanitized_title\" . (\"$sanitized_file\" $begin_node_offset $length))\n";
 }

 # Closes the section-pairs list, its binding, and the binding list of the
 # (let ( ... form printed before the first list.
 print ")))\n";

 #        Construct hashtables from the lists given above.

 print "(load-info-hashtables (maxima::maxima-load-pathname-directory) deffn-defvr-pairs section-pairs))\n";
 256
 # (2.3)  Do we have any items or sections?
 #
 #        Warn on STDERR when neither index items nor sections were found.

 print STDERR "WARNING: Empty index. Not sure what's going on.\n"
     unless ($item_cnt + $section_cnt) > 0;
 263
 264 # ------------------------------------------------------------------
 265 # Helper functions
 266 # ------------------------------------------------------------------
 267
 # seek_lines($filename, $character_offset, $lines_offset)
 #
 # Open $filename with the $infofile_encoding layer, skip $character_offset
 # CHARACTERS, then step forward by lines, and return the resulting BYTE
 # position in the file (as reported by tell).
 #
 # Returns -1 when the file cannot be opened.
 #
 # Reads the globals $infofile_encoding and $makeinfo_major_version.
 # Uses a private file handle and a private buffer, so neither the global
 # handle FH nor the global $stuff is disturbed.
 sub seek_lines {
     my ($filename, $character_offset, $lines_offset) = @_;

     open(my $fh, "<" . $infofile_encoding, $filename)
         or return -1;

     # Skip to the start position.  The text itself is not needed; reading
     # through the encoding layer is what turns a character count into a
     # byte position.
     read($fh, my $skipped, $character_offset);

     # MAKEINFO BUG: LINE OFFSET IS LINE NUMBER OF LAST LINE IN FUNCTION DEFINITION
     # (BUT WE NEED THE FIRST LINE OF THE FUNCTION DEFINITION)
     #
     # EXAMPLE. THE PROBLEM IS THAT THE FUNCTION DEFINITION IS BROKEN ACROSS TWO
     # OR MORE LINES (NOT THAT THERE ARE MULTIPLE FUNCTION DEFINITIONS):
     #  -- Function: setup_autoload (<filename>, <function_1>, ...,
     #            <function_n>)
     #
     # BUG IS PRESENT IN MAKEINFO 4.8, NOT PRESENT IN MAKEINFO 5.1

     my $x;
     if ($makeinfo_major_version == 4) {
         # Read one line past the stated offset and remember where the
         # last line starting with " -- " began.
         $x = -1;
         my $x_maybe;

         for (1 .. $lines_offset + 1) {
             $x_maybe = tell $fh;
             my $line = <$fh>;
             # $line is undef once the end of the file has been reached.
             if (defined $line and $line =~ /^ -- \S/) {
                 $x = $x_maybe;
             }
         }

         if ($x == -1) {
             # We didn't encounter any match for "^ -- \S".
             $x = $x_maybe;
         }
     }
     else {
         # VERSION WITHOUT BUG WORKAROUND,
         # FOR MAKEINFO VERSION 5
         for (1 .. $lines_offset) {
             my $skipped_line = <$fh>;
         }
         $x = tell $fh;
     }

     close $fh;
     return $x;
 }