dictionaries/util/th_gen_idx.pl

   1 :
   2 eval 'exec perl -wS $0 ${1+"$@"}'
   3     if 0;
   4 #*************************************************************************
   5 #
   6 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7 #
   8 # Copyright 2008 by Sun Microsystems, Inc.
   9 #
  10 # OpenOffice.org - a multi-platform office productivity suite
  11 #
  12 # $RCSfile: th_gen_idx.pl,v $
  13 #
  14 # $Revision: 1.5 $
  15 #
  16 # This file is part of OpenOffice.org.
  17 #
  18 # OpenOffice.org is free software: you can redistribute it and/or modify
  19 # it under the terms of the GNU Lesser General Public License version 3
  20 # only, as published by the Free Software Foundation.
  21 #
  22 # OpenOffice.org is distributed in the hope that it will be useful,
  23 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 # GNU Lesser General Public License version 3 for more details
  26 # (a copy is included in the LICENSE file that accompanied this code).
  27 #
  28 # You should have received a copy of the GNU Lesser General Public License
  29 # version 3 along with OpenOffice.org.  If not, see
  30 # <http://www.openoffice.org/license.html>
  31 # for a copy of the LGPLv3 License.
  32 #
  33 #*************************************************************************
  34
  35 sub by_entry {
  36     my ($aent, $aoff) = split('\|',$a);
  37     my ($bent, $boff) = split('\|',$b);
  38     $aent cmp $bent;
  39 }
  40
  41 #FIXME: someone may want "infile" or even parameter parsing
  42 sub get_outfile {
  43     my $next_is_file = 0;
  44     foreach ( @ARGV ) {
  45         if ( $next_is_file ) {
  46             return $_
  47         }
  48         if ( $_ eq "-o" ) {
  49             $next_is_file = 1;
  50         }
  51     }
  52     return "";
  53 }
  54
  55 sub usage {
  56     print "usage:\n";
  57     print "$0 -o outfile < input\n";
  58
  59     exit 99;
  60 }
  61
  62 # main routine
  63 my $ne = 0;       # number of entries in index
  64 my @tindex=();    # the index itself
  65 my $foffset = 0;  # file position offset into thesaurus
  66 my $rec="";       # current string and related pieces
  67 my $rl=0;         # misc string length
  68 my $entry="";     # current word being processed
  69 my $nm=0;         # number of meaning for the current word
  70 my $meaning="";   # current meaning and synonyms
  71 my $p;            # misc uses
  72 my $encoding;     # encoding used by text file
  73 my $outfile = "";
  74
  75 $outfile = get_outfile();
  76 usage() if ( $outfile eq "" );
  77
  78 # top line of thesaurus provides encoding
  79 $encoding=<STDIN>;
  80 $foffset = $foffset + length($encoding);
  81 chomp($encoding);
  82
  83 # read thesaurus line by line
  84 # first line of every block is an entry and meaning count
  85 while ($rec=<STDIN>){
  86     $rl = length($rec);
  87     chomp($rec);
  88     ($entry, $nm) = split('\|',$rec);
  89     $p = 0;
  90     while ($p < $nm) {
  91         $meaning=<STDIN>;
  92         $rl = $rl + length($meaning);
  93         chomp($meaning);
  94         $p++;
  95     }
  96     push(@tindex,"$entry|$foffset");
  97     $ne++;
  98     $foffset = $foffset + $rl;
  99 }
 100
 101 # now we have all of the information
 102 # so sort it and then output the encoding, count and index data
 103 @tindex = sort by_entry @tindex;
 104
 105 print "$outfile\n";
 106 open OUTFILE, ">$outfile" or die "ERROR: Can't open $outfile for writing!";
 107 print OUTFILE "$encoding\n";
 108 print OUTFILE "$ne\n";
 109 foreach $one (@tindex) {
 110     print OUTFILE "$one\n";
 111 }
 112 close OUTFILE;
 113