Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / bin / text_cat / text_cat
blob 74dae861d802f712da4c01882ae397008ed01aaa
#!/usr/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl
use strict;
# Option flags/values filled in by getopts() below.  'our' replaces the
# obsolete 'use vars' pragma; these are the same package variables with the
# same behaviour.
our ($opt_d, $opt_f, $opt_h, $opt_i, $opt_l, $opt_n, $opt_s, $opt_t, $opt_v, $opt_u, $opt_a);
use Getopt::Std;
use Benchmark;

my $non_word_characters = '0-9\s';  # characters that terminate a word/ngram
my @languages;                      # languages (sorted by name)
my %ngram_for;                      # map language x ngram => rang

# OPTIONS
getopts('a:d:f:hi:lnst:u:v');

# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;    # give up when more than this many languages tie
$opt_d ||= '/users1/vannoord/Perl/TextCat/LM';    # language-model directory
$opt_f ||= 0;     # drop ngrams occurring this many times or fewer
$opt_t ||= 400;   # number of topmost ngrams to compare
$opt_u ||= 1.05;  # score ratio above which alternatives are suppressed

$| = 1; # auto-flush stdout
# Print the usage/help text to stdout.  The message interpolates the current
# option values ($opt_a, $opt_d, $opt_f, $opt_u), so it must be called after
# the defaults have been set.  (The visible original left the heredoc print
# as an unterminated statement; the terminator and closing brace are
# restored here.)
sub help {
  print <<HELP;
Text Categorization. Typically used to determine the language of a
given document.

Usage
-----

* print help message:

$0 -h

* for guessing:

$0 [-a Int] [-d Dir] [-f Int] [-i N] [-l] [-t Int] [-u Int] [-v]

-a the program returns the best-scoring language together
with all languages which are $opt_u times worse (cf option -u).
If the number of languages to be printed is larger than the value
of this option (default: $opt_a) then no language is returned, but
instead a message that the input is of an unknown language is
printed. Default: $opt_a.
-d indicates in which directory the language models are
located (files ending in .lm). Currently only a single
directory is supported. Default: $opt_d.
-f Before sorting is performed the Ngrams which occur this number
of times or less are removed. This can be used to speed up
the program for longer inputs. For short inputs you should use
-f 0.
Default: $opt_f.
-i N only read first N lines
-l indicates that input is given as an argument on the command line,
e.g. text_cat -l "this is english text"
Cannot be used in combination with -n.
-s Determine language of each line of input. Not very efficient yet,
because language models are re-loaded after each line.
-t indicates the topmost number of ngrams that should be used.
If used in combination with -n this determines the size of the
output. If used with categorization this determines
the number of ngrams that are compared with each of the language
models (but each of those models is used completely).
-u determines how much worse result must be in order not to be
mentioned as an alternative. Typical value: 1.05 or 1.1.
Default: $opt_u.
-v verbose. Continuation messages are written to standard error.

* for creating new language model, based on text read from standard input:

$0 -n [-v]

-v verbose. Continuation messages are written to standard error.

HELP
  return;
}
# MAIN: dispatch on the mode options.
#   -h : print help and exit
#   -n : build a new language model from the input and print ngram\tcount
#   -l : classify the text given as the first command-line argument
#   -s : classify each input line separately (models are re-used, but
#        ngrams are rebuilt per line)
#   default : classify the whole input at once
if ($opt_h) {
  help();
  exit 0;
}

if ($opt_n) {
  my %ngram = ();
  my @result = create_lm( input(), \%ngram );
  print join( "\n", map { "$_\t $ngram{$_}" } @result ), "\n";
}
elsif ($opt_l) {
  classify( $ARGV[0] );
}
elsif ($opt_s) {
  while (<>) {
    chomp;
    classify($_);
  }
}
else {
  classify( input() );
}
# Read a single language model file.
# A model file lists one ngram per line, most frequent first; the 1-based
# line position becomes the ngram's rank.  Lines that do not start with an
# ngram character (cf. $non_word_characters) are ignored.
# Returns a reference to a hash mapping ngram => rank.
sub read_model {
  my ($file) = @_;
  # three-arg open with a lexical handle (the original used a bareword
  # filehandle and two-arg open)
  open my $fh, '<', $file or die "cannot open $file: $!\n";
  my %ngram;
  my $rank = 1;
  while ( my $line = <$fh> ) {
    chomp $line;
    # only use lines starting with an appropriate character; others are
    # ignored
    if ( $line =~ /^[^$non_word_characters]+/o ) {
      $ngram{$&} = $rank++;
    }
  }
  close $fh;
  return \%ngram;
}
# Scan $opt_d for readable *.lm files, store the sorted language names in
# @languages, and load each model into %ngram_for via read_model().
# Dies when the directory cannot be opened or contains no readable models.
sub read_models {
  # open directory to find which languages are supported
  opendir my $dh, $opt_d or die "directory $opt_d: $!\n";
  # keep readable model files, then strip the .lm suffix (anchored, and on a
  # copy — the original s/// mutated the readdir aliases in place)
  @languages = sort
               map  { ( my $name = $_ ) =~ s/\.lm$//; $name }
               grep { /\.lm$/ && -r "$opt_d/$_" } readdir $dh;
  closedir $dh;
  @languages
    or die "sorry, can't read any language models from $opt_d\n"
         . "language models must reside in files with .lm ending\n";

  foreach my $language (@languages) {
    $ngram_for{$language} = read_model("$opt_d/$language.lm");
  }
}
# CLASSIFICATION
# Guess the language of $input and print the result to stdout.
# Every language model is scored against the input's top ngrams (lower
# score is better): an ngram found in the model contributes its rank
# distance, a missing ngram contributes the maximum penalty $opt_t.
# Prints the winner plus all languages scoring within a factor $opt_u of
# it, or an "unknown language" message when more than $opt_a candidates
# remain.
sub classify {
  my ($input)  = @_;
  my %results  = ();
  my $maxp     = $opt_t;    # penalty for an ngram absent from a model
  read_models() if !@languages;    # lazy-load the models on first use

  # create ngrams for input. Note that the ngram-count hash is not used
  # here; the actual counts are only needed under -n: creating a new
  # language model (and even then they are not really required).
  my @unknown = create_lm($input);

  my $t1 = Benchmark->new;
  foreach my $language (@languages) {
    # compares the language model with input ngrams list
    my $model = $ngram_for{$language} or die "no ngrams for $language";

    my $penalty = 0;
    foreach my $pos ( 0 .. $#unknown ) {
      if ( $model->{ $unknown[$pos] } ) {
        # known ngram: penalised by how far its rank is from its position
        $penalty += abs( $model->{ $unknown[$pos] } - $pos );
      }
      else {
        # unknown ngram: maximum penalty
        $penalty += $maxp;
      }
    }
    #print STDERR "$language: $penalty\n" if $opt_v;
    $results{$language} = $penalty;
  }

  print STDERR "read language models done ("
    . timestr( timediff( Benchmark->new, $t1 ) )
    . ".\n"
    if $opt_v;

  my @results = sort { $results{$a} <=> $results{$b} } keys %results;
  print join( "\n", map { "$_\t $results{$_}" } @results ), "\n" if $opt_v;

  # NOTE: the original stored this in `my $a`, shadowing sort's package
  # variable $a — renamed to avoid that trap.
  my $best_score = $results{ $results[0] };

  my @answers = ( shift @results );
  while ( @results && $results{ $results[0] } < $opt_u * $best_score ) {
    push @answers, shift @results;
  }
  if ( @answers > $opt_a ) {
    print "I don't know; "
      . "Perhaps this is a language I haven't seen before?\n";
  }
  else {
    print join( " or ", @answers ), "\n";
  }
}
# Read the text to be classified.
# With -i N, read at most the first N lines of the input; otherwise slurp
# everything from <> at once.  Returns the text as a single string; dies on
# empty input.
# (The comment formerly here — "first and only argument is reference to
# hash..." — described create_lm below, not this sub.)
sub input {
  my $read = "";
  if ($opt_i) {
    while (<>) {
      if ( $. == $opt_i ) {    # reached line N: include it and stop
        return $read . $_;
      }
      $read .= $_;
    }
    # input was shorter than N lines
    return $read;
  }
  else {
    local $/;    # slurp mode; 'local' keeps the change from leaking
    $read = <>;  # swallow input.
    # check definedness/length rather than truth: the original `$read ||
    # die` would wrongly reject an input consisting of the single char "0"
    defined $read && length $read
      or die "determining the language of an empty file is hard...\n";
    return $read;
  }
}
# Build a language model from a text.
# Arguments: the text (a string) and, optionally, a reference to a hash
# that receives the raw ngram counts (only needed under -n; classification
# ignores the counts).  For every word, all substrings of length 1..5 of
# "_word_" are counted; ngrams occurring $opt_f times or fewer are dropped;
# the (at most $opt_t) most frequent ngrams are returned, sorted by
# descending frequency.
sub create_lm {
  my $t1 = Benchmark->new;
  my $ngram;
  ( $_, $ngram ) = @_;    # $ngram contains reference to the hash we build
  $ngram ||= {};          # callers may omit it (autovivified in the original)

  # add the ngrams found in each word to the hash
  foreach my $word ( split /[$non_word_characters]+/ ) {
    $word = "_" . $word . "_";    # mark word boundaries
    my $len = length $word;
    for my $i ( 0 .. $len - 1 ) {
      my $rest = $len - $i;    # characters available from position $i
      $ngram->{ substr( $word, $i, 5 ) }++ if $rest > 4;
      $ngram->{ substr( $word, $i, 4 ) }++ if $rest > 3;
      $ngram->{ substr( $word, $i, 3 ) }++ if $rest > 2;
      $ngram->{ substr( $word, $i, 2 ) }++ if $rest > 1;
      $ngram->{ substr( $word, $i, 1 ) }++;
    }
  }

  my $t2 = Benchmark->new;
  print STDERR "count_ngrams done ("
    . timestr( timediff( $t2, $t1 ) ) . ").\n"
    if $opt_v;

  # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up sorting
  # by removing low-frequency ngrams (the original used a void-context map;
  # with -f 0 nothing can match, so the pass is skipped entirely).
  # However, results for short inputs are very bad this way.
  if ($opt_f) {
    foreach my $key ( keys %$ngram ) {
      delete $ngram->{$key} if $ngram->{$key} <= $opt_f;
    }
  }

  # sort the ngrams, and spit out the $opt_t most frequent ones.
  # adding `or $a cmp $b' in the sort block makes sorting five times
  # slower..., although it would be somewhat nicer (unique result)
  my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
  splice( @sorted, $opt_t ) if @sorted > $opt_t;
  print STDERR "sorting done ("
    . timestr( timediff( Benchmark->new, $t2 ) ) . ").\n"
    if $opt_v;
  return @sorted;
}