Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / bin / text_cat / text_cat
blob 74dae861d802f712da4c01882ae397008ed01aaa
#!/usr/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl
use strict;
# Option flags/values filled in by getopts() below.  'our' replaces the
# obsolete 'use vars' pragma; these are the same package variables with the
# same behaviour.
our ($opt_d, $opt_f, $opt_h, $opt_i, $opt_l, $opt_n, $opt_s, $opt_t, $opt_v, $opt_u, $opt_a);
use Getopt::Std;
use Benchmark;

my $non_word_characters = '0-9\s';  # characters that terminate a word/ngram
my @languages;                      # languages (sorted by name)
my %ngram_for;                      # map language x ngram => rang

# OPTIONS
getopts('a:d:f:hi:lnst:u:v');

# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;    # give up when more than this many languages tie
$opt_d ||= '/users1/vannoord/Perl/TextCat/LM';    # language-model directory
$opt_f ||= 0;     # drop ngrams occurring this many times or fewer
$opt_t ||= 400;   # number of topmost ngrams to compare
$opt_u ||= 1.05;  # score ratio above which alternatives are suppressed

$| = 1; # auto-flush stdout
# Print the usage/help text to stdout.  The message interpolates the current
# option values ($opt_a, $opt_d, $opt_f, $opt_u), so it must be called after
# the defaults have been set.  (The visible original left the heredoc print
# as an unterminated statement; the terminator and closing brace are
# restored here.)
sub help {
  print <<HELP;
Text Categorization. Typically used to determine the language of a
given document.

Usage
-----

* print help message:

$0 -h

* for guessing:

$0 [-a Int] [-d Dir] [-f Int] [-i N] [-l] [-t Int] [-u Int] [-v]

-a the program returns the best-scoring language together
with all languages which are $opt_u times worse (cf option -u).
If the number of languages to be printed is larger than the value
of this option (default: $opt_a) then no language is returned, but
instead a message that the input is of an unknown language is
printed. Default: $opt_a.
-d indicates in which directory the language models are
located (files ending in .lm). Currently only a single
directory is supported. Default: $opt_d.
-f Before sorting is performed the Ngrams which occur this number
of times or less are removed. This can be used to speed up
the program for longer inputs. For short inputs you should use
-f 0.
Default: $opt_f.
-i N only read first N lines
-l indicates that input is given as an argument on the command line,
e.g. text_cat -l "this is english text"
Cannot be used in combination with -n.
-s Determine language of each line of input. Not very efficient yet,
because language models are re-loaded after each line.
-t indicates the topmost number of ngrams that should be used.
If used in combination with -n this determines the size of the
output. If used with categorization this determines
the number of ngrams that are compared with each of the language
models (but each of those models is used completely).
-u determines how much worse result must be in order not to be
mentioned as an alternative. Typical value: 1.05 or 1.1.
Default: $opt_u.
-v verbose. Continuation messages are written to standard error.

* for creating new language model, based on text read from standard input:

$0 -n [-v]

-v verbose. Continuation messages are written to standard error.

HELP
  return;
}
# MAIN: dispatch on the mode options.
#   -h : print help and exit
#   -n : build a new language model from the input and print ngram\tcount
#   -l : classify the text given as the first command-line argument
#   -s : classify each input line separately (models are re-used, but
#        ngrams are rebuilt per line)
#   default : classify the whole input at once
if ($opt_h) {
  help();
  exit 0;
}

if ($opt_n) {
  my %ngram = ();
  my @result = create_lm( input(), \%ngram );
  print join( "\n", map { "$_\t $ngram{$_}" } @result ), "\n";
}
elsif ($opt_l) {
  classify( $ARGV[0] );
}
elsif ($opt_s) {
  while (<>) {
    chomp;
    classify($_);
  }
}
else {
  classify( input() );
}
# Read a single language model file.
# A model file lists one ngram per line, most frequent first; the 1-based
# line position becomes the ngram's rank.  Lines that do not start with an
# ngram character (cf. $non_word_characters) are ignored.
# Returns a reference to a hash mapping ngram => rank.
sub read_model {
  my ($file) = @_;
  # three-arg open with a lexical handle (the original used a bareword
  # filehandle and two-arg open)
  open my $fh, '<', $file or die "cannot open $file: $!\n";
  my %ngram;
  my $rank = 1;
  while ( my $line = <$fh> ) {
    chomp $line;
    # only use lines starting with an appropriate character; others are
    # ignored
    if ( $line =~ /^[^$non_word_characters]+/o ) {
      $ngram{$&} = $rank++;
    }
  }
  close $fh;
  return \%ngram;
}
# Scan $opt_d for readable *.lm files, store the sorted language names in
# @languages, and load each model into %ngram_for via read_model().
# Dies when the directory cannot be opened or contains no readable models.
sub read_models {
  # open directory to find which languages are supported
  opendir my $dh, $opt_d or die "directory $opt_d: $!\n";
  # keep readable model files, then strip the .lm suffix (anchored, and on a
  # copy — the original s/// mutated the readdir aliases in place)
  @languages = sort
               map  { ( my $name = $_ ) =~ s/\.lm$//; $name }
               grep { /\.lm$/ && -r "$opt_d/$_" } readdir $dh;
  closedir $dh;
  @languages
    or die "sorry, can't read any language models from $opt_d\n"
         . "language models must reside in files with .lm ending\n";

  foreach my $language (@languages) {
    $ngram_for{$language} = read_model("$opt_d/$language.lm");
  }
}
# CLASSIFICATION
# Guess the language of $input and print the result to stdout.
# Every language model is scored against the input's top ngrams (lower
# score is better): an ngram found in the model contributes its rank
# distance, a missing ngram contributes the maximum penalty $opt_t.
# Prints the winner plus all languages scoring within a factor $opt_u of
# it, or an "unknown language" message when more than $opt_a candidates
# remain.
sub classify {
  my ($input)  = @_;
  my %results  = ();
  my $maxp     = $opt_t;    # penalty for an ngram absent from a model
  read_models() if !@languages;    # lazy-load the models on first use

  # create ngrams for input. Note that the ngram-count hash is not used
  # here; the actual counts are only needed under -n: creating a new
  # language model (and even then they are not really required).
  my @unknown = create_lm($input);

  my $t1 = Benchmark->new;
  foreach my $language (@languages) {
    # compares the language model with input ngrams list
    my $model = $ngram_for{$language} or die "no ngrams for $language";

    my $penalty = 0;
    foreach my $pos ( 0 .. $#unknown ) {
      if ( $model->{ $unknown[$pos] } ) {
        # known ngram: penalised by how far its rank is from its position
        $penalty += abs( $model->{ $unknown[$pos] } - $pos );
      }
      else {
        # unknown ngram: maximum penalty
        $penalty += $maxp;
      }
    }
    #print STDERR "$language: $penalty\n" if $opt_v;
    $results{$language} = $penalty;
  }

  print STDERR "read language models done ("
    . timestr( timediff( Benchmark->new, $t1 ) )
    . ".\n"
    if $opt_v;

  my @results = sort { $results{$a} <=> $results{$b} } keys %results;
  print join( "\n", map { "$_\t $results{$_}" } @results ), "\n" if $opt_v;

  # NOTE: the original stored this in `my $a`, shadowing sort's package
  # variable $a — renamed to avoid that trap.
  my $best_score = $results{ $results[0] };

  my @answers = ( shift @results );
  while ( @results && $results{ $results[0] } < $opt_u * $best_score ) {
    push @answers, shift @results;
  }
  if ( @answers > $opt_a ) {
    print "I don't know; "
      . "Perhaps this is a language I haven't seen before?\n";
  }
  else {
    print join( " or ", @answers ), "\n";
  }
}
# Read the text to be classified.
# With -i N, read at most the first N lines of the input; otherwise slurp
# everything from <> at once.  Returns the text as a single string; dies on
# empty input.
# (The comment formerly here — "first and only argument is reference to
# hash..." — described create_lm below, not this sub.)
sub input {
  my $read = "";
  if ($opt_i) {
    while (<>) {
      if ( $. == $opt_i ) {    # reached line N: include it and stop
        return $read . $_;
      }
      $read .= $_;
    }
    # input was shorter than N lines
    return $read;
  }
  else {
    local $/;    # slurp mode; 'local' keeps the change from leaking
    $read = <>;  # swallow input.
    # check definedness/length rather than truth: the original `$read ||
    # die` would wrongly reject an input consisting of the single char "0"
    defined $read && length $read
      or die "determining the language of an empty file is hard...\n";
    return $read;
  }
}
# Build a language model from a text.
# Arguments: the text (a string) and, optionally, a reference to a hash
# that receives the raw ngram counts (only needed under -n; classification
# ignores the counts).  For every word, all substrings of length 1..5 of
# "_word_" are counted; ngrams occurring $opt_f times or fewer are dropped;
# the (at most $opt_t) most frequent ngrams are returned, sorted by
# descending frequency.
sub create_lm {
  my $t1 = Benchmark->new;
  my $ngram;
  ( $_, $ngram ) = @_;    # $ngram contains reference to the hash we build
  $ngram ||= {};          # callers may omit it (autovivified in the original)

  # add the ngrams found in each word to the hash
  foreach my $word ( split /[$non_word_characters]+/ ) {
    $word = "_" . $word . "_";    # mark word boundaries
    my $len = length $word;
    for my $i ( 0 .. $len - 1 ) {
      my $rest = $len - $i;    # characters available from position $i
      $ngram->{ substr( $word, $i, 5 ) }++ if $rest > 4;
      $ngram->{ substr( $word, $i, 4 ) }++ if $rest > 3;
      $ngram->{ substr( $word, $i, 3 ) }++ if $rest > 2;
      $ngram->{ substr( $word, $i, 2 ) }++ if $rest > 1;
      $ngram->{ substr( $word, $i, 1 ) }++;
    }
  }

  my $t2 = Benchmark->new;
  print STDERR "count_ngrams done ("
    . timestr( timediff( $t2, $t1 ) ) . ").\n"
    if $opt_v;

  # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up sorting
  # by removing low-frequency ngrams (the original used a void-context map;
  # with -f 0 nothing can match, so the pass is skipped entirely).
  # However, results for short inputs are very bad this way.
  if ($opt_f) {
    foreach my $key ( keys %$ngram ) {
      delete $ngram->{$key} if $ngram->{$key} <= $opt_f;
    }
  }

  # sort the ngrams, and spit out the $opt_t most frequent ones.
  # adding `or $a cmp $b' in the sort block makes sorting five times
  # slower..., although it would be somewhat nicer (unique result)
  my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
  splice( @sorted, $opt_t ) if @sorted > $opt_t;
  print STDERR "sorting done ("
    . timestr( timediff( Benchmark->new, $t2 ) ) . ").\n"
    if $opt_v;
  return @sorted;
}