2 # Copyright (C) 2006 Alex Schroeder <alex@emacswiki.org>
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the
16 # Free Software Foundation, Inc.
17 # 59 Temple Place, Suite 330
18 # Boston, MA 02111-1307 USA
21 use Term
::ProgressBar
;
23 use Unicode
::Normalize
;
26 my $LogFile = 'access.log';
27 my $ReportFile = 'age-vs-popularity.csv';
31 # $UrlFilter must match the requested URL, and $1 must be the pagename
32 my $FreeLinkPattern = "([-,.()' _0-9A-Za-z\x80-\xff]+)";
33 my $UrlFilter = "^/(?:cw|en|de|fr)[/?]$FreeLinkPattern\$";
34 warn "URL filter: $UrlFilter\n";
37 # my $InterSitePattern = '[A-Z\x80-\xff]+[A-Za-z\x80-\xff]+';
41 $str =~ s/%([0-9a-f][0-9a-f])/chr(hex($1))/ge;
48 $line =~ m/"(\S+)\s+(\S+)\s+HTTP\/[10.]+"\s+(\d+)/ or die "Cannot parse
:\n$_";
50 my $url = UrlDecode($2);
52 return unless $type eq 'GET';
53 return unless $code == 200; # Forget 304 Not Modified
54 return $1 if $url =~ m!$UrlFilter!;
56 # return $url if $url =~ m!^/odd/$InterSitePattern/$FreeLinkPattern$!;
63 while ($data =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg) {
64 my ($key, $value) = ($1, $2);
65 $value =~ s/\n\t/\n/g;
66 $result{$key} = $value;
75 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
76 $atime,$mtime,$ctime,$blksize,$blocks)
78 my $progress = Term::ProgressBar->new({name => 'Log',
81 $progress->max_update_rate(1);
84 open(F, $LogFile) or die "Cannot
read $Logfile: $!";
87 my $page = ParseLogLine($_);
90 $next_update = $progress->update($count) if $count++ >= $next_update;
93 $progress->update($size) if $size >= $next_update;
98 my @files = glob("$PageDir/*/*.pg
$PageDir/*/.*.pg
");
99 my $progress = Term::ProgressBar->new({name => 'Pages',
102 $progress->max_update_rate(1);
105 foreach my $file (@files) {
106 next unless $file =~ m|/.*/(.+)\.pg$|;
107 my $page = encode_utf8(NFC(decode_utf8($1))); # normalize on HFS+ filesystems
108 local $/ = undef; # Read complete files
109 open(F, $file) or die "Cannot
read $page file
: $!";
112 my %result = ParseData($data);
113 my $days = ($Now - $result{ts}) / (24 * 60 * 60);
115 $next_update = $progress->update($count) if $count++ >= $next_update;
117 $progress->update($#files) if $#files >= $next_update;
121 open(F, "> $ReportFile") or die "Cannot
write $ReportFile: $!";
122 print F "Days
,Hits
,Name
\n";
123 for my $page (keys %Age) {
124 print F "$Age{$page},$Hits{$page},$page\n";