updated on Thu Jan 12 08:01:00 UTC 2012
[aur-mirror.git] / theyoke / theyoke.txt
blob2ebae52d2fc3618692578838a611910cc9302ca8
1 #!/usr/local/bin/perl
3 # $Id: theyoke 34 2007-02-18 18:50:19Z mackers $
4 # http://www.mackers.com/projects/theyoke/
6 # TheYoke is an ultra-simple RSS aggregator designed for use on the UNIX command line.
8 use strict;
9 use XML::RSS;
10 use URI;
11 use LWP::UserAgent;
12 use Digest::MD5 qw(md5_base64);
13 use Encode qw(encode_utf8);
14 use Term::Size;
15 use Getopt::Long;
16 use HTML::FormatText;
# --- File-level settings and shared state ---------------------------------

# Usage text; printed (via die) when option parsing fails.
my $USAGE = "Usage: $0: [[--debug]|[-d]]+ [--test] [--description] [--link] [--no-title] [--no-feedname] [[--version] [-V]] [[--columns=int] [-c=int]] [--numfeeds=number] [--onlyfeeds=regexp] [--reversetime] [feedurl]...\n";

my $version = "1.23-baka";

# Everything TheYoke stores lives below ~/.theyoke/:
#   feeds    - the list of feed URLs
#   .feeds/  - one small state file per feed
my $config_dir = "$ENV{'HOME'}/.theyoke/";
my $feeds_dir  = "${config_dir}.feeds/";
my $feeds_file = "${config_dir}feeds";

# User-Agent string sent with every request.
my $agent = "TheYoke/" . $version . " (+http://www.mackers.com/projects/theyoke/) ";

my %OPTIONS;        # parsed command-line options
my @feed_urls;      # feeds to process, from @ARGV or the feeds file
my @OUTPUT;         # output lines queued for the feed being processed
my $exit_val = 0;   # process exit status
# --- Command-line options -------------------------------------------------
# Short options may be bundled, names are case-sensitive and may not be
# abbreviated, and parsing stops at the first non-option argument so that
# everything after the options is left in @ARGV as feed URLs.
Getopt::Long::Configure(qw(bundling no_ignore_case no_auto_abbrev no_getopt_compat require_order));

# "debug" and "version" are counters (repeatable); "--title" is accepted
# here although the usage text does not list it.
my @option_specs = (
    'debug|d+',
    'test',
    'description',
    'link',
    'title',
    'no-title',
    'no-feedname',
    'version|V+',
    'columns|c=i',
    'numfeeds=i',
    'onlyfeeds=s',
    'reversetime',
);
GetOptions(\%OPTIONS, @option_specs) or die $USAGE;

# --version / -V: report the version and stop.
if ($OPTIONS{'version'}) {
    print "$0 version $version\n";
    exit(0);
}
### Check for files and dirs. If none, create ###
# Nothing is created when feed URLs were given on the command line
# (@ARGV is non-empty).
unless (-d $config_dir || $#ARGV >= 0) {
    mkdir($config_dir) || die("Couldn't create directory $config_dir: $!");
}
unless (-d $feeds_dir || $#ARGV >= 0) {
    mkdir($feeds_dir) || die("Couldn't create directory $feeds_dir: $!");
}
unless (-f $feeds_file || $#ARGV >= 0) {
    # No feeds file yet: create an empty one and stop, since there is
    # nothing to fetch. Three-argument open keeps unusual characters in the
    # path from being interpreted as an open mode.
    if (open(my $feeds_fh, '>>', $feeds_file)) {
        # TODO print feeds file content comments
        close($feeds_fh);
    } else {
        # Previously this failure was silent.
        print STDERR "theyoke: could not create $feeds_file: $!\n";
    }
    exit(0);
}
### Read feeds file ###
# Fill @feed_urls with URI objects. URLs given on the command line replace
# the feeds file entirely.
if (@ARGV) {
    foreach my $arg (@ARGV) {
        push(@feed_urls, URI->new($arg));
    }
} else {
    if (open(my $feeds_fh, '<', $feeds_file)) {
        while (my $line = <$feeds_fh>) {
            next if ($line =~ /^#/);        # comment line
            next unless ($line =~ /\w/);    # blank line
            # NOTE(review): the line is passed on unchomped, as before; the
            # URI constructor is expected to trim surrounding whitespace -
            # confirm against the URI documentation.
            push(@feed_urls, URI->new($line));
        }
        # The handle used to be left open for the rest of the run.
        close($feeds_fh);
    } else {
        print STDERR "theyoke: could not open $feeds_file\n";
        exit(-1);
    }
}

# An empty feed list is reported but is not fatal; the run simply has
# nothing to do.
if (scalar(@feed_urls) == 0) {
    print STDERR "theyoke: no feeds found. please enter some URLs in $feeds_file (or as command line argument)\n";
}
### Create new files if necessary
# Make sure every feed from the feeds file has a state file, seeding new
# ones with placeholder values. No state files are created when the feeds
# came from the command line.
foreach my $feed_url (@feed_urls) {
    # $feed_url aliases the array element, so this assignment replaces each
    # URI object in @feed_urls with its plain string form.
    $feed_url = $feed_url->as_string;

    my $file_path = $feeds_dir . get_checksum($feed_url);
    next if (-f $file_path || $#ARGV >= 0);

    print STDERR "theyoke: adding feed for $feed_url\n";
    if (open(my $feed_fh, '>', $file_path)) {
        # Five lines: url, last-modified, content digest, title digest, etag.
        print {$feed_fh} "$feed_url\n0\nno_digest_yet\nno_title_yet\nno_etag_yet\n";
        close($feed_fh);
    } else {
        # Name the file that could not be written (the message used to name
        # the feed URL) and say why.
        print STDERR "theyoke: couldn't write to $file_path: $!\n";
    }
}
### Create the user agent ###
# One shared HTTP client for all feeds: proxy settings come from the
# environment, connections are not kept alive, and each request times out
# after 30 seconds.
my %ua_options = (
    agent      => $agent,
    timeout    => 30,
    keep_alive => 0,
    env_proxy  => 1,
);
my $ua = LWP::UserAgent->new(%ua_options);
### For each feed file ###
# Fetch every feed, queue and print the items that are new since the last
# run, then record the feed's new state.
#
# Per-feed state is a five-line file in $feeds_dir, named after the checksum
# of the feed URL:
#   1. feed URL
#   2. last-modified time   (sent back as If-Modified-Since)
#   3. checksum of the whole feed body
#   4. checksum of the newest item already shown
#   5. ETag                 (sent back as If-None-Match)
# State is neither read nor written when the feeds came from the command line.
my $count = 0;
my $dont_have_content = 1;    # cleared as soon as any feed yields a new item
if ($OPTIONS{'debug'} && defined($OPTIONS{'numfeeds'})) {
    print STDERR "theyoke: Syndicating first $OPTIONS{'numfeeds'} feeds.\n";
}
foreach my $feed_url (@feed_urls) {
    # Stop after --numfeeds feeds; feeds skipped by --onlyfeeds below still
    # count towards the limit.
    last if (defined($OPTIONS{'numfeeds'}) && $count++ == $OPTIONS{'numfeeds'});

    if ($OPTIONS{'onlyfeeds'} && $feed_url !~ /$OPTIONS{'onlyfeeds'}/) {
        print STDERR "theyoke: Skipping... not in /$OPTIONS{'onlyfeeds'}/\n" if ($OPTIONS{'debug'});
        next;
    }

    ### get the filename
    my $file_digest_path = $feeds_dir . get_checksum($feed_url);

    #### open the file
    # Defaults used when there is no stored state (command-line mode).
    my $previous_content_digest = "no_digest_yet";
    my $last_title = "no_title_yet";
    my $last_mod = 0;
    my $etag = "no_etag_yet";
    if ($#ARGV < 0) {
        open(my $state_fh, '<', $file_digest_path)
            || die("theyoke: couldn't open $file_digest_path");
        my @state = <$state_fh>;
        close($state_fh);
        chomp(@state);
        # Line 1 (the url) is not needed; missing lines come back undefined.
        ($last_mod, $previous_content_digest, $last_title, $etag) = @state[1 .. 4];

        if ($previous_content_digest eq "" || $last_title eq "" || $etag eq "") {
            print STDERR "theyoke: $file_digest_path is corrupt or you're using a new version of theyoke. will regenerate next time...\n";
            unlink $file_digest_path;
            next;
        }
    }

    ### send request to see if not modified
    $| = 1;
    print STDERR "theyoke: Getting \"$feed_url\" - " if ($OPTIONS{'debug'});
    my $head = HTTP::Headers->new;
    $head->if_modified_since($last_mod);
    $head->push_header("If-None-Match" => $etag);
    my $req = HTTP::Request->new("GET", $feed_url, $head);
    print STDERR $req->as_string if ($OPTIONS{'debug'} > 1);
    my $resp = $ua->request($req);
    my $content;
    if ($resp->code == 304) {
        print STDERR " got a 304, skipping\n" if ($OPTIONS{'debug'});
        $| = 0;
        next;
    } elsif ($resp->is_success) {
        print STDERR " got " . $resp->code . "\n" if ($OPTIONS{'debug'});
        $content = $resp->content();
        $| = 0;
    } else {
        print STDERR "theyoke: \"$feed_url\": got " . $resp->code . ", skipping\n";
        $| = 0;
        next;
    }

    ### skip if checksums match (i.e. head lied - no new content)
    my $new_last_title = "";
    my $new_content_digest = get_checksum($content);
    if ($new_content_digest eq $previous_content_digest) {
        print STDERR "theyoke: checksums match, skipping\n" if ($OPTIONS{'debug'});
    } else {

        ### new content - parse the rss
        my $rss = XML::RSS->new;

        # The parser reports bad input by die()ing (per the original
        # author's note), so trap that and skip the feed.
        eval {
            $rss->parse($content);
        };
        if ($@) {
            print STDERR "theyoke: RSS parser error on \"$feed_url\": $@\n";
            next;
        }

        my $rss_title = $rss->channel('title');

        ### check for no items
        if (@{$rss->{'items'}} == 0) {
            print STDERR "theyoke: no RSS items found in \"$feed_url\". bad RSS?\n";
            next;
        }

        ### check for no title
        if ($rss_title eq "") {
            print STDERR "theyoke: no channel title found for \"$feed_url\". bad RSS?\n";
            next;
        }

        ### look for new items
        # Items are walked in feed order until the one shown last time is
        # reached; everything before it is treated as new.
        foreach my $item (@{$rss->{'items'}}) {
            my $this_description = $item->{'description'};
            my $this_title = $item->{'title'};
            my $this_link = $item->{'link'};

            # An item is identified by its title, else its description, else
            # its link; items with none of the three are ignored.
            my $wassname = "";
            if ($this_title ne "") {
                $wassname = $this_title;
            } elsif ($this_description ne "") {
                $wassname = $this_description;
            } elsif ($this_link ne "") {
                $wassname = $this_link;
            } else {
                next;
            }

            my $this_wassname_digest = get_checksum($wassname);
            if ($this_wassname_digest ne $last_title) {
                # aha! new content
                # Width comes from the terminal unless --columns is given;
                # anything unusably small means "do not truncate".
                my ($columns) = Term::Size::chars *STDOUT{IO};
                $columns = $OPTIONS{'columns'} if (defined($OPTIONS{'columns'}));
                $columns = 32768 if ($columns < 10);

                # Flatten the identifying text to a single plain-text line.
                $wassname = HTML::FormatText->format_string($wassname);
                $wassname =~ s/[\r\n]/ /g;
                $wassname =~ s/\s+$//g;

                # Headline: "<feed title>: <item title>", cut to the width.
                my $printy = "";

                if (!$OPTIONS{'no-feedname'} && $this_title) {
                    $printy .= "$rss_title: ";
                }

                if (!$OPTIONS{'no-title'} && $this_title) {
                    $printy .= $wassname;
                    if (length($printy) > $columns - 4) {
                        $printy = substr($printy, 0, $columns - 4) . "...";
                    }
                }

                if ($printy ne "") {
                    push(@OUTPUT, $printy . "\n");
                }

                # Optional extra lines, indented unless the feed name is off.
                if ($OPTIONS{'description'} && $this_title) {
                    $this_description = HTML::FormatText->format_string($this_description);
                    $this_description =~ s/[\r\n]\s*/\n\t/g;
                    $this_description = "\t$this_description" if (!$OPTIONS{'no-feedname'});
                    push(@OUTPUT, "$this_description\n");
                }

                if ($OPTIONS{'link'} && $this_title) {
                    $this_link = "\t$this_link" if (!$OPTIONS{'no-feedname'});
                    push(@OUTPUT, "$this_link\n");
                }

                $dont_have_content = 0;

                # save latest title: only the first new item is remembered
                # as the marker for the next run
                if ($new_last_title eq "") {
                    $new_last_title = $this_wassname_digest;
                }
            } else {
                # Reached the item shown last time.
                last;
            }
        }

        # check for badness
        if ($new_content_digest eq "") {
            print STDERR "theyoke: empty badness for new_content_digest on $feed_url\n";
            next;
        }
    }

    # check for changed rss file but not changed headings
    if ($new_last_title eq "") {
        if ($OPTIONS{'debug'}) {
            if ($new_content_digest ne $previous_content_digest) {
                print STDERR "theyoke: checksums don't match, but ";
            } else {
                print STDERR "theyoke: ";
            }
            print STDERR "no new headlines from on $feed_url\n";
        }
        # Keep the previous marker so the next run compares against it.
        $new_last_title = $last_title;
    }

    # Flush this feed's queued lines; --reversetime prints them in the
    # opposite order.
    if (@OUTPUT) {
        if ($OPTIONS{'reversetime'}) {
            print reverse @OUTPUT;
        } else {
            print @OUTPUT;
        }
        @OUTPUT = ();
    }

    ### save checksum
    # State is kept only in feeds-file mode and never under --test. This
    # test used to read "$ARGV < 1"; $ARGV is the unrelated "current <> file
    # name" variable and is undefined here, so the test was always true and
    # command-line mode tried to write state files too.
    if ($#ARGV < 0 && !$OPTIONS{'test'}) {
        open(my $state_fh, '>', $file_digest_path)
            || die("Couldn't write to $file_digest_path: $!");

        # Fall back to the response date when there is no Last-Modified
        # header, and to a placeholder when there is no ETag.
        my $saved_mod  = $resp->last_modified || $resp->date;
        my $saved_etag = $resp->header("ETag") || "no_etag";

        print {$state_fh} $feed_url . "\n";             # url
        print {$state_fh} $saved_mod . "\n";            # last mod
        print {$state_fh} $new_content_digest . "\n";   # content checksum
        print {$state_fh} $new_last_title . "\n";       # title checksum
        print {$state_fh} $saved_etag . "\n";           # etag

        # Buffered write errors only show up when the handle is closed.
        close($state_fh) || die("Couldn't write to $file_digest_path: $!");
    }
}
# Exit status 2 reports that no feed produced a new item during this run.
if ($dont_have_content) {
    $exit_val = 2;
}

exit($exit_val);
# get_checksum($text)
#
# Returns the base64 MD5 digest of the UTF-8 encoding of $text, with every
# non-word character replaced by "_". The result is used both as a state
# file name and as a content fingerprint. When --debug is given more than
# once, the mapping is also reported on STDERR.
sub get_checksum {
    my ($text) = @_;

    my $digest = md5_base64(encode_utf8($text));
    # The digest is plain ASCII, so this replaces exactly what \W matches.
    $digest =~ tr/A-Za-z0-9_/_/c;

    if ($OPTIONS{'debug'} > 1) {
        print STDERR $text . " encoding as $digest\n";
    }

    return $digest;
}