cgi-bin/parsefeed.pl

   1 #!/usr/bin/perl
   2
   3 use strict;
   4
   5 package LJ::ParseFeed;
   6
   7 use XML::RSS;
   8 use XML::Parser;
   9
  10
  11 # <LJFUNC>
  12 # name: LJ::ParseFeed::parse_feed
  13 # des: Parses an RSS/Atom feed.
  14 # class:
  15 # args: content, type?
  16 # des-content: Feed content.
  17 # des-type: Optional; can be "atom" or "rss".
  18 #           If type isn't supplied, the function will try to guess it
  19 #           based on contents.
  20 # info: items - An arrayref of item hashes, in the same order they were
  21 #       in the feed.
  22 #       Each item contains: link - URL of the item; id - unique identifier (optional);
  23 #        text - text of the item; subject - subject;
  24 #        time - in format 'yyyy-mm-dd hh:mm' (optional).
  25 # returns: Three arguments: $feed, $error, arrayref of items.
  26 #          $feed, which is a hash with the following keys:
  27 #          type - 'atom' or 'rss'; version - version of the feed in its
  28 #          standard; link - URL of the feed; title - title of the feed;
  29 #           description - description of the feed.
  30 #           The second argument returned is $error, which, if defined, is a
  31 #           human-readable error string. The third argument is an
  32 #           arrayref of items, same as $feed->{'items'}.
  33 # </LJFUNC>
  34 sub parse_feed
  35 {
  36     my ($content, $type) = @_;
  37     my ($feed, $items, $error);
  38     my $parser;
  39
  40     # is it RSS or Atom?
  41     # Atom feeds are rare for now, so prefer to err in favor of RSS
  42     # simple heuristic: Atom feeds will have '<feed' somewhere
  43     # TODO: maybe store the feed's type on creation in a userprop and not guess here
  44
  45     if ($type eq 'atom' || $content =~ m!\<feed!) {
  46         # try treating it as an atom feed
  47         $parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');
  48         return ("", "failed to create XML parser") unless $parser;
  49         eval {
  50             $parser->parse($content);
  51         };
  52         if ($@) {
  53             $error = "XML parser error: $@";
  54         } else {
  55             ($feed, $items, $error) = LJ::ParseFeed::Atom::results();
  56         };
  57
  58         if ($feed || $type eq 'atom') {
  59             # there was a top-level <feed> there, or we're forced to treat
  60             # as an Atom feed, so even if $error is set,
  61             # don't try RSS
  62             $feed->{'type'} = 'atom';
  63             return ($feed, $error, $items);
  64         }
  65     }
  66
  67     # try parsing it as RSS
  68     $parser = new XML::RSS;
  69     return ("", "failed to create RSS parser") unless $parser;
  70     eval {
  71         $parser->parse($content);
  72     };
  73     if ($@) {
  74         $error = "RSS parser error: $@";
  75         return ("", $error);
  76     }
  77
  78     $feed = {};
  79     $feed->{'type'} = 'rss';
  80     $feed->{'version'} = $parser->{'version'};
  81
  82     foreach (qw (link title description)) {
  83         $feed->{$_} = $parser->{'channel'}->{$_}
  84             if $parser->{'channel'}->{$_};
  85     }
  86
  87     $feed->{'items'} = [];
  88
  89     foreach(@{$parser->{'items'}}) {
  90         my $item = {};
  91         $item->{'subject'} = $_->{'title'};
  92         $item->{'text'} = $_->{'description'};
  93         $item->{'link'} = $_->{'link'} if $_->{'link'};
  94         $item->{'id'} = $_->{'guid'} if $_->{'guid'};
  95
  96         my $nsdc = 'http://purl.org/dc/elements/1.1/';
  97         my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
  98         if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
  99             # prefer content:encoded if present
 100             $item->{'text'} = $_->{$nsenc}->{'encoded'}
 101                 if defined $_->{$nsenc}->{'encoded'};
 102         }
 103
 104         if ($_->{'pubDate'}) {
 105             my $time = time822_to_time($_->{'pubDate'});
 106             $item->{'time'} = $time if $time;
 107         }
 108         if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
 109             if ($_->{$nsdc}->{date}) {
 110                 my $time = w3cdtf_to_time($_->{$nsdc}->{date});
 111                 $item->{'time'} = $time if $time;
 112             }
 113         }
 114         push @{$feed->{'items'}}, $item;
 115     }
 116
 117     return ($feed, undef, $feed->{'items'});
 118 }
 119
 120 # convert rfc822-time in RSS's <pubDate> to our time
 121 # see http://www.faqs.org/rfcs/rfc822.html
 122 # RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
 123 # but real RSS2.0 feeds apparently use 4 digits.
 124 sub time822_to_time {
 125     my $t822 = shift;
 126     # remove day name if present
 127     $t822 =~ s/^\s*\w+\s*,//;
 128     # remove whitespace
 129     $t822 =~ s/^\s*//;
 130     # break it up
 131     if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
 132         my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
 133         $day = "0" . $day if length($day) == 1;
 134         $hour = "0" . $hour if length($hour) == 1;
 135         $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
 136                 'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
 137                 'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
 138         return undef unless $mon;
 139         return "$year-$mon-$day $hour:$min";
 140     } else {
 141         return undef;
 142     }
 143 }
 144
 145 # convert W3C-DTF to our internal format
 146 # see http://www.w3.org/TR/NOTE-datetime
 147 # Based very loosely on code from DateTime::Format::W3CDTF,
 148 # which isn't stable yet so we can't use it directly.
 149 sub w3cdtf_to_time {
 150     my $tw3 = shift;
 151
 152     # TODO: Should somehow return the timezone offset
 153     #   so that it can stored... but we don't do timezones
 154     #   yet anyway. For now, just strip the timezone
 155     #   portion if it is present, along with the decimal
 156     #   fractions of a second.
 157
 158     $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
 159     $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace
 160
 161     # We can only use complete times, so anything which
 162     # doesn't feature the time part is considered invalid.
 163
 164     # This is working around clients that don't implement W3C-DTF
 165     # correctly, and only send single digit values in the dates.
 166     # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
 167     # If it's more messed up than that, reject it outright.
 168     $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
 169         or return undef;
 170
 171     my %pd; # parsed date
 172     $pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;
 173     $pd{h} = $4; $pd{m} = $5; $pd{s} = $6;
 174
 175     # force double digits
 176     foreach (qw/ M D h m s /) {
 177         next unless defined $pd{$_};
 178         $pd{$_} = sprintf "%02d", $pd{$_};
 179     }
 180
 181     return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
 182                     "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
 183 }
 184
 185 package LJ::ParseFeed::Atom;
 186
 187 our ($feed, $item, $data);
 188 our ($ddepth, $dholder); # for accumulating;
 189 our @items;
 190 our $error;
 191
 192 sub err {
 193     $error = shift unless $error;
 194 }
 195
 196 sub results {
 197     return ($feed, \@items, $error);
 198 }
 199
 200 # $name under which we'll store accumulated data may be different
 201 # from $tag which causes us to store it
 202 # $name may be a scalarref pointing to where we should store
 203 # swallowing is achieved by calling startaccum('');
 204
 205 sub startaccum {
 206     my $name = shift;
 207
 208     return err("Tag found under neither <feed> nor <entry>")
 209         unless $feed || $item;
 210     $data = ""; # defining $data triggers accumulation
 211     $ddepth = 1;
 212
 213     $dholder = undef
 214         unless $name;
 215     # if $name is a scalarref, it's actually our $dholder
 216     if (ref($name) eq 'SCALAR') {
 217         $dholder = $name;
 218     } else {
 219         $dholder = ($item ? \$item->{$name} : \$feed->{$name})
 220             if $name;
 221     }
 222     return;
 223 }
 224
 225 sub swallow {
 226     return startaccum('');
 227 }
 228
 229 sub StartDocument {
 230     ($feed, $item, $data) = (undef, undef, undef);
 231     @items = ();
 232     undef $error;
 233 }
 234
 235 sub StartTag {
 236     # $_ carries the unparsed tag
 237     my ($p, $tag) = @_;
 238     my $holder;
 239
 240     # do nothing if there has been an error
 241     return if $error;
 242
 243     # are we just accumulating data?
 244     if (defined $data) {
 245         $data .= $_;
 246         $ddepth++;
 247         return;
 248     }
 249
 250     # where we'll usually store info
 251     $holder = $item ? $item : $feed;
 252
 253     TAGS: {
 254         if ($tag eq 'feed') {
 255             return err("Nested <feed> tags")
 256                 if $feed;
 257             $feed = {};
 258             $feed->{'standard'} = 'atom';
 259             $feed->{'version'} = $_{'version'};
 260             return err("Incompatible version specified in <feed>")
 261                 if $feed->{'version'} && $feed->{'version'} < 0.3;
 262             last TAGS;
 263         }
 264         if ($tag eq 'entry') {
 265             return err("Nested <entry> tags")
 266                 if $item;
 267             $item = {};
 268             last TAGS;
 269         }
 270
 271         # at this point, we must have a top-level <feed> or <entry>
 272         # to write into
 273         return err("Tag found under neither <feed> nor <entry>")
 274             unless $holder;
 275
 276         if ($tag eq 'link') {
 277             # ignore links with rel= anything but alternate
 278             # and treat links as rel=alternate if not explicit
 279             unless ($_{'rel'} eq 'alternate' || !$_{'rel'}) {
 280                 swallow();
 281                 last TAGS;
 282             }
 283             $holder->{'link'} = $_{'href'};
 284             return err("No href attribute in <link>")
 285                 unless $holder->{'link'};
 286             last TAGS;
 287         }
 288
 289         if ($tag eq 'content') {
 290             return err("<content> outside <entry>")
 291                 unless $item;
 292             # if type is multipart/alternative, we continue recursing
 293             # otherwise we accumulate
 294             my $type = $_{'type'} || "text/plain";
 295             unless ($type eq "multipart/alternative") {
 296                 push @{$item->{'contents'}}, [$type, ""];
 297                 startaccum(\$item->{'contents'}->[-1]->[1]);
 298                 last TAGS;
 299             }
 300             # it's multipart/alternative, so recurse, but don't swallow
 301             last TAGS;
 302         }
 303
 304         # store tags which should require no further
 305         # processing as they are, and others under _atom_*, to be processed
 306         # in EndTag under </entry>
 307         if ($tag eq 'title') {
 308             if ($item) { # entry's subject
 309                 startaccum("subject");
 310             } else { # feed's title
 311                 startaccum($tag);
 312             }
 313             last TAGS;
 314         }
 315         if ($tag eq 'id') {
 316             unless ($item) {
 317                 swallow(); # we don't need feed-level <id>
 318             } else {
 319                 startaccum($tag);
 320             }
 321             last TAGS;
 322         }
 323
 324         if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
 325             startaccum("description");
 326             last TAGS;
 327         }
 328
 329         # accumulate and store
 330         startaccum("_atom_" . $tag);
 331         last TAGS;
 332     }
 333
 334     return;
 335 }
 336
 337 sub EndTag {
 338     # $_ carries the unparsed tag
 339     my ($p, $tag) = @_;
 340
 341     # do nothing if there has been an error
 342     return if $error;
 343
 344     # are we accumulating data?
 345     if (defined $data) {
 346         $ddepth--;
 347         if ($ddepth == 0) { # stop accumulating
 348             $$dholder = $data
 349                 if $dholder;
 350             undef $data;
 351             return;
 352         }
 353         $data .= $_;
 354         return;
 355     }
 356
 357     TAGS: {
 358         if ($tag eq 'entry') {
 359             # finalize item...
 360             # generate suitable text from $item->{'contents'}
 361             my $content;
 362             $item->{'contents'} ||= [];
 363             unless (scalar(@{$item->{'contents'}}) >= 1) {
 364                 # this item had no <content>
 365                 # maybe it has <summary>? if so, use <summary>
 366                 # TODO: type= or encoding issues here? perhaps unite
 367                 # handling of <summary> with that of <content>?
 368                 if ($item->{'_atom_summary'}) {
 369                     $item->{'text'} = $item->{'_atom_summary'};
 370                     delete $item->{'contents'};
 371                 } else {
 372                     # nothing to display, so ignore this entry
 373                     undef $item;
 374                     last TAGS;
 375                 }
 376             }
 377
 378             unless ($item->{'text'}) { # unless we already have text
 379                 if (scalar(@{$item->{'contents'}}) == 1) {
 380                     # only one <content> section
 381                     $content = $item->{'contents'}->[0];
 382                 } else {
 383                     # several <content> section, must choose the best one
 384                     foreach (@{$item->{'contents'}}) {
 385                         if ($_->[0] eq "application/xhtml+xml") { # best match
 386                             $content = $_;
 387                             last; # don't bother to look at others
 388                         }
 389                         if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
 390                             # choose this unless we've already chosen some html
 391                             $content = $_
 392                                 unless $content->[0] =~ m!html!;
 393                             next;
 394                         }
 395                         if ($_->[0] eq "text/plain") {
 396                             # choose this unless we have some html already
 397                             $content = $_
 398                                 unless $content->[0] =~ m!html!;
 399                             next;
 400                         }
 401                     }
 402                     # if we didn't choose anything, pick the first one
 403                     $content =  $item->{'contents'}->[0]
 404                         unless $content;
 405                 }
 406
 407                 # we ignore the 'mode' attribute of <content>. If it's "xml", we've
 408                 # stringified it by accumulation; if it's "escaped", our parser
 409                 # unescaped it
 410                 # TODO: handle mode=base64?
 411
 412                 $item->{'text'} = $content->[1];
 413                 delete $item->{'contents'};
 414             }
 415
 416             # generate time
 417             my $w3time = $item->{'_atom_created'} || $item->{'_atom_published'} ||
 418                          $item->{'_atom_modified'} || $item->{'_atom_updated'};
 419
 420             my $time;
 421             if ($w3time) {
 422                 # see http://www.w3.org/TR/NOTE-datetime for format
 423                 # we insist on having granularity up to a minute,
 424                 # and ignore finer data as well as the timezone, for now
 425                 if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
 426                     $time = "$1-$2-$3 $4:$5";
 427                 }
 428             }
 429             if ($time) {
 430                 $item->{'time'} = $time;
 431             }
 432
 433             # get rid of all other tags we don't need anymore
 434             foreach (keys  %$item) {
 435                 delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
 436             }
 437
 438             push @items, $item;
 439             undef $item;
 440             last TAGS;
 441         }
 442         if ($tag eq 'feed') {
 443             # finalize feed
 444             # get rid of all other tags we don't need anymore
 445             foreach (keys  %$feed) {
 446                 delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
 447             }
 448
 449             # link the feed with its itms
 450             $feed->{'items'} = \@items
 451                 if $feed;
 452             last TAGS;
 453         }
 454     }
 455     return;
 456 }
 457
 458 sub Text {
 459     my $p = shift;
 460
 461     # do nothing if there has been an error
 462     return if $error;
 463
 464     $data .= $_ if defined $data;
 465 }
 466
 467 sub PI {
 468     # ignore processing instructions
 469     return;
 470 }
 471
 472 sub EndDocument {
 473     # if we parsed a feed, link items to it
 474     $feed->{'items'} = \@items
 475         if $feed;
 476     return;
 477 }
 478
 479
 480 1;