LJSUP-17669: Login.bml form refactoring
[livejournal.git] / cgi-bin / parsefeed.pl
blobdc354054469b1e3cece0eedcdadd5bdc3ffd9567
1 #!/usr/bin/perl
3 use strict;
5 package LJ::ParseFeed;
7 use XML::RSS;
8 use XML::Parser;
11 # <LJFUNC>
12 # name: LJ::ParseFeed::parse_feed
13 # des: Parses an RSS/Atom feed.
14 # class:
15 # args: content, type?
16 # des-content: Feed content.
17 # des-type: Optional; can be "atom" or "rss".
18 # If type isn't supplied, the function will try to guess it
19 # based on contents.
20 # info: items - An arrayref of item hashes, in the same order they were
21 # in the feed.
22 # Each item contains: link - URL of the item; id - unique identifier (optional);
23 # text - text of the item; subject - subject;
24 # time - in format 'yyyy-mm-dd hh:mm' (optional).
25 # returns: Three arguments: $feed, $error, arrayref of items.
26 # $feed, which is a hash with the following keys:
27 # type - 'atom' or 'rss'; version - version of the feed in its
28 # standard; link - URL of the feed; title - title of the feed;
29 # description - description of the feed.
30 # The second argument returned is $error, which, if defined, is a
31 # human-readable error string. The third argument is an
32 # arrayref of items, same as $feed->{'items'}.
33 # </LJFUNC>
34 sub parse_feed
36 my ($content, $type) = @_;
37 my ($feed, $items, $error);
38 my $parser;
40 # is it RSS or Atom?
41 # Atom feeds are rare for now, so prefer to err in favor of RSS
42 # simple heuristic: Atom feeds will have '<feed' somewhere
43 # TODO: maybe store the feed's type on creation in a userprop and not guess here
45 if ($type eq 'atom' || $content =~ m!\<feed!) {
46 # try treating it as an atom feed
47 $parser = new XML::Parser(Style=>'Stream', Pkg=>'LJ::ParseFeed::Atom');
48 return ("", "failed to create XML parser") unless $parser;
49 eval {
50 $parser->parse($content);
52 if ($@) {
53 $error = "XML parser error: $@";
54 } else {
55 ($feed, $items, $error) = LJ::ParseFeed::Atom::results();
58 if ($feed || $type eq 'atom') {
59 # there was a top-level <feed> there, or we're forced to treat
60 # as an Atom feed, so even if $error is set,
61 # don't try RSS
62 $feed->{'type'} = 'atom';
63 return ($feed, $error, $items);
67 # try parsing it as RSS
68 $parser = new XML::RSS;
69 return ("", "failed to create RSS parser") unless $parser;
70 eval {
71 $parser->parse($content);
73 if ($@) {
74 $error = "RSS parser error: $@";
75 return ("", $error);
78 $feed = {};
79 $feed->{'type'} = 'rss';
80 $feed->{'version'} = $parser->{'version'};
82 foreach (qw (link title description)) {
83 $feed->{$_} = $parser->{'channel'}->{$_}
84 if $parser->{'channel'}->{$_};
87 $feed->{'items'} = [];
89 foreach(@{$parser->{'items'}}) {
90 my $item = {};
91 $item->{'subject'} = $_->{'title'};
92 $item->{'text'} = $_->{'description'};
93 $item->{'link'} = $_->{'link'} if $_->{'link'};
94 $item->{'id'} = $_->{'guid'} if $_->{'guid'};
96 my $nsdc = 'http://purl.org/dc/elements/1.1/';
97 my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
98 if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
99 # prefer content:encoded if present
100 $item->{'text'} = $_->{$nsenc}->{'encoded'}
101 if defined $_->{$nsenc}->{'encoded'};
104 if ($_->{'pubDate'}) {
105 my $time = time822_to_time($_->{'pubDate'});
106 $item->{'time'} = $time if $time;
108 if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
109 if ($_->{$nsdc}->{date}) {
110 my $time = w3cdtf_to_time($_->{$nsdc}->{date});
111 $item->{'time'} = $time if $time;
114 push @{$feed->{'items'}}, $item;
117 return ($feed, undef, $feed->{'items'});
120 # convert rfc822-time in RSS's <pubDate> to our time
121 # see http://www.faqs.org/rfcs/rfc822.html
122 # RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
123 # but real RSS2.0 feeds apparently use 4 digits.
124 sub time822_to_time {
125 my $t822 = shift;
126 # remove day name if present
127 $t822 =~ s/^\s*\w+\s*,//;
128 # remove whitespace
129 $t822 =~ s/^\s*//;
130 # break it up
131 if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
132 my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
133 $day = "0" . $day if length($day) == 1;
134 $hour = "0" . $hour if length($hour) == 1;
135 $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
136 'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
137 'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
138 return undef unless $mon;
139 return "$year-$mon-$day $hour:$min";
140 } else {
141 return undef;
145 # convert W3C-DTF to our internal format
146 # see http://www.w3.org/TR/NOTE-datetime
147 # Based very loosely on code from DateTime::Format::W3CDTF,
148 # which isn't stable yet so we can't use it directly.
149 sub w3cdtf_to_time {
150 my $tw3 = shift;
152 # TODO: Should somehow return the timezone offset
153 # so that it can stored... but we don't do timezones
154 # yet anyway. For now, just strip the timezone
155 # portion if it is present, along with the decimal
156 # fractions of a second.
158 $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
159 $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace
161 # We can only use complete times, so anything which
162 # doesn't feature the time part is considered invalid.
164 # This is working around clients that don't implement W3C-DTF
165 # correctly, and only send single digit values in the dates.
166 # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
167 # If it's more messed up than that, reject it outright.
168 $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
169 or return undef;
171 my %pd; # parsed date
172 $pd{Y} = $1; $pd{M} = $2; $pd{D} = $3;
173 $pd{h} = $4; $pd{m} = $5; $pd{s} = $6;
175 # force double digits
176 foreach (qw/ M D h m s /) {
177 next unless defined $pd{$_};
178 $pd{$_} = sprintf "%02d", $pd{$_};
181 return $pd{s} ? "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
182 "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
185 package LJ::ParseFeed::Atom;
187 our ($feed, $item, $data);
188 our ($ddepth, $dholder); # for accumulating;
189 our @items;
190 our $error;
192 sub err {
193 $error = shift unless $error;
196 sub results {
197 return ($feed, \@items, $error);
200 # $name under which we'll store accumulated data may be different
201 # from $tag which causes us to store it
202 # $name may be a scalarref pointing to where we should store
203 # swallowing is achieved by calling startaccum('');
205 sub startaccum {
206 my $name = shift;
208 return err("Tag found under neither <feed> nor <entry>")
209 unless $feed || $item;
210 $data = ""; # defining $data triggers accumulation
211 $ddepth = 1;
213 $dholder = undef
214 unless $name;
215 # if $name is a scalarref, it's actually our $dholder
216 if (ref($name) eq 'SCALAR') {
217 $dholder = $name;
218 } else {
219 $dholder = ($item ? \$item->{$name} : \$feed->{$name})
220 if $name;
222 return;
225 sub swallow {
226 return startaccum('');
229 sub StartDocument {
230 ($feed, $item, $data) = (undef, undef, undef);
231 @items = ();
232 undef $error;
235 sub StartTag {
236 # $_ carries the unparsed tag
237 my ($p, $tag) = @_;
238 my $holder;
240 # do nothing if there has been an error
241 return if $error;
243 # are we just accumulating data?
244 if (defined $data) {
245 $data .= $_;
246 $ddepth++;
247 return;
250 # where we'll usually store info
251 $holder = $item ? $item : $feed;
253 TAGS: {
254 if ($tag eq 'feed') {
255 return err("Nested <feed> tags")
256 if $feed;
257 $feed = {};
258 $feed->{'standard'} = 'atom';
259 $feed->{'version'} = $_{'version'};
260 return err("Incompatible version specified in <feed>")
261 if $feed->{'version'} && $feed->{'version'} < 0.3;
262 last TAGS;
264 if ($tag eq 'entry') {
265 return err("Nested <entry> tags")
266 if $item;
267 $item = {};
268 last TAGS;
271 # at this point, we must have a top-level <feed> or <entry>
272 # to write into
273 return err("Tag found under neither <feed> nor <entry>")
274 unless $holder;
276 if ($tag eq 'link') {
277 # ignore links with rel= anything but alternate
278 # and treat links as rel=alternate if not explicit
279 unless ($_{'rel'} eq 'alternate' || !$_{'rel'}) {
280 swallow();
281 last TAGS;
283 $holder->{'link'} = $_{'href'};
284 return err("No href attribute in <link>")
285 unless $holder->{'link'};
286 last TAGS;
289 if ($tag eq 'content') {
290 return err("<content> outside <entry>")
291 unless $item;
292 # if type is multipart/alternative, we continue recursing
293 # otherwise we accumulate
294 my $type = $_{'type'} || "text/plain";
295 unless ($type eq "multipart/alternative") {
296 push @{$item->{'contents'}}, [$type, ""];
297 startaccum(\$item->{'contents'}->[-1]->[1]);
298 last TAGS;
300 # it's multipart/alternative, so recurse, but don't swallow
301 last TAGS;
304 # store tags which should require no further
305 # processing as they are, and others under _atom_*, to be processed
306 # in EndTag under </entry>
307 if ($tag eq 'title') {
308 if ($item) { # entry's subject
309 startaccum("subject");
310 } else { # feed's title
311 startaccum($tag);
313 last TAGS;
315 if ($tag eq 'id') {
316 unless ($item) {
317 swallow(); # we don't need feed-level <id>
318 } else {
319 startaccum($tag);
321 last TAGS;
324 if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
325 startaccum("description");
326 last TAGS;
329 # accumulate and store
330 startaccum("_atom_" . $tag);
331 last TAGS;
334 return;
337 sub EndTag {
338 # $_ carries the unparsed tag
339 my ($p, $tag) = @_;
341 # do nothing if there has been an error
342 return if $error;
344 # are we accumulating data?
345 if (defined $data) {
346 $ddepth--;
347 if ($ddepth == 0) { # stop accumulating
348 $$dholder = $data
349 if $dholder;
350 undef $data;
351 return;
353 $data .= $_;
354 return;
357 TAGS: {
358 if ($tag eq 'entry') {
359 # finalize item...
360 # generate suitable text from $item->{'contents'}
361 my $content;
362 $item->{'contents'} ||= [];
363 unless (scalar(@{$item->{'contents'}}) >= 1) {
364 # this item had no <content>
365 # maybe it has <summary>? if so, use <summary>
366 # TODO: type= or encoding issues here? perhaps unite
367 # handling of <summary> with that of <content>?
368 if ($item->{'_atom_summary'}) {
369 $item->{'text'} = $item->{'_atom_summary'};
370 delete $item->{'contents'};
371 } else {
372 # nothing to display, so ignore this entry
373 undef $item;
374 last TAGS;
378 unless ($item->{'text'}) { # unless we already have text
379 if (scalar(@{$item->{'contents'}}) == 1) {
380 # only one <content> section
381 $content = $item->{'contents'}->[0];
382 } else {
383 # several <content> section, must choose the best one
384 foreach (@{$item->{'contents'}}) {
385 if ($_->[0] eq "application/xhtml+xml") { # best match
386 $content = $_;
387 last; # don't bother to look at others
389 if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
390 # choose this unless we've already chosen some html
391 $content = $_
392 unless $content->[0] =~ m!html!;
393 next;
395 if ($_->[0] eq "text/plain") {
396 # choose this unless we have some html already
397 $content = $_
398 unless $content->[0] =~ m!html!;
399 next;
402 # if we didn't choose anything, pick the first one
403 $content = $item->{'contents'}->[0]
404 unless $content;
407 # we ignore the 'mode' attribute of <content>. If it's "xml", we've
408 # stringified it by accumulation; if it's "escaped", our parser
409 # unescaped it
410 # TODO: handle mode=base64?
412 $item->{'text'} = $content->[1];
413 delete $item->{'contents'};
416 # generate time
417 my $w3time = $item->{'_atom_created'} || $item->{'_atom_published'} ||
418 $item->{'_atom_modified'} || $item->{'_atom_updated'};
420 my $time;
421 if ($w3time) {
422 # see http://www.w3.org/TR/NOTE-datetime for format
423 # we insist on having granularity up to a minute,
424 # and ignore finer data as well as the timezone, for now
425 if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
426 $time = "$1-$2-$3 $4:$5";
429 if ($time) {
430 $item->{'time'} = $time;
433 # get rid of all other tags we don't need anymore
434 foreach (keys %$item) {
435 delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
438 push @items, $item;
439 undef $item;
440 last TAGS;
442 if ($tag eq 'feed') {
443 # finalize feed
444 # get rid of all other tags we don't need anymore
445 foreach (keys %$feed) {
446 delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
449 # link the feed with its itms
450 $feed->{'items'} = \@items
451 if $feed;
452 last TAGS;
455 return;
458 sub Text {
459 my $p = shift;
461 # do nothing if there has been an error
462 return if $error;
464 $data .= $_ if defined $data;
467 sub PI {
468 # ignore processing instructions
469 return;
472 sub EndDocument {
473 # if we parsed a feed, link items to it
474 $feed->{'items'} = \@items
475 if $feed;
476 return;