12 # name: LJ::ParseFeed::parse_feed
13 # des: Parses an RSS/Atom feed.
15 # args: content, type?
16 # des-content: Feed content.
17 # des-type: Optional; can be "atom" or "rss".
18 # If type isn't supplied, the function will try to guess it
20 # info: items - An arrayref of item hashes, in the same order they were
22 # Each item contains: link - URL of the item; id - unique identifier (optional);
23 # text - text of the item; subject - subject;
24 # time - in format 'yyyy-mm-dd hh:mm' (optional).
25 # returns: Three arguments: $feed, $error, arrayref of items.
26 # $feed, which is a hash with the following keys:
27 # type - 'atom' or 'rss'; version - version of the feed in its
28 # standard; link - URL of the feed; title - title of the feed;
29 # description - description of the feed.
30 # The second argument returned is $error, which, if defined, is a
31 # human-readable error string. The third argument is an
32 # arrayref of items, same as $feed->{'items'}.
36 my ($content, $type) = @_;
37 my ($feed, $items, $error);
41 # Atom feeds are rare for now, so prefer to err in favor of RSS
42 # simple heuristic: Atom feeds will have '<feed' somewhere
43 # TODO: maybe store the feed's type on creation in a userprop and not guess here
45 if ($type eq 'atom' || $content =~ m!\<feed!) {
46 # try treating it as an atom feed
47 $parser = new XML
::Parser
(Style
=>'Stream', Pkg
=>'LJ::ParseFeed::Atom');
48 return ("", "failed to create XML parser") unless $parser;
50 $parser->parse($content);
53 $error = "XML parser error: $@";
55 ($feed, $items, $error) = LJ
::ParseFeed
::Atom
::results
();
58 if ($feed || $type eq 'atom') {
59 # there was a top-level <feed> there, or we're forced to treat
60 # as an Atom feed, so even if $error is set,
62 $feed->{'type'} = 'atom';
63 return ($feed, $error, $items);
67 # try parsing it as RSS
68 $parser = new XML
::RSS
;
69 return ("", "failed to create RSS parser") unless $parser;
71 $parser->parse($content);
74 $error = "RSS parser error: $@";
79 $feed->{'type'} = 'rss';
80 $feed->{'version'} = $parser->{'version'};
82 foreach (qw
(link title description
)) {
83 $feed->{$_} = $parser->{'channel'}->{$_}
84 if $parser->{'channel'}->{$_};
87 $feed->{'items'} = [];
89 foreach(@
{$parser->{'items'}}) {
91 $item->{'subject'} = $_->{'title'};
92 $item->{'text'} = $_->{'description'};
93 $item->{'link'} = $_->{'link'} if $_->{'link'};
94 $item->{'id'} = $_->{'guid'} if $_->{'guid'};
96 my $nsdc = 'http://purl.org/dc/elements/1.1/';
97 my $nsenc = 'http://purl.org/rss/1.0/modules/content/';
98 if ($_->{$nsenc} && ref($_->{$nsenc}) eq "HASH") {
99 # prefer content:encoded if present
100 $item->{'text'} = $_->{$nsenc}->{'encoded'}
101 if defined $_->{$nsenc}->{'encoded'};
104 if ($_->{'pubDate'}) {
105 my $time = time822_to_time
($_->{'pubDate'});
106 $item->{'time'} = $time if $time;
108 if ($_->{$nsdc} && ref($_->{$nsdc}) eq "HASH") {
109 if ($_->{$nsdc}->{date
}) {
110 my $time = w3cdtf_to_time
($_->{$nsdc}->{date
});
111 $item->{'time'} = $time if $time;
114 push @
{$feed->{'items'}}, $item;
117 return ($feed, undef, $feed->{'items'});
120 # convert rfc822-time in RSS's <pubDate> to our time
121 # see http://www.faqs.org/rfcs/rfc822.html
122 # RFC822 specifies 2 digits for year, and RSS2.0 refers to RFC822,
123 # but real RSS2.0 feeds apparently use 4 digits.
124 sub time822_to_time
{
126 # remove day name if present
127 $t822 =~ s/^\s*\w+\s*,//;
131 if ($t822 =~ m!(\d?\d)\s+(\w+)\s+(\d\d\d\d)\s+(\d?\d):(\d\d)!) {
132 my ($day, $mon, $year, $hour, $min) = ($1,$2,$3,$4,$5);
133 $day = "0" . $day if length($day) == 1;
134 $hour = "0" . $hour if length($hour) == 1;
135 $mon = {'Jan'=>'01', 'Feb'=>'02', 'Mar'=>'03', 'Apr'=>'04',
136 'May'=>'05', 'Jun'=>'06', 'Jul'=>'07', 'Aug'=>'08',
137 'Sep'=>'09', 'Oct'=>'10', 'Nov'=>'11', 'Dec'=>'12'}->{$mon};
138 return undef unless $mon;
139 return "$year-$mon-$day $hour:$min";
145 # convert W3C-DTF to our internal format
146 # see http://www.w3.org/TR/NOTE-datetime
147 # Based very loosely on code from DateTime::Format::W3CDTF,
148 # which isn't stable yet so we can't use it directly.
152 # TODO: Should somehow return the timezone offset
153 # so that it can stored... but we don't do timezones
154 # yet anyway. For now, just strip the timezone
155 # portion if it is present, along with the decimal
156 # fractions of a second.
158 $tw3 =~ s/(?:\.\d+)?(?:[+-]\d{1,2}:\d{1,2}|Z)$//;
159 $tw3 =~ s/^\s*//; $tw3 =~ s/\s*$//; # Eat any superflous whitespace
161 # We can only use complete times, so anything which
162 # doesn't feature the time part is considered invalid.
164 # This is working around clients that don't implement W3C-DTF
165 # correctly, and only send single digit values in the dates.
166 # 2004-4-8T16:9:4Z vs 2004-04-08T16:09:44Z
167 # If it's more messed up than that, reject it outright.
168 $tw3 =~ /^(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?$/
171 my %pd; # parsed date
172 $pd{Y
} = $1; $pd{M
} = $2; $pd{D
} = $3;
173 $pd{h
} = $4; $pd{m
} = $5; $pd{s
} = $6;
175 # force double digits
176 foreach (qw
/ M D h m s /) {
177 next unless defined $pd{$_};
178 $pd{$_} = sprintf "%02d", $pd{$_};
181 return $pd{s
} ?
"$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}:$pd{s}" :
182 "$pd{Y}-$pd{M}-$pd{D} $pd{h}:$pd{m}";
185 package LJ
::ParseFeed
::Atom
;
187 our ($feed, $item, $data);
188 our ($ddepth, $dholder); # for accumulating;
193 $error = shift unless $error;
197 return ($feed, \
@items, $error);
200 # $name under which we'll store accumulated data may be different
201 # from $tag which causes us to store it
202 # $name may be a scalarref pointing to where we should store
203 # swallowing is achieved by calling startaccum('');
208 return err
("Tag found under neither <feed> nor <entry>")
209 unless $feed || $item;
210 $data = ""; # defining $data triggers accumulation
215 # if $name is a scalarref, it's actually our $dholder
216 if (ref($name) eq 'SCALAR') {
219 $dholder = ($item ? \
$item->{$name} : \
$feed->{$name})
226 return startaccum
('');
230 ($feed, $item, $data) = (undef, undef, undef);
236 # $_ carries the unparsed tag
240 # do nothing if there has been an error
243 # are we just accumulating data?
250 # where we'll usually store info
251 $holder = $item ?
$item : $feed;
254 if ($tag eq 'feed') {
255 return err
("Nested <feed> tags")
258 $feed->{'standard'} = 'atom';
259 $feed->{'version'} = $_{'version'};
260 return err
("Incompatible version specified in <feed>")
261 if $feed->{'version'} && $feed->{'version'} < 0.3;
264 if ($tag eq 'entry') {
265 return err
("Nested <entry> tags")
271 # at this point, we must have a top-level <feed> or <entry>
273 return err
("Tag found under neither <feed> nor <entry>")
276 if ($tag eq 'link') {
277 # ignore links with rel= anything but alternate
278 # and treat links as rel=alternate if not explicit
279 unless ($_{'rel'} eq 'alternate' || !$_{'rel'}) {
283 $holder->{'link'} = $_{'href'};
284 return err
("No href attribute in <link>")
285 unless $holder->{'link'};
289 if ($tag eq 'content') {
290 return err
("<content> outside <entry>")
292 # if type is multipart/alternative, we continue recursing
293 # otherwise we accumulate
294 my $type = $_{'type'} || "text/plain";
295 unless ($type eq "multipart/alternative") {
296 push @
{$item->{'contents'}}, [$type, ""];
297 startaccum
(\
$item->{'contents'}->[-1]->[1]);
300 # it's multipart/alternative, so recurse, but don't swallow
304 # store tags which should require no further
305 # processing as they are, and others under _atom_*, to be processed
306 # in EndTag under </entry>
307 if ($tag eq 'title') {
308 if ($item) { # entry's subject
309 startaccum
("subject");
310 } else { # feed's title
317 swallow
(); # we don't need feed-level <id>
324 if ($tag eq 'tagline' && !$item) { # feed's tagline, our "description"
325 startaccum
("description");
329 # accumulate and store
330 startaccum
("_atom_" . $tag);
338 # $_ carries the unparsed tag
341 # do nothing if there has been an error
344 # are we accumulating data?
347 if ($ddepth == 0) { # stop accumulating
358 if ($tag eq 'entry') {
360 # generate suitable text from $item->{'contents'}
362 $item->{'contents'} ||= [];
363 unless (scalar(@
{$item->{'contents'}}) >= 1) {
364 # this item had no <content>
365 # maybe it has <summary>? if so, use <summary>
366 # TODO: type= or encoding issues here? perhaps unite
367 # handling of <summary> with that of <content>?
368 if ($item->{'_atom_summary'}) {
369 $item->{'text'} = $item->{'_atom_summary'};
370 delete $item->{'contents'};
372 # nothing to display, so ignore this entry
378 unless ($item->{'text'}) { # unless we already have text
379 if (scalar(@
{$item->{'contents'}}) == 1) {
380 # only one <content> section
381 $content = $item->{'contents'}->[0];
383 # several <content> section, must choose the best one
384 foreach (@
{$item->{'contents'}}) {
385 if ($_->[0] eq "application/xhtml+xml") { # best match
387 last; # don't bother to look at others
389 if ($_->[0] =~ m!html!) { # some kind of html/xhtml/html+xml, etc.
390 # choose this unless we've already chosen some html
392 unless $content->[0] =~ m!html!;
395 if ($_->[0] eq "text/plain") {
396 # choose this unless we have some html already
398 unless $content->[0] =~ m!html!;
402 # if we didn't choose anything, pick the first one
403 $content = $item->{'contents'}->[0]
407 # we ignore the 'mode' attribute of <content>. If it's "xml", we've
408 # stringified it by accumulation; if it's "escaped", our parser
410 # TODO: handle mode=base64?
412 $item->{'text'} = $content->[1];
413 delete $item->{'contents'};
417 my $w3time = $item->{'_atom_created'} || $item->{'_atom_published'} ||
418 $item->{'_atom_modified'} || $item->{'_atom_updated'};
422 # see http://www.w3.org/TR/NOTE-datetime for format
423 # we insist on having granularity up to a minute,
424 # and ignore finer data as well as the timezone, for now
425 if ($w3time =~ m!^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d)!) {
426 $time = "$1-$2-$3 $4:$5";
430 $item->{'time'} = $time;
433 # get rid of all other tags we don't need anymore
434 foreach (keys %$item) {
435 delete $item->{$_} if substr($_, 0, 6) eq '_atom_';
442 if ($tag eq 'feed') {
444 # get rid of all other tags we don't need anymore
445 foreach (keys %$feed) {
446 delete $feed->{$_} if substr($_, 0, 6) eq '_atom_';
449 # link the feed with its itms
450 $feed->{'items'} = \
@items
461 # do nothing if there has been an error
464 $data .= $_ if defined $data;
468 # ignore processing instructions
473 # if we parsed a feed, link items to it
474 $feed->{'items'} = \
@items