# Feed aggregation plugin.
package IkiWiki::Plugin::aggregate;
use open qw{:utf8 :std};
hook(type => "getopt", id => "aggregate", call => \&getopt);
hook(type => "getsetup", id => "aggregate", call => \&getsetup);
hook(type => "checkconfig", id => "aggregate", call => \&checkconfig,
	last => 1);
hook(type => "needsbuild", id => "aggregate", call => \&needsbuild);
hook(type => "preprocess", id => "aggregate", call => \&preprocess);
hook(type => "delete", id => "aggregate", call => \&delete);
hook(type => "savestate", id => "aggregate", call => \&savestate);
hook(type => "htmlize", id => "_aggregated", call => \&htmlize);
if (exists $config{aggregate_webtrigger} && $config{aggregate_webtrigger}) {
	hook(type => "cgi", id => "aggregate", call => \&cgi);
}
eval q{use Getopt::Long};
Getopt::Long::Configure('pass_through');
36 "aggregate" => \
$config{aggregate
},
37 "aggregateinternal!" => \
$config{aggregateinternal
},
aggregateinternal => {
	description => "enable aggregation to internal pages?",
	safe => 0, # enabling needs manual transition
aggregate_webtrigger => {
	description => "allow aggregation to be triggered via the web?",
	example => { file => "$ENV{HOME}/.ikiwiki/cookies" },
	safe => 0, # hooks into perl module internals
	description => "cookie control",
if (! defined $config{aggregateinternal}) {
	$config{aggregateinternal}=1;
if (! defined $config{cookiejar}) {
	$config{cookiejar}={ file => "$ENV{HOME}/.ikiwiki/cookies" };
# This is done here rather than in a refresh hook because it
# needs to run before the wiki is locked.
if ($config{aggregate} && ! ($config{post_commit} &&
		IkiWiki::commit_hook_enabled())) {
if (defined $cgi->param('do') &&
		$cgi->param("do") eq "aggregate_webtrigger") {
	print "Content-Type: text/plain\n\n";
print gettext("Aggregation triggered via web.")."\n\n";
if (launchaggregation()) {
	require IkiWiki::Render;
	IkiWiki::saveindex();
print gettext("Nothing to do right now, all feeds are up-to-date!")."\n";
sub launchaggregation () {
	# See if any feeds need aggregation.
	my @feeds=needsaggregate();
	return unless @feeds;
if (! lockaggregate()) {
	debug("an aggregation process is already running");
# force a later rebuild of source pages
$IkiWiki::forcerebuild{$_->{sourcepage}}=1
	foreach @feeds;
# Fork a child process to handle the aggregation.
# The parent process will then handle building the
# result. This avoids messy code to clear state
# accumulated while aggregating.
defined(my $pid = fork) or error("Can't fork: $!");
IkiWiki::loadindex();
# Aggregation happens without the main wiki lock
# being held. This allows editing pages etc while
# aggregation is running.
# Merge changes, since aggregation state may have
# changed on disk while the aggregation was happening.
error "aggregation failed with code $?";
# Pages with extension _aggregated have plain html markup, pass through.
return $params{content};
# Used by ikiwiki-transition aggregateinternal.
sub migrate_to_internal {
	if (! lockaggregate()) {
		error("an aggregation process is currently running");
foreach my $data (values %guids) {
	next unless $data->{page};
	next if $data->{expired};
	$config{aggregateinternal} = 0;
my $oldname = "$config{srcdir}/".htmlfn($data->{page});
$oldname = $IkiWiki::Plugin::transient::transientdir."/".htmlfn($data->{page});
my $oldoutput = $config{destdir}."/".IkiWiki::htmlpage($data->{page});
$config{aggregateinternal} = 1;
my $newname = $IkiWiki::Plugin::transient::transientdir."/".htmlfn($data->{page});
debug "moving $oldname -> $newname";
error("$newname already exists");
debug("already renamed to $newname?");
elsif (-e $oldname) {
	rename($oldname, $newname) || error("$!");
debug("$oldname not found");
require IkiWiki::Render;
debug("removing output file $oldoutput");
IkiWiki::prune($oldoutput);
my $needsbuild=shift;
foreach my $feed (values %feeds) {
	if (exists $pagesources{$feed->{sourcepage}} &&
	    grep { $_ eq $pagesources{$feed->{sourcepage}} } @$needsbuild) {
		# Mark all feeds originating on this page as
		# not yet seen; preprocess will unmark those that
		# still exist.
		markunseen($feed->{sourcepage});
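# A source page supplies these parameters with an aggregate directive
# along the lines of (hypothetical feed):
# [[!aggregate name="example blog" url="http://example.com/" updateinterval="60" expireage="30"]]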
foreach my $required (qw{name url}) {
	if (! exists $params{$required}) {
		error sprintf(gettext("missing %s parameter"), $required);
my $name=$params{name};
if (exists $feeds{$name}) {
$feed->{sourcepage}=$params{page};
$feed->{url}=$params{url};
my $dir=exists $params{dir} ? $params{dir} : $params{page}."/".titlepage($params{name});
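# The regexp match both sanitizes the directory name and untaints it.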
($dir)=$dir=~/$config{wiki_file_regexp}/;
$feed->{feedurl}=defined $params{feedurl} ? $params{feedurl} : "";
$feed->{updateinterval}=defined $params{updateinterval} ? $params{updateinterval} * 60 : 15 * 60;
$feed->{expireage}=defined $params{expireage} ? $params{expireage} : 0;
$feed->{expirecount}=defined $params{expirecount} ? $params{expirecount} : 0;
if (exists $params{template}) {
	$params{template}=~s/[^-_a-zA-Z0-9]+//g;
}
else {
	$params{template} = "aggregatepost";
}
$feed->{template}=$params{template} . ".tmpl";
delete $feed->{unseen};
$feed->{lastupdate}=0 unless defined $feed->{lastupdate};
$feed->{lasttry}=$feed->{lastupdate} unless defined $feed->{lasttry};
$feed->{numposts}=0 unless defined $feed->{numposts};
$feed->{newposts}=0 unless defined $feed->{newposts};
$feed->{message}=gettext("new feed") unless defined $feed->{message};
$feed->{error}=0 unless defined $feed->{error};
push @{$feed->{tags}}, $value;
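# The directive expands to a short status line: a link to the feed's
# site, its latest message, and post counts.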
283 return "<a href=\"".$feed->{url
}."\">".$feed->{name
}."</a>: ".
284 ($feed->{error
} ?
"<em>" : "").$feed->{message
}.
285 ($feed->{error
} ?
"</em>" : "").
286 " (".$feed->{numposts
}." ".gettext
("posts").
287 ($feed->{newposts
} ?
"; ".$feed->{newposts
}.
288 " ".gettext
("new") : "").
# Remove feed data for removed pages.
foreach my $file (@files) {
	my $page=pagename($file);
foreach my $id (keys %feeds) {
	if ($feeds{$id}->{sourcepage} eq $page) {
		$feeds{$id}->{unseen}=1;
return if $state_loaded;
if (-e "$config{wikistatedir}/aggregate") {
	open(IN, "<", "$config{wikistatedir}/aggregate") ||
		die "$config{wikistatedir}/aggregate: $!";
$_=IkiWiki::possibly_foolish_untaint($_);
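# Each line of the state file is a set of space-separated field=value
# pairs; values that may contain whitespace are entity-encoded (see
# savestate). A feed line might look like (hypothetical values):
# name=somefeed url=http://example.com/ lastupdate=1234567890 numposts=5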
foreach my $i (split(/ /, $_)) {
	my ($field, $val)=split(/=/, $i, 2);
	if ($field eq "name" || $field eq "feed" ||
	    $field eq "guid" || $field eq "message") {
		$data->{$field}=decode_entities($val, " \t\n");
330 elsif ($field eq "tag") {
331 push @
{$data->{tags
}}, $val;
334 $data->{$field}=$val;
if (exists $data->{name}) {
	$feeds{$data->{name}}=$data;
}
elsif (exists $data->{guid}) {
	$guids{$data->{guid}}=$data;
}
return unless $state_loaded;
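# State is written to a temporary file that is renamed into place, so
# the on-disk state is replaced atomically.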
my $newfile="$config{wikistatedir}/aggregate.new";
my $cleanup = sub { unlink($newfile) };
open (OUT, ">", $newfile) || error("open $newfile: $!", $cleanup);
foreach my $data (values %feeds, values %guids) {
	foreach my $field (keys %$data) {
		if ($field eq "name" || $field eq "feed" ||
		    $field eq "guid" || $field eq "message") {
			push @line, "$field=".encode_entities($data->{$field}, " \t\n");
363 elsif ($field eq "tags") {
364 push @line, "tag=$_" foreach @
{$data->{tags
}};
367 push @line, "$field=".$data->{$field}
368 if defined $data->{$field};
print OUT join(" ", @line)."\n" || error("write $newfile: $!", $cleanup);
close OUT || error("save $newfile: $!", $cleanup);
rename($newfile, "$config{wikistatedir}/aggregate") ||
	error("rename $newfile: $!", $cleanup);
foreach my $feed (keys %feeds) {
	my $t=$feeds{$feed}->{lastupdate}+$feeds{$feed}->{updateinterval};
	if (! defined $timestamp || $timestamp > $t) {
$newfile=~s/\.new$/time/;
open (OUT, ">", $newfile) || error("open $newfile: $!", $cleanup);
if (defined $timestamp) {
	print OUT $timestamp."\n";
}
close OUT || error("save $newfile: $!", $cleanup);
sub garbage_collect () {
	foreach my $name (keys %feeds) {
		# remove any feeds that were not seen while building the pages
		# that used to contain them
		if ($feeds{$name}->{unseen}) {
			delete $feeds{$name};
foreach my $guid (values %guids) {
	# any guid whose feed is gone should be removed
	if (! exists $feeds{$guid->{feed}}) {
		if (exists $guid->{page}) {
			unlink $IkiWiki::Plugin::transient::transientdir."/".htmlfn($guid->{page})
				|| unlink "$config{srcdir}/".htmlfn($guid->{page});
delete $guids{$guid->{guid}};
# handle expired guids
elsif ($guid->{expired} && exists $guid->{page}) {
	unlink "$config{srcdir}/".htmlfn($guid->{page});
	unlink $IkiWiki::Plugin::transient::transientdir."/".htmlfn($guid->{page});
	delete $guid->{page};
# Load the current state in from disk, and merge into it
# values from the state in memory that might have changed
# during aggregation.
# All that can change in feed state during aggregation is a few fields.
foreach my $name (keys %myfeeds) {
	if (exists $feeds{$name}) {
		foreach my $field (qw{message lastupdate lasttry
		                      numposts newposts error}) {
			$feeds{$name}->{$field}=$myfeeds{$name}->{$field};
# New guids can be created during aggregation.
# Guids have a few fields that may be updated during aggregation.
# It's also possible that guids were removed from the on-disk state
# while the aggregation was in process. That would only happen if
# their feed was also removed, so any removed guids added back here
# will be garbage collected later.
foreach my $guid (keys %myguids) {
	if (! exists $guids{$guid}) {
		$guids{$guid}=$myguids{$guid};
foreach my $field (qw{md5}) {
	$guids{$guid}->{$field}=$myguids{$guid}->{$field};
foreach my $feed (values %feeds) {
	next unless $feed->{expireage} || $feed->{expirecount};
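# Consider this feed's items newest first, by recorded creation time,
# so count-based expiry keeps the most recent posts.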
foreach my $item (sort { ($IkiWiki::pagectime{$b->{page}} || 0) <=> ($IkiWiki::pagectime{$a->{page}} || 0) }
		grep { exists $_->{page} && $_->{feed} eq $feed->{name} }
if ($feed->{expireage}) {
	my $days_old = (time - ($IkiWiki::pagectime{$item->{page}} || 0)) / 60 / 60 / 24;
	if ($days_old > $feed->{expireage}) {
		debug(sprintf(gettext("expiring %s (%s days old)"),
			$item->{page}, int($days_old)));
elsif ($feed->{expirecount} &&
       $count >= $feed->{expirecount}) {
	debug(sprintf(gettext("expiring %s"), $item->{page}));
if (! $seen{$item->{page}}) {
	$seen{$item->{page}}=1;
sub needsaggregate () {
	return values %feeds if $config{rebuild};
	return grep { time - $_->{lastupdate} >= $_->{updateinterval} } values %feeds;
eval q{use XML::Feed};
eval q{use URI::Fetch};
foreach my $feed (@_) {
	$feed->{lasttry}=time;
	$feed->{message}=sprintf(gettext("last checked %s"),
		displaytime($feed->{lasttry}));
debug(sprintf(gettext("checking feed %s ..."), $feed->{name}));
if (! length $feed->{feedurl}) {
	my @urls=XML::Feed->find_feeds($feed->{url});
$feed->{message}=sprintf(gettext("could not find feed at %s"), $feed->{url});
debug($feed->{message});
$feed->{feedurl}=pop @urls;
my $res=URI::Fetch->fetch($feed->{feedurl},
	UserAgent => LWP::UserAgent->new(
		cookie_jar => $config{cookiejar},
$feed->{message}=URI::Fetch->errstr;
debug($feed->{message});
# lastupdate is only set if we were able to contact the server
$feed->{lastupdate}=$feed->{lasttry};
if ($res->status == URI::Fetch::URI_GONE()) {
	$feed->{message}=gettext("feed not found");
	debug($feed->{message});
my $content=$res->content;
my $f=eval{XML::Feed->parse(\$content)};
# One common cause of XML::Feed crashing is a feed
# that contains invalid UTF-8 sequences. Convert
# feed to ascii to try to work around.
$feed->{message}.=" ".sprintf(gettext("(invalid UTF-8 stripped from feed)"));
$content=Encode::decode_utf8($content, 0);
XML::Feed->parse(\$content)
# Another possibility is badly escaped entities.
$feed->{message}.=" ".sprintf(gettext("(feed entities escaped)"));
$content=~s/\&(?!amp)(\w+);/&amp;$1;/g;
$content=Encode::decode_utf8($content, 0);
XML::Feed->parse(\$content)
$feed->{message}=gettext("feed crashed XML::Feed!")." ($@)";
debug($feed->{message});
$feed->{message}=XML::Feed->errstr;
debug($feed->{message});
foreach my $entry ($f->entries) {
	# XML::Feed doesn't work around XML::Atom's bizarre
	# API, so we will. Real unicode strings? Yes please.
	# See [[bugs/Aggregated_Atom_feeds_are_double-encoded]]
	local $XML::Atom::ForceUnicode = 1;
	my $c=$entry->content;
	# atom feeds may have no content, only a summary
	if (! defined $c && ref $entry->summary) {
	copyright => $f->copyright,
	title => defined $entry->title ? decode_entities($entry->title) : "untitled",
	link => $entry->link,
	content => (defined $c && defined $c->body) ? $c->body : "",
	guid => defined $entry->id ? $entry->id : time."_".$feed->{name},
	ctime => $entry->issued ? ($entry->issued->epoch || time) : time,
	base => (defined $c && $c->can("base")) ? $c->base : undef,
my $feed=$params{feed};
if (exists $guids{$params{guid}}) {
	# updating an existing post
	$guid=$guids{$params{guid}};
	return if $guid->{expired};
$guid->{guid}=$params{guid};
$guids{$params{guid}}=$guid;
$mtime=$params{ctime};
# assign it an unused page
my $page=titlepage($params{title});
# escape slashes and periods in title so it doesn't specify
# directory name or trigger ".." disallowing code.
$page=~s!([/.])!"__".ord($1)."__"!eg;
$page=$feed->{dir}."/".$page;
($page)=$page=~/$config{wiki_file_regexp}/;
if (! defined $page || ! length $page) {
	$page=$feed->{dir}."/item";
while (exists $IkiWiki::pagecase{lc $page.$c} ||
       -e $IkiWiki::Plugin::transient::transientdir."/".htmlfn($page.$c) ||
       -e "$config{srcdir}/".htmlfn($page.$c)) {
# Make sure that the file name isn't too long.
# NB: This doesn't check for path length limits.
my $max=POSIX::pathconf($config{srcdir}, &POSIX::_PC_NAME_MAX);
if (defined $max && length(htmlfn($page)) >= $max) {
	$page=$feed->{dir}."/item";
while (exists $IkiWiki::pagecase{lc $page.$c} ||
       -e $IkiWiki::Plugin::transient::transientdir."/".htmlfn($page.$c) ||
       -e "$config{srcdir}/".htmlfn($page.$c)) {
debug(sprintf(gettext("creating new page %s"), $page));
$guid->{feed}=$feed->{name};
# To write or not to write? Need to avoid writing unchanged pages
# to avoid unnecessary rebuilding. The mtime from rss cannot be
# trusted; let's use a digest.
eval q{use Digest::MD5 'md5_hex'};
my $digest=md5_hex(Encode::encode_utf8($params{content}));
return unless ! exists $guid->{md5} || $guid->{md5} ne $digest || $config{rebuild};
$guid->{md5}=$digest;
$template=template($feed->{template}, blind_cache => 1);
print STDERR gettext("failed to process template:")." $@";
$template->param(title => $params{title})
	if defined $params{title} && length($params{title});
$template->param(content => wikiescape(htmlabs($params{content},
	defined $params{base} ? $params{base} : $feed->{feedurl})));
$template->param(name => $feed->{name});
$template->param(url => $feed->{url});
$template->param(copyright => $params{copyright})
	if defined $params{copyright} && length $params{copyright};
$template->param(permalink => IkiWiki::urlabs($params{link}, $feed->{feedurl}))
	if defined $params{link};
if (ref $feed->{tags}) {
	$template->param(tags => [map { tag => $_ }, @{$feed->{tags}}]);
writefile(htmlfn($guid->{page}),
	$IkiWiki::Plugin::transient::transientdir, $template->output);
if (defined $mtime && $mtime <= time) {
	# Set the mtime, this lets the build process get the right
	# creation time on record for the new page.
	utime $mtime, $mtime,
		$IkiWiki::Plugin::transient::transientdir."/".htmlfn($guid->{page});
	# Store it in pagectime for expiry code to use also.
	$IkiWiki::pagectime{$guid->{page}}=$mtime
		unless exists $IkiWiki::pagectime{$guid->{page}};
# Dummy value for expiry code.
$IkiWiki::pagectime{$guid->{page}}=time
	unless exists $IkiWiki::pagectime{$guid->{page}};
# escape accidental wikilinks and preprocessor stuff
return encode_entities(shift, '\[\]');
# Convert links in html from relative to absolute.
# Note that this is a heuristic, which is not specified by the rss
# spec and may not be right for all feeds. Also, see Debian
my $p = HTML::Parser->new(api_version => 3);
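# Everything is passed through unchanged except start tags; those get
# their link attributes rewritten to absolute urls below.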
$p->handler(default => sub { $ret.=join("", @_) }, "text");
$p->handler(start => sub {
	my ($tagname, $pos, $text) = @_;
	if (ref $HTML::Tagset::linkElements{$tagname}) {
# use attribute sets from right to left
# to avoid invalidating the offsets
# when replacing the values
my ($k_offset, $k_len, $v_offset, $v_len) =
	splice(@$pos, -4);
my $attrname = lc(substr($text, $k_offset, $k_len));
next unless grep { $_ eq $attrname } @{$HTML::Tagset::linkElements{$tagname}};
next unless $v_offset; # 0 v_offset means no value
my $v = substr($text, $v_offset, $v_len);
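# Strip surrounding quotes, if any, from the attribute value.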
$v =~ s/^([\'\"])(.*)\1$/$2/;
my $new_v=IkiWiki::urlabs($v, $urlbase);
$new_v =~ s/\"/&quot;/g; # since we quote with ""
substr($text, $v_offset, $v_len) = qq("$new_v");
748 }, "tagname, tokenpos, text");
return shift().".".($config{aggregateinternal} ? "_aggregated" : $config{htmlext});
sub lockaggregate () {
	# Take an exclusive lock to prevent multiple concurrent aggregators.
	# Returns true if the lock was acquired.
if (! -d $config{wikistatedir}) {
	mkdir($config{wikistatedir});
open($aggregatelock, '>', "$config{wikistatedir}/aggregatelock") ||
	error("cannot open $config{wikistatedir}/aggregatelock: $!");
if (! flock($aggregatelock, 2 | 4)) { # LOCK_EX | LOCK_NB
	close($aggregatelock) || error("failed closing aggregatelock: $!");
sub unlockaggregate () {
	return close($aggregatelock) if $aggregatelock;