// Pseudo-namespace keys for the priorities table: guessPriority() returns the
// GS_MAIN entry when MWNamespace::isMain() is true for a namespace and the
// GS_TALK entry otherwise.
2 define( 'GS_MAIN', -2 );
3 define( 'GS_TALK', -1 );
5 * Creates a sitemap for the site
9 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
10 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
11 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
13 * @see http://www.sitemaps.org/
14 * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
16 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
19 class GenerateSitemap
{
21 * The maximum number of URLs in a sitemap file
23 * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
30 * The maximum size of a sitemap file
32 * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
39 * The path to prepend to the filename
46 * The path to append to the domain name
53 * Whether or not to use compression
60 * The number of entries to save in each sitemap file
67 * Key => value entries of namespaces and their priorities
71 var $priorities = array(
72 // Custom main namespaces
74 // Custom talk namespaces
76 // MediaWiki standard namespaces
80 NS_USER_TALK
=> '0.1',
82 NS_PROJECT_TALK
=> '0.1',
84 NS_IMAGE_TALK
=> '0.1',
85 NS_MEDIAWIKI
=> '0.0',
86 NS_MEDIAWIKI_TALK
=> '0.1',
88 NS_TEMPLATE_TALK
=> '0.1',
90 NS_HELP_TALK
=> '0.1',
92 NS_CATEGORY_TALK
=> '0.1',
96 * A one-dimensional array of namespaces in the wiki
100 var $namespaces = array();
103 * When this sitemap batch was generated
110 * A database slave object
117 * A resource pointing to the sitemap index file
125 * A resource pointing to a sitemap file
132 * A resource pointing to php://stderr
/**
 * Constructor
 *
 * Sets the URL and size limits, stores the output options, opens the
 * stderr handle and the database slave connection, collects the namespaces
 * to process and opens the sitemap index file for writing.
 *
 * @param string $fspath The path to prepend to the filenames, used to
 *     save them somewhere else than in the root directory
 * @param bool $compress Whether to compress the sitemap files
 */
function __construct( $fspath, $compress ) {
    // Maximum number of URLs and maximum size (10 * 2^20 bytes) of one sitemap file
    $this->url_limit = 50000;
    $this->size_limit = pow( 2, 20 ) * 10;
    $this->fspath = isset( $fspath ) ? $fspath : '';
    $this->compress = $compress;

    $this->stderr = fopen( 'php://stderr', 'wt' );
    $this->dbr = wfGetDB( DB_SLAVE );
    // Must run after $this->dbr is set: generateNamespaces() may query the database
    $this->generateNamespaces();
    $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
    $this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' );
}

/**
 * PHP 4 style constructor name, kept so that explicit calls to
 * GenerateSitemap() keep working.
 *
 * Since PHP 8.0 a method named after its class is no longer used as the
 * constructor, so the real initialisation lives in __construct(), which is
 * declared first and therefore is the constructor on older PHP versions too.
 *
 * @param string $fspath The path to prepend to the filenames
 * @param bool $compress Whether to compress the sitemap files
 */
function GenerateSitemap( $fspath, $compress ) {
    $this->__construct( $fspath, $compress );
}
162 * Generate a one-dimensional array of existing namespaces
164 function generateNamespaces() {
165 $fname = 'GenerateSitemap::generateNamespaces';
167 // Only generate for specific namespaces if $wgSitemapNamespaces is an array.
168 global $wgSitemapNamespaces;
169 if( is_array( $wgSitemapNamespaces ) ) {
170 $this->namespaces
= $wgSitemapNamespaces;
174 $res = $this->dbr
->select( 'page',
175 array( 'page_namespace' ),
179 'GROUP BY' => 'page_namespace',
180 'ORDER BY' => 'page_namespace',
184 while ( $row = $this->dbr
->fetchObject( $res ) )
185 $this->namespaces
[] = $row->page_namespace
;
/**
 * Get the priority of a given namespace
 *
 * A namespace with its own entry in $this->priorities uses that entry;
 * any other namespace is handed to guessPriority().
 *
 * @param int $namespace The namespace to get the priority for
 * @return mixed The priority stored in $this->priorities (strings such as
 *     '0.1' in the default table)
 */
function priority( $namespace ) {
    if ( isset( $this->priorities[$namespace] ) ) {
        return $this->priorities[$namespace];
    }

    return $this->guessPriority( $namespace );
}
/**
 * Fallback priority for a namespace without its own entry on the priority
 * list
 *
 * Returns the GS_MAIN entry of $this->priorities when
 * MWNamespace::isMain() is true for the namespace, and the GS_TALK entry
 * otherwise.
 *
 * @param int $namespace The namespace to get the priority for
 * @return mixed The fallback priority taken from $this->priorities
 */
function guessPriority( $namespace ) {
    $fallbackKey = MWNamespace::isMain( $namespace ) ? GS_MAIN : GS_TALK;

    return $this->priorities[$fallbackKey];
}
214 * Return a database result containing all the pages in a given namespace
216 * @param int $namespace Limit the query to this namespace
220 function getPageRes( $namespace ) {
221 $fname = 'GenerateSitemap::getPageRes';
223 return $this->dbr
->select( 'page',
229 array( 'page_namespace' => $namespace ),
242 fwrite( $this->findex
, $this->openIndex() );
244 foreach ( $this->namespaces
as $namespace ) {
245 $res = $this->getPageRes( $namespace );
247 $this->generateLimit( $namespace );
248 $length = $this->limit
[0];
251 $fns = $wgContLang->getFormattedNsText( $namespace );
252 $this->debug( "$namespace ($fns)" );
253 while ( $row = $this->dbr
->fetchObject( $res ) ) {
254 if ( $i++
=== 0 ||
$i === $this->url_limit +
1 ||
$length +
$this->limit
[1] +
$this->limit
[2] > $this->size_limit
) {
255 if ( $this->file
!== false ) {
256 $this->write( $this->file
, $this->closeFile() );
257 $this->close( $this->file
);
259 $filename = $this->sitemapFilename( $namespace, $smcount++
);
260 $this->file
= $this->open( $this->fspath
. $filename, 'wb' );
261 $this->write( $this->file
, $this->openFile() );
262 fwrite( $this->findex
, $this->indexEntry( $filename ) );
263 $this->debug( "\t$filename" );
264 $length = $this->limit
[0];
267 $title = Title
::makeTitle( $row->page_namespace
, $row->page_title
);
268 $date = wfTimestamp( TS_ISO_8601
, $row->page_touched
);
269 $entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) );
270 $length +
= strlen( $entry );
271 $this->write( $this->file
, $entry );
272 // generate pages for language variants
273 if($wgContLang->hasVariants()){
274 $variants = $wgContLang->getVariants();
275 foreach($variants as $vCode){
276 if($vCode==$wgContLang->getCode()) continue; // we don't want default variant
277 $entry = $this->fileEntry( $title->getFullURL('',$vCode), $date, $this->priority( $namespace ) );
278 $length +
= strlen( $entry );
279 $this->write( $this->file
, $entry );
284 $this->write( $this->file
, $this->closeFile() );
285 $this->close( $this->file
);
288 fwrite( $this->findex
, $this->closeIndex() );
289 fclose( $this->findex
);
/**
 * gzopen() / fopen() wrapper
 *
 * Uses gzopen() when compression is enabled and fopen() otherwise.
 *
 * @param string $file The file to open
 * @param string $flags The mode passed through to gzopen() / fopen()
 * @return resource|false Whatever gzopen() / fopen() returned
 */
function open( $file, $flags ) {
    if ( $this->compress ) {
        return gzopen( $file, $flags );
    }

    return fopen( $file, $flags );
}
/**
 * gzwrite() / fwrite() wrapper
 *
 * Writes with gzwrite() when compression is enabled and with fwrite()
 * otherwise.
 *
 * @param resource $handle The handle to write to (by reference, as in the
 *     existing signature)
 * @param string $str The string to write
 */
function write( &$handle, $str ) {
    if ( !$this->compress ) {
        fwrite( $handle, $str );
        return;
    }

    gzwrite( $handle, $str );
}
/**
 * gzclose() / fclose() wrapper
 *
 * Closes with gzclose() when compression is enabled and with fclose()
 * otherwise.
 *
 * @param resource $handle The handle to close (by reference, as in the
 *     existing signature)
 */
function close( &$handle ) {
    if ( !$this->compress ) {
        fclose( $handle );
        return;
    }

    gzclose( $handle );
}
/**
 * Get a sitemap filename
 *
 * The name has the form sitemap-<wiki id>-NS_<namespace>-<count>.xml, with
 * ".gz" appended when compression is enabled.
 *
 * @param int $namespace The namespace
 * @param int $count The count
 * @return string The filename, without any directory part
 */
function sitemapFilename( $namespace, $count ) {
    $suffix = '';
    if ( $this->compress ) {
        $suffix = '.gz';
    }

    return "sitemap-" . wfWikiID() . "-NS_{$namespace}-{$count}.xml{$suffix}";
}
337 * Return the XML required to open an XML file
344 return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
/**
 * Return the XML schema being used
 *
 * @return string The namespace URI placed in the xmlns attribute of the
 *     generated files
 */
function xmlSchema() {
    $schemaUri = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    return $schemaUri;
}
/**
 * Return the XML required to open a sitemap index file
 *
 * @return string The XML head followed by the opening <sitemapindex> tag
 *     and a newline
 */
function openIndex() {
    $head = $this->xmlHead();
    $schema = $this->xmlSchema();

    return $head . '<sitemapindex xmlns="' . $schema . '">' . "\n";
}
/**
 * Return the XML for a single sitemap indexfile entry
 *
 * The entry carries the given filename as <loc> and the batch timestamp
 * ($this->timestamp) as <lastmod>.
 *
 * @param string $filename The filename of the sitemap file
 * @return string The <sitemap> element for the index file
 */
function indexEntry( $filename ) {
    $lines = array(
        "\t<sitemap>\n",
        "\t\t<loc>$filename</loc>\n",
        "\t\t<lastmod>{$this->timestamp}</lastmod>\n",
        "\t</sitemap>\n",
    );

    return implode( '', $lines );
}
/**
 * Return the XML required to close a sitemap index file
 *
 * @return string The closing </sitemapindex> tag followed by a newline
 */
function closeIndex() {
    $closingTag = "</sitemapindex>\n";

    return $closingTag;
}
/**
 * Return the XML required to open a sitemap file
 *
 * @return string The XML head followed by the opening <urlset> tag and a
 *     newline
 */
function openFile() {
    $head = $this->xmlHead();
    $schema = $this->xmlSchema();

    return $head . '<urlset xmlns="' . $schema . '">' . "\n";
}
/**
 * Return the XML for a single sitemap entry
 *
 * The URL is entity-escaped before it is embedded, so that characters such
 * as "&" in a query string cannot make the generated file malformed XML.
 *
 * @param string $url An RFC 2396 compliant URL (unescaped)
 * @param string $date An ISO 8601 date
 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
 * @return string The <url> element for the sitemap file
 */
function fileEntry( $url, $date, $priority ) {
    // ENT_QUOTES escapes both quote characters in addition to &, < and >
    $escapedUrl = htmlspecialchars( $url, ENT_QUOTES );

    return
        "\t<url>\n" .
        "\t\t<loc>$escapedUrl</loc>\n" .
        "\t\t<lastmod>$date</lastmod>\n" .
        "\t\t<priority>$priority</priority>\n" .
        "\t</url>\n";
}
/**
 * Return the XML required to close a sitemap file
 *
 * @return string The closing </urlset> tag followed by a newline
 */
function closeFile() {
    $closingTag = "</urlset>\n";

    return $closingTag;
}
/**
 * Write a string to stderr followed by a UNIX newline
 *
 * @param string $str The text to write to the $this->stderr handle
 */
function debug( $str ) {
    $line = "$str\n";
    fwrite( $this->stderr, $line );
}
/**
 * Populate $this->limit
 *
 * Stores three byte lengths: [0] the opening XML of a sitemap file, [1] one
 * entry built from a sample title in the given namespace, and [2] the
 * closing XML of a sitemap file.
 *
 * The sample title text is 63 repetitions of a four-byte UTF-8 sequence
 * plus one three-byte sequence, i.e. 255 bytes. NOTE(review): presumably
 * the longest title the wiki accepts -- confirm.
 *
 * @param int $namespace The namespace the sample entry is built for
 */
function generateLimit( $namespace ) {
    $sampleText = str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83";
    $title = Title::makeTitle( $namespace, $sampleText );

    $openLength = strlen( $this->openFile() );

    $url = $title->getFullUrl();
    $now = wfTimestamp( TS_ISO_8601, wfTimestamp() );
    $sampleEntry = $this->fileEntry( $url, $now, $this->priority( $namespace ) );

    $this->limit = array(
        $openLength,
        strlen( $sampleEntry ),
        strlen( $this->closeFile() )
    );
}
455 if ( in_array( '--help', $argv ) ) {
457 Usage: php generateSitemap.php [options]
458 --help show this message
460 --fspath=<path> The file system path to save to, e.g /tmp/sitemap/
462 --server=<server> The protocol and server name to use in URLs, e.g.
463 http://en.wikipedia.org. This is sometimes necessary because
464 server name detection may fail in command line scripts.
466 --compress=[yes|no] compress the sitemap files, default yes
472 $optionsWithArgs = array( 'fspath', 'server', 'compress' );
473 require_once( dirname( __FILE__
) . '/commandLine.inc' );
475 if ( isset( $options['server'] ) ) {
476 $wgServer = $options['server'];
479 $gs = new GenerateSitemap( @$options['fspath'], @$options['compress'] !== 'no' );