* Update docs/skin.txt.
[mediawiki.git] / maintenance / generateSitemap.php
blob657437bcbf70a644a7d3b68a8d1a6109c04aa6a9
1 <?php
// Pseudo-namespace keys for the GenerateSitemap::$priorities table. They hold
// the fallback priority used for main (GS_MAIN) and talk (GS_TALK) namespaces
// that the table does not list explicitly; see GenerateSitemap::guessPriority().
2 define( 'GS_MAIN', -2 );
3 define( 'GS_TALK', -1 );
/**
 * Creates a Google sitemap for the site
 *
 * @addtogroup Maintenance
 *
 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
 *
 * @see http://www.google.com/webmasters/sitemaps/docs/en/about.html
 * @see http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
 *
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 */
class GenerateSitemap {
	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
	 *
	 * @var int
	 */
	var $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes
	 *
	 * @link http://www.google.com/webmasters/sitemaps/docs/en/protocol.html#faq_sitemap_size
	 *
	 * @var int
	 */
	var $size_limit;

	/**
	 * The path to prepend to the filename
	 *
	 * @var string
	 */
	var $fspath;

	/**
	 * The path to append to the domain name.
	 *
	 * Nothing in this class assigns or reads this property; it is kept so
	 * that code referring to it keeps working.
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	var $compress;

	/**
	 * Byte lengths used for the file-size check, filled in by
	 * generateLimit(): [0] the sitemap file header, [1] one entry built
	 * from a 255-byte title, [2] the sitemap file footer.
	 *
	 * @var array
	 */
	var $limit = array();

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_IMAGE => '0.5',
		NS_IMAGE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	var $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to a sitemap file, or false when none is open
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Declared as __construct() because a method named after the class is
	 * no longer treated as a constructor by PHP 8; instantiating with
	 * "new GenerateSitemap( $fspath, $compress )" is unchanged.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *     save them somewhere else than in the root directory
	 * @param bool $compress Whether to compress the sitemap files
	 *
	 * @throws Exception If the sitemap index file cannot be opened for writing
	 */
	function __construct( $fspath, $compress ) {
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = isset( $fspath ) ? $fspath : '';
		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexName = "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml";
		$this->findex = fopen( $indexName, 'wb' );
		if ( $this->findex === false ) {
			// Fail early instead of issuing one warning per later fwrite().
			throw new Exception( "Unable to open sitemap index file $indexName for writing" );
		}
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Fills $this->namespaces with the distinct page_namespace values of
	 * the page table, in ascending order.
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		if ( isset( $this->priorities[$namespace] ) ) {
			return $this->priorities[$namespace];
		}
		return $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		// "Namespace" is a reserved word from PHP 5.3 on, so a literal
		// Namespace::isMain() call is a parse error there. Resolve the helper
		// class by name instead.
		// NOTE(review): newer MediaWiki releases are expected to ship this
		// helper as MWNamespace - confirm against the target release.
		$nsClass = class_exists( 'MWNamespace' ) ? 'MWNamespace' : 'Namespace';
		$isMain = call_user_func( array( $nsClass, 'isMain' ), $namespace );

		return $isMain ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database resolution of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index, and for every namespace one or more sitemap
	 * files. A new sitemap file is started whenever the current one could
	 * not take all entries of the next page without exceeding the URL or
	 * size limit.
	 *
	 * @access public
	 *
	 * @throws Exception If a sitemap file cannot be opened for writing
	 */
	function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		// Variant codes other than the content language's own code. Each of
		// them adds one more <url> entry per page, so they have to be
		// counted against the per-file limits as well.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode == $wgContLang->getCode() ) {
					continue; // we don't want default variant
				}
				$variantCodes[] = $vCode;
			}
		}
		$urlsPerPage = 1 + count( $variantCodes );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$priority = $this->priority( $namespace );
			$length = $this->limit[0];
			$urlCount = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->debug( "$namespace ($fns)" );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				// Start a new file for the first page of the namespace, or
				// when this page's entries would push the current file past
				// the URL count or the byte size limit.
				$needNewFile = $this->file === false
					|| $urlCount + $urlsPerPage > $this->url_limit
					|| $length + $urlsPerPage * $this->limit[1] + $this->limit[2] > $this->size_limit;

				if ( $needNewFile ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( $this->file === false ) {
						throw new Exception( "Unable to open sitemap file {$this->fspath}$filename for writing" );
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entry = $this->fileEntry( $title->getFullURL(), $date, $priority );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );

				// generate pages for language variants
				foreach ( $variantCodes as $vCode ) {
					$entry = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $priority );
					$length += strlen( $entry );
					$this->write( $this->file, $entry );
				}

				$urlCount += $urlsPerPage;
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}

		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file The file to open
	 * @param string $flags The mode passed on to gzopen() / fopen()
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 * @param string $str The data to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * The name ends in ".gz" when compression is enabled.
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-" . wfWikiID() . "-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * NOTE(review): the file name is written as-is, without a server or
	 * path prefix - confirm that consumers accept a relative <loc>.
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			"\t\t<loc>$filename</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @param string $url An RFC 2396 compliant URL
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// The URL may contain characters such as "&" (query strings),
			// which must be entity-escaped to keep the XML well-formed.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message to write
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Populate $this->limit
	 *
	 * Stores the byte lengths of the file header, of one entry and of the
	 * file footer. The entry is built from a 255-byte title (63 four-byte
	 * sequences plus one three-byte sequence) in the given namespace,
	 * presumably the longest title the wiki can hold.
	 *
	 * @param int $namespace The namespace the sample title is created in
	 */
	function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
// Command-line driver: print the usage text for --help, otherwise load the
// MediaWiki command-line environment and generate the sitemap files.
if ( in_array( '--help', $argv ) ) {
	$usage = <<<EOT
Usage: php generateSitemap.php [options]
--help show this message
--fspath=<path> The file system path to save to, e.g /tmp/sitemap/
--server=<server> The protocol and server name to use in URLs, e.g.
http://en.wikipedia.org. This is sometimes necessary because
server name detection may fail in command line scripts.
--compress=[yes|no] compress the sitemap files, default yes
EOT;
	echo $usage;
	exit( -1 );
}

// Options that take a value; read by commandLine.inc while it parses $argv.
$optionsWithArgs = array( 'fspath', 'server', 'compress' );
require_once( dirname( __FILE__ ) . '/commandLine.inc' );

// An explicit --server overrides the detected server name.
if ( isset( $options['server'] ) ) {
	$wgServer = $options['server'];
}

// A missing --fspath is passed on as null; compression stays enabled unless
// --compress is exactly "no".
$sitemapPath = isset( $options['fspath'] ) ? $options['fspath'] : null;
$useCompression = !( isset( $options['compress'] ) && $options['compress'] === 'no' );

$gs = new GenerateSitemap( $sitemapPath, $useCompression );
$gs->main();