/**
 * Pseudo-namespace keys for the $priorities table: GS_MAIN indexes the
 * fallback priority for main namespaces and GS_TALK the fallback for talk
 * namespaces; both are looked up by guessPriority().
 */
define( 'GS_TALK', -1 );
define( 'GS_MAIN', -2 );
5 * Creates a Google sitemap for the site
8 * @subpackage Maintenance
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
20 class GenerateSitemap
{
22 * The maximum amount of urls in a sitemap file
24 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
31 * The maximum size of a sitemap file
33 * @link http://www.google.com/webmasters/sitemaps/docs/en/protocol.html#faq_sitemap_size
40 * The path to prepend to the filename
47 * The path to append to the domain name
54 * Whether or not to use compression
61 * The number of entries to save in each sitemap file
68 * Key => value entries of namespaces and their priorities
72 var $priorities = array(
73 // Custom main namespaces
75 // Custom talk namespaces
77 // MediaWiki standard namespaces
81 NS_USER_TALK
=> '0.1',
83 NS_PROJECT_TALK
=> '0.1',
85 NS_IMAGE_TALK
=> '0.1',
86 NS_MEDIAWIKI
=> '0.0',
87 NS_MEDIAWIKI_TALK
=> '0.1',
89 NS_TEMPLATE_TALK
=> '0.1',
91 NS_HELP_TALK
=> '0.1',
93 NS_CATEGORY_TALK
=> '0.1',
97 * A one-dimensional array of namespaces in the wiki
101 var $namespaces = array();
104 * When this sitemap batch was generated
111 * A database slave object
118 * A resource pointing to the sitemap index file
126 * A resource pointing to a sitemap file
133 * A resource pointing to php://stderr
142 * @param string $fspath The path to prepend to the filenames, used to
143 * save them somewhere else than in the root directory
144 * @param string $path The path to append to the domain name
145 * @param bool $compress Whether to compress the sitemap files
/**
 * Constructor: stores the settings, opens stderr and the sitemap index
 * file, fetches a slave database connection and collects the namespaces
 * present in the wiki.
 *
 * @param string $fspath The path to prepend to the filenames, used to
 *     save them somewhere else than in the root directory
 * @param string $path The path to append to the domain name
 * @param bool $compress Whether to compress the sitemap files
 */
function GenerateSitemap( $fspath, $path, $compress ) {
	global $wgDBname, $wgScriptPath;

	// Hard limits: number of URLs per file and file size (10 * 2^20)
	$this->url_limit = 50000;
	$this->size_limit = pow( 2, 20 ) * 10;

	// Unset (null) arguments fall back to '' and $wgScriptPath
	if ( isset( $fspath ) ) {
		$this->fspath = $fspath;
	} else {
		$this->fspath = '';
	}
	if ( isset( $path ) ) {
		$this->path = $path;
	} else {
		$this->path = $wgScriptPath;
	}
	$this->compress = $compress;

	$this->stderr = fopen( 'php://stderr', 'wt' );
	$this->dbr =& wfGetDB( DB_SLAVE );
	// Uses $this->dbr, so it has to run after the connection is fetched
	$this->generateNamespaces();
	$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

	$indexName = "{$this->fspath}sitemap-index-$wgDBname.xml";
	$this->findex = fopen( $indexName, 'wb' );
}
164 * Generate a one-dimensional array of existing namespaces
/**
 * Generate a one-dimensional array of existing namespaces
 *
 * Appends every distinct page_namespace value found in the page table to
 * $this->namespaces, ordered by namespace number.
 */
function generateNamespaces() {
	$fname = 'GenerateSitemap::generateNamespaces';

	// NOTE(review): the condition and caller-name arguments were not
	// visible in the reviewed text; an empty condition list and $fname are
	// assumed here -- confirm against the Database::select() signature.
	$res = $this->dbr->select( 'page',
		array( 'page_namespace' ),
		array(),
		$fname,
		array(
			'GROUP BY' => 'page_namespace',
			'ORDER BY' => 'page_namespace',
		)
	);

	while ( $row = $this->dbr->fetchObject( $res ) ) {
		$this->namespaces[] = $row->page_namespace;
	}
}
184 * Get the priority of a given namespace
186 * @param int $namespace The namespace to get the priority for
/**
 * Get the priority of a given namespace
 *
 * @param int $namespace The namespace to get the priority for
 * @return string The entry from $this->priorities when one exists,
 *     otherwise whatever guessPriority() returns
 */
function priority( $namespace ) {
	if ( isset( $this->priorities[$namespace] ) ) {
		return $this->priorities[$namespace];
	}

	return $this->guessPriority( $namespace );
}
196 * If the namespace isn't listed on the priority list return the
197 * default priority for the namespace, varies depending on whether it's
200 * @param int $namespace The namespace to get the priority for
/**
 * Fallback priority for a namespace that has no entry of its own in the
 * priority list: the GS_MAIN entry when Namespace::isMain() says the
 * namespace is a main one, the GS_TALK entry otherwise.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string The fallback priority
 */
function guessPriority( $namespace ) {
	$key = Namespace::isMain( $namespace ) ? GS_MAIN : GS_TALK;

	return $this->priorities[$key];
}
209 * Return a database resolution of all the pages in a given namespace
211 * @param int $namespace Limit the query to this namespace
/**
 * Return a database result for all the pages in a given namespace
 *
 * @param int $namespace Limit the query to this namespace
 * @return mixed Whatever $this->dbr->select() returns; rows are read from
 *     it with $this->dbr->fetchObject()
 */
function getPageRes( $namespace ) {
	$fname = 'GenerateSitemap::getPageRes';

	// NOTE(review): the field list and the caller-name argument were not
	// visible in the reviewed text. The fields below are the ones read from
	// each row when the sitemap entries are built (page_namespace,
	// page_title, page_touched) -- confirm.
	return $this->dbr->select( 'page',
		array(
			'page_namespace',
			'page_title',
			'page_touched',
		),
		array( 'page_namespace' => $namespace ),
		$fname
	);
}
235 global $wgDBname, $wgContLang;
237 fwrite( $this->findex
, $this->openIndex() );
239 foreach ( $this->namespaces
as $namespace ) {
240 $res = $this->getPageRes( $namespace );
242 $this->generateLimit( $namespace );
243 $length = $this->limit
[0];
246 $fns = $wgContLang->getFormattedNsText( $namespace );
247 $this->debug( "$namespace ($fns)" );
248 while ( $row = $this->dbr
->fetchObject( $res ) ) {
249 if ( $i++
=== 0 ||
$i === $this->url_limit +
1 ||
$length +
$this->limit
[1] +
$this->limit
[2] > $this->size_limit
) {
250 if ( $this->file
!== false ) {
251 $this->write( $this->file
, $this->closeFile() );
252 $this->close( $this->file
);
254 $filename = $this->sitemapFilename( $namespace, $smcount++
);
255 $this->file
= $this->open( $this->fspath
. $filename, 'wb' );
256 $this->write( $this->file
, $this->openFile() );
257 fwrite( $this->findex
, $this->indexEntry( $filename ) );
258 $this->debug( "\t$filename" );
259 $length = $this->limit
[0];
262 $title = Title
::makeTitle( $row->page_namespace
, $row->page_title
);
263 $date = wfTimestamp( TS_ISO_8601
, $row->page_touched
);
264 $entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) );
265 $length +
= strlen( $entry );
266 $this->write( $this->file
, $entry );
269 $this->write( $this->file
, $this->closeFile() );
270 $this->close( $this->file
);
273 fwrite( $this->findex
, $this->closeIndex() );
274 fclose( $this->findex
);
278 * gzopen() / fopen() wrapper
/**
 * gzopen() / fopen() wrapper
 *
 * @param string $file Path of the file to open
 * @param string $flags Mode string handed to the underlying open call
 * @return mixed The result of gzopen() when compression is enabled,
 *     of fopen() otherwise
 */
function open( $file, $flags ) {
	if ( $this->compress ) {
		return gzopen( $file, $flags );
	}

	return fopen( $file, $flags );
}
287 * gzwrite() / fwrite() wrapper
/**
 * gzwrite() / fwrite() wrapper
 *
 * Exactly one of the two functions is called: a handle obtained from
 * open() is either a gz handle or a plain one, never both.
 *
 * @param resource $handle Handle returned by open()
 * @param string $str The data to write
 */
function write( &$handle, $str ) {
	if ( $this->compress ) {
		gzwrite( $handle, $str );
	} else {
		fwrite( $handle, $str );
	}
}
297 * gzclose() / fclose() wrapper
/**
 * gzclose() / fclose() wrapper
 *
 * Mirrors open() and write(): the compression setting decides which of the
 * two close functions is applied to the handle.
 *
 * @param resource $handle Handle returned by open()
 */
function close( &$handle ) {
	if ( $this->compress ) {
		gzclose( $handle );
	} else {
		fclose( $handle );
	}
}
307 * Get a sitemap filename
311 * @param int $namespace The namespace
312 * @param int $count The count
/**
 * Get a sitemap filename
 *
 * @param int $namespace The namespace
 * @param int $count The count
 * @return string "sitemap-<dbname>-NS_<namespace>-<count>.xml", with a
 *     ".gz" suffix when compression is enabled
 */
function sitemapFilename( $namespace, $count ) {
	// Without this declaration $wgDBname would be an undefined local and
	// the database name would silently vanish from the filename
	global $wgDBname;

	$ext = $this->compress ? '.gz' : '';

	return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext";
}
325 * Return the XML required to open an XML file
332 return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
336 * Return the XML schema being used
/**
 * Return the XML schema being used
 *
 * @return string The namespace URI placed in the xmlns attribute of the
 *     root elements
 */
function xmlSchema() {
	$schema = 'http://www.google.com/schemas/sitemap/0.84';

	return $schema;
}
347 * Return the XML required to open a sitemap index file
/**
 * Return the XML required to open a sitemap index file
 *
 * @return string The XML head followed by the opening <sitemapindex> tag
 *     and a newline
 */
function openIndex() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return $head . '<sitemapindex xmlns="' . $schema . '">' . "\n";
}
356 * Return the XML for a single sitemap indexfile entry
360 * @param string $filename The filename of the sitemap file
/**
 * Return the XML for a single sitemap indexfile entry
 *
 * @param string $filename The filename of the sitemap file
 * @return string One <sitemap> element carrying the filename and the
 *     timestamp of this batch
 */
function indexEntry( $filename ) {
	// NOTE(review): the enclosing <sitemap> lines were not visible in the
	// reviewed text; they are restored here to match the <sitemapindex>
	// root written by openIndex() -- confirm the exact whitespace.
	// NOTE(review): <loc> receives the bare filename; verify whether it
	// should be a full URL built from the configured path.
	return
		"\t<sitemap>\n" .
		"\t\t<loc>$filename</loc>\n" .
		"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
		"\t</sitemap>\n";
}
373 * Return the XML required to close a sitemap index file
/**
 * Return the XML required to close a sitemap index file
 *
 * @return string The closing </sitemapindex> tag and a newline
 */
function closeIndex() {
	$closing = '</sitemapindex>';

	return $closing . "\n";
}
384 * Return the XML required to open a sitemap file
/**
 * Return the XML required to open a sitemap file
 *
 * @return string The XML head followed by the opening <urlset> tag and a
 *     newline
 */
function openFile() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return $head . '<urlset xmlns="' . $schema . '">' . "\n";
}
393 * Return the XML for a single sitemap entry
397 * @param string $url An RFC 2396 compliant URL
398 * @param string $date An ISO 8601 date
399 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
/**
 * Return the XML for a single sitemap entry
 *
 * @param string $url An RFC 2396 compliant URL
 * @param string $date An ISO 8601 date
 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with
 *     a 0.1 stepsize
 * @return string One <url> element; the values are interpolated as given,
 *     without any escaping
 */
function fileEntry( $url, $date, $priority ) {
	// NOTE(review): the enclosing <url> lines were not visible in the
	// reviewed text; they are restored here to match the <urlset> root
	// written by openFile() -- confirm the exact whitespace, since the
	// entry length feeds the file size accounting.
	return
		"\t<url>\n" .
		"\t\t<loc>$url</loc>\n" .
		"\t\t<lastmod>$date</lastmod>\n" .
		"\t\t<priority>$priority</priority>\n" .
		"\t</url>\n";
}
413 * Return the XML required to close sitemap file
/**
 * Return the XML required to close a sitemap file
 *
 * @return string The closing </urlset> tag and a newline
 */
function closeFile() {
	$closing = '</urlset>';

	return $closing . "\n";
}
423 * Write a string to stderr followed by a UNIX newline
/**
 * Write a string to stderr followed by a UNIX newline
 *
 * @param string $str The message to write
 */
function debug( $str ) {
	$line = $str . "\n";

	fwrite( $this->stderr, $line );
}
430 * Populate $this->limit
/**
 * Populate $this->limit
 *
 * The result is a three-element array of byte lengths:
 *   [0] the XML that opens a sitemap file,
 *   [1] one entry built for a 255-byte title in the given namespace,
 *   [2] the XML that closes a sitemap file.
 *
 * @param int $namespace The namespace the sample entry is built for
 */
function generateLimit( $namespace ) {
	// 63 four-byte sequences plus one three-byte sequence = 255 bytes;
	// presumably the largest title an entry can carry -- TODO confirm
	$sampleName = str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83";
	$title = Title::makeTitle( $namespace, $sampleName );

	$openLength = strlen( $this->openFile() );
	$entryLength = strlen( $this->fileEntry(
		$title->getFullUrl(),
		wfTimestamp( TS_ISO_8601, wfTimestamp() ),
		$this->priority( $namespace )
	) );
	$closeLength = strlen( $this->closeFile() );

	$this->limit = array( $openLength, $entryLength, $closeLength );
}
443 if ( in_array( '--help', $argv ) ) {
445 "Usage: php generateSitemap.php [host] [options]\n" .
446 "\thost = hostname\n" .
448 "\t\t--help\tshow this message\n" .
449 "\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n" .
450 "\t\t--path\tThe http path to use, e.g. /wiki\n" .
451 "\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n";
// A first argument that does not start with "--" is the host name
if ( isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0 ) {
	$_SERVER['SERVER_NAME'] = $argv[1];
}

// Options that take a value; has to be set before commandLine.inc is loaded
$optionsWithArgs = array( 'fspath', 'path', 'compress' );
require_once 'commandLine.inc';

// Compression stays enabled unless --compress=no was given
$gs = new GenerateSitemap(
	@$options['fspath'],
	@$options['path'],
	@$options['compress'] !== 'no'
);