Fix names of parsercache_selective_* stats
[mediawiki.git] / maintenance / importImages.php
blob503f0f05e5149ef891e7639e9134841b82c962b1
1 <?php
2 /**
3 * Import one or more images from the local file system into the wiki without
4 * using the web-based interface.
6 * "Smart import" additions:
7 * - aim: preserve the essential metadata (user, description) when importing media
8 * files from an existing wiki.
9 * - process:
10 * - interface with the source wiki, don't use bare files only (see --source-wiki-url).
11 * - fetch metadata from source wiki for each file to import.
12 * - commit the fetched metadata to the destination wiki while submitting.
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
29 * @file
30 * @ingroup Maintenance
31 * @author Rob Church <robchur@gmail.com>
32 * @author Mij <mij@bitchx.it>
35 // @codeCoverageIgnoreStart
36 require_once __DIR__ . '/Maintenance.php';
37 // @codeCoverageIgnoreEnd
39 use MediaWiki\MainConfigNames;
40 use MediaWiki\Specials\SpecialUpload;
41 use MediaWiki\StubObject\StubGlobalUser;
42 use MediaWiki\Title\Title;
43 use MediaWiki\User\User;
45 class ImportImages extends Maintenance {
/**
 * Declare the script description, its single positional argument and
 * every command line option it understands.
 *
 * Options registered with a trailing `false, true` are optional and take
 * a value; the others are plain flags.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Imports images and other media files into the wiki' );
	$this->addArg( 'dir', 'Path to the directory containing images to be imported' );

	$this->addOption( 'extensions',
		'Comma-separated list of allowable extensions, defaults to $wgFileExtensions',
		false,
		true
	);
	$this->addOption( 'overwrite',
		'Overwrite existing images with the same name (default is to skip them)' );
	$this->addOption( 'limit',
		'Limit the number of images to process. Ignored or skipped images are not counted',
		false,
		true
	);
	$this->addOption( 'from',
		"Ignore all files until the one with the given name. Useful for resuming aborted "
		. "imports. The name should be the file's canonical database form.",
		false,
		true
	);
	$this->addOption( 'skip-dupes',
		'Skip images that were already uploaded under a different name (check SHA1)' );
	$this->addOption( 'search-recursively', 'Search recursively for files in subdirectories' );
	$this->addOption( 'sleep',
		'Sleep between files. Useful mostly for debugging',
		false,
		true
	);
	$this->addOption( 'user',
		"Set username of uploader, default 'Maintenance script'",
		false,
		true
	);
	// This parameter can optionally have an argument. If none specified, getOption()
	// returns 1 which is precisely what we need.
	$this->addOption( 'check-userblock', 'Check if the user got blocked during import' );
	$this->addOption( 'comment',
		"Set file description, default 'Importing file'",
		false,
		true
	);
	$this->addOption( 'comment-file',
		'Set description to the content of this file',
		false,
		true
	);
	$this->addOption( 'comment-ext',
		'Causes the description for each file to be loaded from a file with the same name, but '
		. 'the extension provided. If a global description is also given, it is appended.',
		false,
		true
	);
	$this->addOption( 'summary',
		'Upload summary, description will be used if not provided',
		false,
		true
	);
	$this->addOption( 'license',
		'Use an optional license template',
		false,
		true
	);
	$this->addOption( 'timestamp',
		'Override upload time/date, all MediaWiki timestamp formats are accepted',
		false,
		true
	);
	$this->addOption( 'protect',
		'Specify the protect value (autoconfirmed,sysop)',
		false,
		true
	);
	$this->addOption( 'unprotect', 'Unprotects all uploaded images' );
	// The example URL deliberately has no trailing "api.php": the script
	// appends "/api.php?..." itself when querying the source wiki.
	$this->addOption( 'source-wiki-url',
		'If specified, take User and Comment data for each imported file from this URL. '
		. 'For example, --source-wiki-url="https://en.wikipedia.org/w/"',
		false,
		true
	);
	$this->addOption( 'dry', "Dry run, don't import anything" );
}
/**
 * Import every matching file found under the given directory.
 *
 * For each candidate file this validates the title, honours --from,
 * --check-userblock, --overwrite and --skip-dupes, resolves the description
 * text (from the source wiki, a per-file comment file, or the global
 * comment), publishes and records the upload unless --dry is set, and
 * optionally changes the page protection. A summary of the counters is
 * printed at the end.
 */
public function execute() {
	$services = $this->getServiceContainer();
	$permissionManager = $services->getPermissionManager();

	$found = 0;
	$processed = 0;
	$statistics = [
		'ignored' => 0,
		'added' => 0,
		'skipped' => 0,
		'overwritten' => 0,
		'failed' => 0,
	];

	$this->output( "Importing Files\n\n" );

	$dir = $this->getArg( 0 );

	# Check Protection
	if ( $this->hasOption( 'protect' ) && $this->hasOption( 'unprotect' ) ) {
		$this->fatalError( "Cannot specify both protect and unprotect. Only 1 is allowed.\n" );
	}

	// Only an empty/blank value is an error. The previous check was inverted
	// and rejected every non-empty protection level.
	if ( $this->hasOption( 'protect' ) && trim( (string)$this->getOption( 'protect' ) ) === '' ) {
		$this->fatalError( "You must specify a protection option.\n" );
	}

	# Prepare the list of allowed extensions
	$extensions = $this->hasOption( 'extensions' )
		? explode( ',', strtolower( $this->getOption( 'extensions' ) ) )
		: $this->getConfig()->get( MainConfigNames::FileExtensions );

	# Search the path provided for candidates for import
	$files = $this->findFiles( $dir, $extensions, $this->hasOption( 'search-recursively' ) );
	if ( !$files->valid() ) {
		$this->output( "No suitable files could be found for import.\n" );
		return;
	}

	# Initialise the user for this operation
	$user = $this->hasOption( 'user' )
		? User::newFromName( $this->getOption( 'user' ) )
		: User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	if ( !$user instanceof User ) {
		// Fall back to the system user when the given name did not yield a User
		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	}
	'@phan-var User $user';
	StubGlobalUser::setUser( $user );

	# Get block check. If a value is given, this specified how often the check is performed
	$checkUserBlock = (int)$this->getOption( 'check-userblock' );

	$from = $this->getOption( 'from' );
	$sleep = (int)$this->getOption( 'sleep' );
	$limit = (int)$this->getOption( 'limit' );
	$timestamp = $this->getOption( 'timestamp', false );

	# Get the upload comment. Provide a default one in case there's no comment given.
	$commentFile = $this->getOption( 'comment-file' );
	if ( $commentFile !== null ) {
		$comment = file_get_contents( $commentFile );
		if ( $comment === false ) {
			$this->fatalError( "failed to read comment file: {$commentFile}\n" );
		}
	} else {
		$comment = $this->getOption( 'comment', 'Importing file' );
	}
	$commentExt = $this->getOption( 'comment-ext' );
	$summary = $this->getOption( 'summary', '' );
	$license = $this->getOption( 'license', '' );
	$sourceWikiUrl = $this->getOption( 'source-wiki-url' );

	$tags = in_array( ChangeTags::TAG_SERVER_SIDE_UPLOAD, ChangeTags::getSoftwareTags() )
		? [ ChangeTags::TAG_SERVER_SIDE_UPLOAD ]
		: [];

	# Batch "upload" operation
	$restrictionStore = $services->getRestrictionStore();
	foreach ( $files as $file ) {
		$found++;
		if ( $sleep && ( $processed > 0 ) ) {
			sleep( $sleep );
		}

		$base = UtfNormal\Validator::cleanUp( wfBaseName( $file ) );

		# Validate a title
		$title = Title::makeTitleSafe( NS_FILE, $base );
		if ( !$title ) {
			$this->output(
				"{$base} could not be imported; a valid title cannot be produced\n"
			);
			continue;
		}

		if ( $from ) {
			if ( $from !== $title->getDBkey() ) {
				$statistics['ignored']++;
				continue;
			}
			// Found the requested file, continue from here
			$from = null;
		}

		if ( $checkUserBlock && ( ( $processed % $checkUserBlock ) == 0 ) ) {
			$user->clearInstanceCache( 'name' ); // reload from DB!
			if ( $permissionManager->isBlockedFrom( $user, $title ) ) {
				$this->output(
					"{$user->getName()} is blocked from {$title->getPrefixedText()}! skipping.\n"
				);
				$statistics['skipped']++;
				continue;
			}
		}

		# Check existence
		$image = $services->getRepoGroup()->getLocalRepo()
			->newFile( $title );
		if ( $image->exists() ) {
			if ( $this->hasOption( 'overwrite' ) ) {
				$this->output( "{$base} exists, overwriting..." );
				$svar = 'overwritten';
			} else {
				$this->output( "{$base} exists, skipping\n" );
				$statistics['skipped']++;
				continue;
			}
		} else {
			if ( $this->hasOption( 'skip-dupes' ) ) {
				$repo = $image->getRepo();
				# XXX: we end up calculating this again when actually uploading. that sucks.
				$sha1 = FSFile::getSha1Base36FromPath( $file );
				$dupes = $repo->findBySha1( $sha1 );
				if ( $dupes ) {
					$this->output(
						"{$base} already exists as {$dupes[0]->getName()}, skipping\n"
					);
					$statistics['skipped']++;
					continue;
				}
			}

			$this->output( "Importing {$base}..." );
			$svar = 'added';
		}

		if ( $sourceWikiUrl ) {
			/* find comment text directly from source wiki, through MW's API */
			$real_comment = $this->getFileCommentFromSourceWiki( $sourceWikiUrl, $base );
			$commentText = $real_comment !== false ? $real_comment : $comment;

			/* find user directly from source wiki, through MW's API */
			$real_user = $this->getFileUserFromSourceWiki( $sourceWikiUrl, $base );
			if ( $real_user !== false ) {
				$realUser = User::newFromName( $real_user );
				if ( $realUser === false ) {
					# user does not exist in target wiki
					// Terminate the progress line and count the file as failed,
					// matching the other failure paths in this loop.
					$this->output(
						"failed: user '$real_user' does not exist in target wiki.\n"
					);
					$statistics['failed']++;
					continue;
				}
				StubGlobalUser::setUser( $realUser );
				$user = $realUser;
			}
		} else {
			# Find comment text
			$commentText = false;

			if ( $commentExt ) {
				$f = $this->findAuxFile( $file, $commentExt );
				if ( !$f ) {
					$this->output( " No comment file with extension {$commentExt} found "
						. "for {$file}, using default comment." );
				} else {
					$commentText = file_get_contents( $f );
					if ( !$commentText ) {
						$this->output(
							" Failed to load comment file {$f}, using default comment."
						);
					}
				}
			}

			if ( !$commentText ) {
				$commentText = $comment;
			}
		}

		# Import the file
		if ( $this->hasOption( 'dry' ) ) {
			$this->output(
				" publishing {$file} by '{$user->getName()}', comment '$commentText'..."
			);
		} else {
			$mwProps = new MWFileProps( $services->getMimeAnalyzer() );
			$props = $mwProps->getPropsFromPath( $file, true );
			$flags = 0;
			$publishOptions = [];
			$handler = MediaHandler::getHandler( $props['mime'] );
			if ( $handler ) {
				$publishOptions['headers'] = $handler->getContentHeaders( $props['metadata'] );
			} else {
				$publishOptions['headers'] = [];
			}
			$archive = $image->publish( $file, $flags, $publishOptions );
			if ( !$archive->isGood() ) {
				$this->output( "failed. (" .
					$archive->getMessage( false, false, 'en' )->text() .
					")\n" );
				$statistics['failed']++;
				continue;
			}
		}

		$commentText = SpecialUpload::getInitialPageText( $commentText, $license );
		if ( !$this->hasOption( 'summary' ) ) {
			$summary = $commentText;
		}

		// $archive and $props are only set on the non-dry path above; the
		// dry-run branch below never reaches recordUpload3().
		if ( $this->hasOption( 'dry' ) ) {
			$this->output( "done.\n" );
		} elseif ( $image->recordUpload3(
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
			$archive->value,
			$summary,
			$commentText,
			$user,
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullable,PhanPossiblyUndeclaredVariable
			$props,
			$timestamp,
			$tags
		)->isOK() ) {
			$this->output( "done.\n" );

			$doProtect = false;

			$protectLevel = $this->getOption( 'protect' );
			$restrictionLevels = $this->getConfig()->get( MainConfigNames::RestrictionLevels );

			if ( $protectLevel && in_array( $protectLevel, $restrictionLevels ) ) {
				$doProtect = true;
			}
			if ( $this->hasOption( 'unprotect' ) ) {
				// An empty level clears the restriction for each type
				$protectLevel = '';
				$doProtect = true;
			}

			if ( $doProtect ) {
				# Protect the file
				$this->output( "\nWaiting for replica DBs...\n" );
				// Wait for replica DBs.
				sleep( 2 ); # Why this sleep?
				$this->waitForReplication();

				$this->output( "\nSetting image restrictions ..." );

				$cascade = false;
				$restrictions = [];
				foreach ( $restrictionStore->listApplicableRestrictionTypes( $title ) as $type ) {
					$restrictions[$type] = $protectLevel;
				}

				$page = $services->getWikiPageFactory()->newFromTitle( $title );
				$status = $page->doUpdateRestrictions( $restrictions, [], $cascade, '', $user );
				$this->output( ( $status->isOK() ? 'done' : 'failed' ) . "\n" );
			}
		} else {
			$this->output( "failed. (at recordUpload stage)\n" );
			$svar = 'failed';
		}

		$statistics[$svar]++;
		$processed++;

		if ( $limit && $processed >= $limit ) {
			break;
		}
	}

	# Print out some statistics
	$this->output( "\n" );
	foreach ( array_merge(
		[
			'Found' => $found,
			'Limit' => $limit,
		],
		$statistics
	) as $desc => $number ) {
		if ( $number > 0 ) {
			$this->output( ucfirst( $desc ) . ": $number\n" );
		}
	}
}
/**
 * Search a directory for files with one of a set of extensions
 *
 * The directory handle is released when the generator finishes or is
 * destroyed, including when the caller stops iterating early.
 *
 * @param string $dir Path to directory to search
 * @param array $exts Array of lowercase extensions to search for
 * @param bool $recurse Search subdirectories recursively
 * @return Generator<string> Generator that yields the matching file paths
 */
private function findFiles( $dir, $exts, $recurse = false ) {
	$dhl = is_dir( $dir ) ? opendir( $dir ) : false;
	if ( !$dhl ) {
		// Not a directory or not readable: yields nothing
		return;
	}

	try {
		// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
		while ( ( $file = readdir( $dhl ) ) !== false ) {
			$path = $dir . '/' . $file;
			if ( is_file( $path ) ) {
				$ext = pathinfo( $file, PATHINFO_EXTENSION );
				if ( in_array( strtolower( $ext ), $exts ) ) {
					yield $path;
				}
			} elseif ( $recurse && is_dir( $path ) && $file !== '..' && $file !== '.' ) {
				yield from $this->findFiles( $path, $exts, true );
			}
		}
	} finally {
		// Previously the handle was never closed, leaking one per directory
		closedir( $dhl );
	}
}
/**
 * Locate a companion file for $file that carries the given extension.
 *
 * The extension is first appended to the full base name; if no such file
 * exists, the last extension of the base name is removed and the lookup is
 * repeated, at most $maxStrip times. For example, with $maxStrip = 1 (the
 * default), both acme.foo.bar.txt and acme.foo.txt count as auxiliary files
 * for acme.foo.bar and the extension ".txt". With $maxStrip = 2, acme.txt
 * would be accepted as well.
 *
 * @param string $file Base path
 * @param string $auxExtension The extension to be appended to the base path
 * @param int $maxStrip The maximum number of extensions to strip from the base path (default: 1)
 * @return string|false Path of the first existing candidate, or false if none exists
 */
private function findAuxFile( $file, $auxExtension, $maxStrip = 1 ) {
	// Accept the extension with or without its leading dot
	$suffix = str_starts_with( $auxExtension, '.' ) ? $auxExtension : '.' . $auxExtension;

	$directory = dirname( $file );
	$stem = basename( $file );

	for ( $remaining = $maxStrip; $remaining >= 0; $remaining-- ) {
		$candidate = $directory . '/' . $stem . $suffix;
		if ( file_exists( $candidate ) ) {
			return $candidate;
		}

		// Nothing left to strip: no dot at all, or only a leading dot
		$dotPos = strrpos( $stem, '.' );
		if ( $dotPos === false || $dotPos === 0 ) {
			return false;
		}

		$stem = substr( $stem, 0, $dotPos );
	}

	return false;
}
/**
 * Fetch the upload comment of a file from the source wiki's API.
 *
 * @todo FIXME: Access the api in a better way and performing just one query
 * (preferably batching files too).
 *
 * @param string $wiki_host Base URL of the source wiki's script path;
 *  "/api.php" is appended to it. A trailing slash is tolerated.
 * @param string $file File name, without the "File:" prefix
 *
 * @return string|false The decoded comment, or false if the request failed
 *  or the response contains no matching comment attribute
 */
private function getFileCommentFromSourceWiki( $wiki_host, $file ) {
	$url = rtrim( $wiki_host, '/' ) . '/api.php?action=query&format=xml&titles=File:'
		. rawurlencode( $file ) . '&prop=imageinfo&iiprop=comment';
	$body = $this->getServiceContainer()->getHttpRequestFactory()->get( $url, [], __METHOD__ );
	// A failed request yields no body; do not hand a non-string to preg_match()
	if ( !is_string( $body ) ) {
		return false;
	}

	if ( preg_match( '#<ii comment="([^"]*)" />#', $body, $matches ) !== 1 ) {
		return false;
	}

	return html_entity_decode( $matches[1] );
}
/**
 * Fetch the name of the uploader of a file from the source wiki's API.
 *
 * @param string $wiki_host Base URL of the source wiki's script path;
 *  "/api.php" is appended to it. A trailing slash is tolerated.
 * @param string $file File name, without the "File:" prefix
 *
 * @return string|false The decoded user name, or false if the request failed
 *  or the response contains no matching user attribute
 */
private function getFileUserFromSourceWiki( $wiki_host, $file ) {
	$url = rtrim( $wiki_host, '/' ) . '/api.php?action=query&format=xml&titles=File:'
		. rawurlencode( $file ) . '&prop=imageinfo&iiprop=user';
	$body = $this->getServiceContainer()->getHttpRequestFactory()->get( $url, [], __METHOD__ );
	// A failed request yields no body; do not hand a non-string to preg_match()
	if ( !is_string( $body ) ) {
		return false;
	}

	if ( preg_match( '#<ii user="([^"]*)" />#', $body, $matches ) !== 1 ) {
		return false;
	}

	return html_entity_decode( $matches[1] );
}
529 // @codeCoverageIgnoreStart
530 $maintClass = ImportImages::class;
531 require_once RUN_MAINTENANCE_IF_MAIN;
532 // @codeCoverageIgnoreEnd