 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */
use MediaWiki\Deferred\LinksUpdate\LinksUpdate;
use MediaWiki\Deferred\RefreshSecondaryDataUpdate;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageAssertionException;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Parser\ParserCache;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionRenderer;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use MediaWiki\WikiMap\WikiMap;
use Wikimedia\Rdbms\IDBAccessObject;
use Wikimedia\Stats\StatsFactory;

/**
 * Job to update link tables for rerendered wiki pages.
 *
 * This job comes in a few variants:
 *
 *   - a) Recursive jobs to update links for backlink pages for a given title.
 *        Scheduled by {@see LinksUpdate::queueRecursiveJobsForTable()}; used to
 *        refresh pages which link/transclude a given title.
 *        These jobs have (recursive:true,table:<table>) set. They just look up
 *        which pages link to the job title and schedule them as a set of non-recursive
 *        RefreshLinksJob jobs (and possibly one new recursive job as a way of
 *        continuation).
 *   - b) Jobs to update links for a set of pages (the job title is ignored).
 *        These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
 *   - c) Jobs to update links for a single page (the job title).
 *        These jobs need no extra fields set.
 *
 * Job parameters for all jobs:
 *   - recursive (bool): When false, updates the current page. When true, updates
 *     the pages which link/transclude the current page.
 *   - triggeringRevisionId (int): The revision of the edit which caused the link
 *     refresh. For manually triggered updates, the last revision of the page (at the
 *     time of scheduling).
 *   - triggeringUser (array): The user who triggered the refresh, in the form of a
 *     [ 'userId' => int, 'userName' => string ] array. This is not necessarily the user
 *     who created the revision.
 *   - triggeredRecursive (bool): Set on all jobs which were partitioned from another,
 *     recursive job. For debugging.
 *   - Standard deduplication params (see {@see JobQueue::deduplicateRootJob()}).
 *
 * For recursive jobs:
 *   - table (string): Which table to use (imagelinks or templatelinks) when searching for
 *     affected pages.
 *   - range (array): Used for recursive jobs when some pages have already been partitioned
 *     into separate jobs. Contains the list of ranges that still need to be partitioned.
 *     See {@see BacklinkJobUtils::partitionBacklinkJob()}.
 *   - division: Number of times the job was partitioned already (for debugging).
 *
 * For non-recursive jobs:
 *   - pages (array): Associative array of [ <page ID> => [ <namespace>, <dbkey> ] ].
 *     Might be omitted, then the job title will be used.
 *   - isOpportunistic (bool): Set for opportunistic single-page updates. These are "free"
 *     updates that are queued when most of the work needed to be performed anyway for
 *     non-linkrefresh-related reasons, and can be more easily discarded if they don't seem
 *     useful. See {@see WikiPage::triggerOpportunisticLinksUpdate()}.
 *   - useRecursiveLinksUpdate (bool): When true, triggers recursive jobs for each page.
 *
 * Metrics:
 *   - `refreshlinks_superseded_updates_total`: The number of times the job was cancelled
 *     because the target page had already been refreshed by a different edit or job.
 *     The job is considered to have succeeded in this case.
 *   - `refreshlinks_warnings_total`: The number of times the job failed due to a recoverable issue.
 *     Possible `reason` label values include:
 *     - `lag_wait_failed`: The job timed out while waiting for replication.
 *   - `refreshlinks_failures_total`: The number of times the job failed.
 *     The `reason` label may be:
 *     - `page_not_found`: The target page did not exist.
 *     - `rev_not_current`: The target revision was no longer the latest revision for the target page.
 *     - `rev_not_found`: The target revision was not found.
 *     - `lock_failure`: The job failed to acquire an exclusive lock to refresh the target page.
 *   - `refreshlinks_parsercache_operations_total`: The number of times the job attempted
 *     to fetch parser output from the parser cache.
 *     Possible `status` label values include:
 *     - `cache_hit`: The parser output was found in the cache.
 *     - `cache_miss`: The parser output was not found in the cache.
 *
 * @ingroup JobQueue
 * @see RefreshSecondaryDataUpdate
 * @see WikiPage::doSecondaryDataUpdates()
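 *
 * Usage sketch (editorial illustration, not code from this file; assumes
 * default service wiring):
 * @code
 * $job = RefreshLinksJob::newPrioritized( $title, [ 'causeAction' => 'edit' ] );
 * MediaWikiServices::getInstance()->getJobQueueGroup()->lazyPush( $job );
 * @endcode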
 */
class RefreshLinksJob extends Job {
	/** @var int Lag safety margin when comparing root job times to last-refresh times */
	private const NORMAL_MAX_LAG = 10;

	/** @var int How many seconds to wait for replica DBs to catch up */
	private const LAG_WAIT_TIMEOUT = 15;

	public function __construct( PageIdentity $page, array $params ) {
		if ( empty( $params['pages'] ) && !$page->canExist() ) {
			// BC with the Title class
			throw new PageAssertionException(
				'The given PageIdentity {pageIdentity} does not represent a proper page',
				[ 'pageIdentity' => $page ]
			);
		}

		parent::__construct( 'refreshLinks', $page, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		$this->params += [ 'causeAction' => 'RefreshLinksJob', 'causeAgent' => 'unknown' ];
		// Tell JobRunner to not automatically wrap run() in a transaction round.
		// Each runForTitle() call will manage its own rounds in order to run DataUpdates
		// and to avoid contention as well.
		$this->executionFlags |= self::JOB_NO_EXPLICIT_TRX_ROUND;
	}
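
	// Illustration of the de-duplication logic above (editorial note with
	// made-up values): a single-page job such as
	// [ 'pages' => [ 42 => [ 0, 'Foo' ] ] ] keeps de-duplication enabled,
	// while any 'range' param or a multi-page batch disables it, since such
	// jobs rarely match each other exactly.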

	/**
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * @param PageIdentity $page
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( PageIdentity $page, array $params ) {
		$job = new self( $page, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}
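
	// Editorial note: the distinct command names ('refreshLinks',
	// 'refreshLinksPrioritized', 'refreshLinksDynamic') let these variants be
	// routed onto separate job queues with their own runners; see the JobQueue
	// documentation for how commands map to queues.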

	public function run() {
		$ok = true;

		if ( !empty( $this->params['recursive'] ) ) {
			// Job to update all (or a range of) backlink pages for a page

			// When the base job branches, wait for the replica DBs to catch up to the primary.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			$services = MediaWikiServices::getInstance();
			if ( !isset( $this->params['range'] ) ) {
				$lbFactory = $services->getDBLoadBalancerFactory();
				if ( !$lbFactory->waitForReplication( [
					'timeout' => self::LAG_WAIT_TIMEOUT
				] ) ) {
					// only try so hard, keep going with what we have
					$stats = $services->getStatsFactory();
					$stats->getCounter( 'refreshlinks_warnings_total' )
						->setLabel( 'reason', 'lag_wait_failed' )
						->copyToStatsdAt( 'refreshlinks_warning.lag_wait_failed' )
						->increment();
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$services->getMainConfig()->get( MainConfigNames::UpdateRowsPerJob ),
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			$services->getJobQueueGroup()->push( $jobs );

		} elseif ( isset( $this->params['pages'] ) ) {
			// Job to update link tables for a set of titles
			foreach ( $this->params['pages'] as [ $ns, $dbKey ] ) {
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( $title && $title->canExist() ) {
					$ok = $this->runForTitle( $title ) && $ok;
				} else {
					$ok = false;
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
				}
			}

		} else {
			// Job to update link tables for a given title
			$ok = $this->runForTitle( $this->title );
		}

		return $ok;
	}
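
	// Fan-out illustration (assumed numbers, not original code): with
	// $wgUpdateRowsPerJob = 300 and ~1000 backlinks, partitionBacklinkJob()
	// would emit ~300 single-page RefreshLinksJob instances plus one remnant
	// recursive job whose 'range' param covers the not-yet-partitioned rest.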

	/**
	 * @param PageIdentity $pageIdentity
	 * @return bool
	 */
	protected function runForTitle( PageIdentity $pageIdentity ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsFactory();
		$renderer = $services->getRevisionRenderer();
		$parserCache = $services->getParserCache();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		// Load the page from the primary DB
		$page = $services->getWikiPageFactory()->newFromTitle( $pageIdentity );
		$page->loadPageData( IDBAccessObject::READ_LATEST );

		if ( !$page->exists() ) {
			// Probably due to concurrent deletion or renaming of the page
			$logger = LoggerFactory::getInstance( 'RefreshLinksJob' );
			$logger->warning(
				'The page does not exist. Perhaps it was deleted?',
				[
					'page_title' => $this->title->getPrefixedDBkey(),
					'job_params' => $this->getParams(),
					'job_metadata' => $this->getMetadata()
				]
			);
			$this->incrementFailureCounter( $stats, 'page_not_found' );

			// retry later to handle unlucky race condition
			return false;
		}

		// Serialize link update jobs by page ID so they see each others' changes.
		// The page ID and latest revision ID will be queried again after the lock
		// is acquired, to bail if they changed from those of loadPageData() above.
		$dbw = $lbFactory->getPrimaryDatabase();
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		if ( $scopedLock === null ) {
			// Another job is already updating the page, likely for a prior revision (T170596)
			$this->setLastError( 'LinksUpdate already running for this page, try again later.' );
			$this->incrementFailureCounter( $stats, 'lock_failure' );

			// retry later when overlapping job for previous rev is done
			return false;
		}

		if ( $this->isAlreadyRefreshed( $page ) ) {
			// This job has been superseded, e.g. by an overlapping recursive job
			// for a different template edit, or by a direct edit or purge.
			$stats->getCounter( 'refreshlinks_superseded_updates_total' )
				->copyToStatsdAt( 'refreshlinks_outcome.good_update_superseded' )
				->increment();

			// treat as success
			return true;
		}

		// Parse during a fresh transaction round for better read consistency
		$lbFactory->beginPrimaryChanges( __METHOD__ );
		$output = $this->getParserOutput( $renderer, $parserCache, $page, $stats );
		$options = $this->getDataUpdateOptions();
		$lbFactory->commitPrimaryChanges( __METHOD__ );

		if ( !$output ) {
			// probably raced out.
			// Specific refreshlinks_outcome metric sent by getCurrentRevisionIfUnchanged().
			return false;
		}

		// Tell DerivedPageDataUpdater to use this parser output
		$options['known-revision-output'] = $output;
		// Execute corresponding DataUpdates immediately
		$page->doSecondaryDataUpdates( $options );
		InfoAction::invalidateCache( $page );

		// NOTE: Since 2019 (f588586e) this no longer saves the new ParserOutput to the ParserCache!
		// This means the page will have to be rendered on-the-fly when it is next viewed.
		// This is to avoid spending limited ParserCache capacity on rarely visited pages.
		// TODO: Save the ParserOutput to ParserCache by calling WikiPage::updateParserCache()
		// for pages that are likely to benefit (T327162).

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * @return string|null Minimum lag-safe TS_MW timestamp with regard to root job creation
	 */
	private function getLagAwareRootTimestamp() {
		// Get the timestamp of the change that triggered this job
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp === null ) {
			return null;
		}

		if ( !empty( $this->params['isOpportunistic'] ) ) {
			// Neither clock skew nor DB snapshot/replica DB lag matter much for
			// such updates; focus on reusing the (often recently updated) cache
			$lagAwareTimestamp = $rootTimestamp;
		} else {
			// For transclusion updates, the template changes must be reflected
			$lagAwareTimestamp = wfTimestamp(
				TS_MW,
				(int)wfTimestamp( TS_UNIX, $rootTimestamp ) + self::NORMAL_MAX_LAG
			);
		}

		return $lagAwareTimestamp;
	}
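
	// Worked example (illustrative values): for a non-opportunistic job with
	// rootJobTimestamp = 20240101000000 and NORMAL_MAX_LAG = 10, the method
	// returns 20240101000010, i.e. cached output stamped before that moment is
	// treated as possibly predating the triggering change.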

	/**
	 * @param WikiPage $page
	 * @return bool Whether something updated the backlinks with data newer than this job
	 */
	private function isAlreadyRefreshed( WikiPage $page ) {
		$lagAwareTimestamp = $this->getLagAwareRootTimestamp();

		return ( $lagAwareTimestamp !== null && $page->getLinksTimestamp() > $lagAwareTimestamp );
	}

	/**
	 * @see DerivedPageDataUpdater::shouldGenerateHTMLOnEdit
	 * @return bool True if at least one of the slots requires rendering HTML on edit,
	 *  false otherwise. This is needed, for example, when populating the ParserCache.
	 */
	private function shouldGenerateHTMLOnEdit( RevisionRecord $revision ): bool {
		$services = MediaWikiServices::getInstance();
		foreach ( $revision->getSlots()->getSlotRoles() as $role ) {
			$slot = $revision->getSlots()->getSlot( $role );
			$contentHandler = $services->getContentHandlerFactory()->getContentHandler( $slot->getModel() );
			if ( $contentHandler->generateHTMLOnEdit() ) {
				return true;
			}
		}
		return false;
	}
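
	// Editorial note: ContentHandler::generateHTMLOnEdit() returns true by
	// default; a handler may override it to false to skip eager HTML rendering
	// for content models where edit-time HTML is not useful (an illustration,
	// not a claim about specific handlers).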

	/**
	 * Get the parser output if the page is unchanged from what was loaded in $page
	 *
	 * @param RevisionRenderer $renderer
	 * @param ParserCache $parserCache
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsFactory $stats
	 * @return ParserOutput|null Combined output for all slots; might only contain metadata
	 */
	private function getParserOutput(
		RevisionRenderer $renderer,
		ParserCache $parserCache,
		WikiPage $page,
		StatsFactory $stats
	) {
		$revision = $this->getCurrentRevisionIfUnchanged( $page, $stats );
		if ( !$revision ) {
			// race condition?
			return null;
		}

		$cachedOutput = $this->getParserOutputFromCache( $parserCache, $page, $revision, $stats );
		$statsCounter = $stats->getCounter( 'refreshlinks_parsercache_operations_total' );

		if ( $cachedOutput && $this->canUseParserOutputFromCache( $cachedOutput, $revision ) ) {
			$statsCounter
				->setLabel( 'status', 'cache_hit' )
				->setLabel( 'html_changed', 'n/a' )
				->copyToStatsdAt( 'refreshlinks.parser_cached' )
				->increment();

			return $cachedOutput;
		}

		$causeAction = $this->params['causeAction'] ?? 'RefreshLinksJob';
		$parserOptions = $page->makeParserOptions( 'canonical' );

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		$doSample = $sampleRate && mt_rand( 1, $sampleRate ) === 1;
		if ( $doSample && $cachedOutput === null ) {
			// In order to collect accurate statistics, check for
			// a dirty copy in the cache even if we wouldn't have
			// otherwise.
			$cachedOutput = $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}
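
		// Sampling math (editorial note, example value): with
		// ParsoidSelectiveUpdateSampleRate set to e.g. 1000,
		// mt_rand( 1, 1000 ) === 1 holds for roughly 0.1% of jobs, so the
		// extra getDirty() lookup stays cheap in aggregate.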

		$renderedRevision = $renderer->getRenderedRevision(
			$revision,
			$parserOptions,
			null,
			[
				'audience' => $revision::RAW,
				'causeAction' => $causeAction,
				// Providing a previous parse potentially allows for
				// selective updates
				'previous-output' => $cachedOutput,
			]
		);

		$parseTimestamp = wfTimestampNow(); // timestamp that parsing started
		$output = $renderedRevision->getRevisionParserOutput( [
			// To avoid duplicate parses, this must match DerivedPageDataUpdater::shouldGenerateHTMLOnEdit() (T301309)
			'generate-html' => $this->shouldGenerateHTMLOnEdit( $revision )
		] );
		$output->setCacheTime( $parseTimestamp ); // notify LinksUpdate::doUpdate()
		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		if ( $doSample ) {
			$content = $revision->getContent( SlotRecord::MAIN );
			$labels = [
				'source' => 'RefreshLinksJob',
				'type' => $cachedOutput === null ? 'full' : 'selective',
				'reason' => $causeAction,
				'parser' => $parserOptions->getUseParsoid() ? 'parsoid' : 'legacy',
				'opportunistic' => empty( $this->params['isOpportunistic'] ) ? 'false' : 'true',
				'wiki' => WikiMap::getCurrentWikiId(),
				'model' => $content ? $content->getModel() : 'unknown',
			];
			$stats
				->getCounter( 'ParserCache_selective_total' )
				->setLabels( $labels )
				->increment();
			$stats
				->getCounter( 'ParserCache_selective_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $output->getTimeProfile( 'cpu' ) );
		}

		// Collect stats on parses that don't actually change the page content.
		// In that case, we could abort here, and perhaps we could also avoid
		// triggering CDN purges (T369898).
		if ( !$cachedOutput ) {
			// There was no cached output
			$htmlChanged = 'unknown';
		} elseif ( $cachedOutput->getRawText() === $output->getRawText() ) {
			// We have cached output, but we couldn't be sure that it was still good.
			// So we parsed again, but the result turned out to be the same HTML as
			// before.
			$htmlChanged = 'no';
		} else {
			// Re-parsing yielded HTML different from the cached output.
			$htmlChanged = 'yes';
		}

		$statsCounter
			->setLabel( 'status', 'cache_miss' )
			->setLabel( 'html_changed', $htmlChanged )
			->copyToStatsdAt( 'refreshlinks.parser_uncached' )
			->increment();

		return $output;
	}

	/**
	 * Get the current revision record if it is unchanged from what was loaded in $page
	 *
	 * @param WikiPage $page Page already loaded with READ_LATEST
	 * @param StatsFactory $stats
	 * @return RevisionRecord|null The same instance that $page->getRevisionRecord() uses
	 */
	private function getCurrentRevisionIfUnchanged(
		WikiPage $page,
		StatsFactory $stats
	) {
		$title = $page->getTitle();
		// Get the latest ID since acquirePageLock() in runForTitle() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scoped lock.
		// This works around the chicken/egg problem of determining the scoped lock key name.
		$latest = $title->getLatestRevID( IDBAccessObject::READ_LATEST );

		$triggeringRevisionId = $this->params['triggeringRevisionId'] ?? null;
		if ( $triggeringRevisionId && $triggeringRevisionId !== $latest ) {
			// This job is obsolete and one for the latest revision will handle updates
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision $triggeringRevisionId is not current" );

			return null;
		}

		// Load the current revision. Note that $page should have loaded with READ_LATEST.
		// This instance will be reused in WikiPage::doSecondaryDataUpdates() later on.
		$revision = $page->getRevisionRecord();
		if ( !$revision ) {
			// revision just got deleted?
			$this->incrementFailureCounter( $stats, 'rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );

			return null;
		} elseif ( $revision->getId() !== $latest || $revision->getPageId() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$this->incrementFailureCounter( $stats, 'rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );

			return null;
		}

		return $revision;
	}

	/**
	 * Get the parser output from cache if it reflects the change that triggered this job
	 *
	 * @param ParserCache $parserCache
	 * @param WikiPage $page
	 * @param RevisionRecord $currentRevision
	 * @param StatsFactory $stats
	 * @return ParserOutput|null
	 */
	private function getParserOutputFromCache(
		ParserCache $parserCache,
		WikiPage $page,
		RevisionRecord $currentRevision,
		StatsFactory $stats
	) {
		// Parsoid can do selective updates, so it is always worth the I/O
		// to check for a previous parse.
		$parserOptions = $page->makeParserOptions( 'canonical' );
		if ( $parserOptions->getUseParsoid() ) {
			return $parserCache->getDirty( $page, $parserOptions ) ?: null;
		}

		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		$rootTimestamp = $this->params['rootJobTimestamp'] ?? null;
		if ( $rootTimestamp !== null ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );
			if ( $page->getTouched() >= $rootTimestamp || $opportunistic ) {
				// Cache is suspected to be up-to-date so it's worth the I/O of checking.
				// We call canUseParserOutputFromCache() later to check if it's usable.
				return $parserCache->getDirty( $page, $parserOptions ) ?: null;
			}
		}

		return null;
	}

	private function canUseParserOutputFromCache(
		ParserOutput $cachedOutput,
		RevisionRecord $currentRevision
	) {
		// As long as the cache rev ID matches the current rev ID and it reflects
		// the job's triggering change, then it is usable.
		return $cachedOutput->getCacheRevisionId() == $currentRevision->getId()
			&& $cachedOutput->getCacheTime() >= $this->getLagAwareRootTimestamp();
	}
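
	// Worked example (illustrative values, continuing the earlier one): cached
	// output for revision 123 with cache time 20240101000015 is reusable for
	// current revision 123 when the lag-aware root timestamp is
	// 20240101000010, since 20240101000015 >= 20240101000010.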

	/**
	 * Increment the RefreshLinks failure counter metric with the given reason.
	 *
	 * @param StatsFactory $stats
	 * @param string $reason Well-known failure reason string
	 */
	private function incrementFailureCounter( StatsFactory $stats, $reason ): void {
		$stats->getCounter( 'refreshlinks_failures_total' )
			->setLabel( 'reason', $reason )
			->copyToStatsdAt( "refreshlinks_outcome.bad_$reason" )
			->increment();
	}

	/**
	 * @return array
	 */
	private function getDataUpdateOptions() {
		$options = [
			'recursive' => !empty( $this->params['useRecursiveLinksUpdate'] ),
			// Carry over cause so the update can do extra logging
			'causeAction' => $this->params['causeAction'],
			'causeAgent' => $this->params['causeAgent']
		];
		if ( !empty( $this->params['triggeringUser'] ) ) {
			$userInfo = $this->params['triggeringUser'];
			if ( $userInfo['userId'] ) {
				$options['triggeringUser'] = User::newFromId( $userInfo['userId'] );
			} else {
				// Anonymous, use the username
				$options['triggeringUser'] = User::newFromName( $userInfo['userName'], false );
			}
		}

		return $options;
	}
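
	// Shape of the returned array (made-up example values):
	// [ 'recursive' => false, 'causeAction' => 'edit-page',
	//   'causeAgent' => 'ExampleUser', 'triggeringUser' => <User object> ];
	// consumed by WikiPage::doSecondaryDataUpdates() in runForTitle().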

	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title