Merge "Mocha tests: Support language links to en-x-piglatin"
[mediawiki.git] / includes / parser / Parsoid / ParsoidParser.php
blobcec3538f4f463414f4f56c9a7baf0aa1efb3bb46
1 <?php
3 namespace MediaWiki\Parser\Parsoid;
5 use MediaWiki\Content\TextContent;
6 use MediaWiki\Content\WikitextContent;
7 use MediaWiki\Context\RequestContext;
8 use MediaWiki\Languages\LanguageConverterFactory;
9 use MediaWiki\MainConfigNames;
10 use MediaWiki\MediaWikiServices;
11 use MediaWiki\Page\PageReference;
12 use MediaWiki\Parser\ParserOptions;
13 use MediaWiki\Parser\ParserOutput;
14 use MediaWiki\Parser\Parsoid\Config\DataAccess;
15 use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
16 use MediaWiki\Revision\MutableRevisionRecord;
17 use MediaWiki\Revision\RevisionRecord;
18 use MediaWiki\Revision\SlotRecord;
19 use MediaWiki\Title\Title;
20 use MediaWiki\WikiMap\WikiMap;
21 use Wikimedia\Assert\Assert;
22 use Wikimedia\Parsoid\Config\PageConfig;
23 use Wikimedia\Parsoid\Parsoid;
25 /**
26 * Parser implementation which uses Parsoid.
28 * Currently incomplete; see T236809 for the long-term plan.
30 * @since 1.41
31 * @unstable since 1.41; see T236809 for plan.
33 class ParsoidParser /* eventually this will extend \Parser */ {
34 /**
35 * @unstable
36 * This should not be used widely right now since this may go away.
37 * This is being added to support DiscussionTools with Parsoid HTML
38 * and after initial exploration, this may be implemented differently.
40 public const PARSOID_TITLE_KEY = "parsoid:title-dbkey";
41 private Parsoid $parsoid;
42 private PageConfigFactory $pageConfigFactory;
43 private LanguageConverterFactory $languageConverterFactory;
44 private DataAccess $dataAccess;
/**
 * Store the collaborating services used by this parser.
 *
 * @param Parsoid $parsoid Performs the wikitext-to-HTML conversion
 * @param PageConfigFactory $pageConfigFactory Builds the PageConfig handed to Parsoid
 * @param LanguageConverterFactory $languageConverterFactory Used to look up the
 *   language converter when picking an HTML variant
 * @param DataAccess $dataAccess Used to generate the parser limit report
 */
public function __construct(
	Parsoid $parsoid,
	PageConfigFactory $pageConfigFactory,
	LanguageConverterFactory $languageConverterFactory,
	DataAccess $dataAccess
) {
	$this->dataAccess = $dataAccess;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->pageConfigFactory = $pageConfigFactory;
	$this->parsoid = $parsoid;
}
/**
 * Internal helper shared by parse() and parseFakeRevision() to avoid code
 * duplication: runs Parsoid's wikitext-to-HTML conversion for the given
 * page config and returns the result as a ParserOutput.
 *
 * @param PageConfig $pageConfig
 * @param ParserOptions $options
 * @param ?ParserOutput $previousOutput Result of an earlier parse of this
 *   page, or null. It is only handed to Parsoid when this parse is sampled
 *   (T371713) and the output carries a cache revision id; it also selects
 *   the 'type' label on the emitted metrics.
 * @return ParserOutput
 */
private function genParserOutput(
	PageConfig $pageConfig, ParserOptions $options, ?ParserOutput $previousOutput
): ParserOutput {
	$parserOutput = new ParserOutput();

	// Parsoid itself does not currently vary its output by parser options.
	// Still, any option read by extensions, parser functions, recursive
	// parses, or (in an unlikely future) Parsoid itself must be recorded
	// as used.
	$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

	// The enable/disable rule below mirrors Parser::internalParseHalfParsed(),
	// except that __NOCONTENTCONVERT__ is handled inside Parsoid.
	//
	// T349137: It might be preferable to handle __NOCONTENTCONVERT__ here
	// instead of inspecting the DOM inside Parsoid. That is left for a
	// separate patch.
	$variantLang = null;
	if ( !$options->getDisableContentConversion() && !$options->getInterfaceMessage() ) {
		// NOTES (some of these are TODOs for read views integration)
		// 1. This HTML variant conversion is a pre-cache transform.
		//    HtmlOutputRendererHelper performs another variant conversion as a
		//    post-cache transform, driven by the 'Accept-Language' header. When
		//    that header is set there is no real reason to convert here. So,
		//    eventually, we will likely either stop passing the
		//    htmlVariantLanguage option below, OR disable language conversion in
		//    Parsoid's wt2html path and handle both this and the Accept-Language
		//    conversion as post-cache transforms.
		//
		// 2. Parser.php calls convert(), which derives a preferred variant from
		//    the target language. We cannot do that unconditionally here because
		//    REST API requests name the exact variant via the 'Content-Language'
		//    header.
		//
		//    For Parsoid page views, either callers must compute the preferred
		//    variant and set it in ParserOptions, OR the REST API must set some
		//    other flag saying the preferred variant should not be computed.
		//    Until then a temporary hack is used; it should be replaced with
		//    something more sensible (T267067).
		//
		// 3. Additionally, Parsoid's callers will have to set targetLanguage in
		//    ParserOptions to mimic the logic in Parser.php (missing right now).
		$pageLangCode = $pageConfig->getPageLanguageBcp47();
		// Default: use the page language code as-is.
		$variantLang = $pageLangCode;
		if ( $options->getRenderReason() === 'page-view' ) { // TEMPORARY HACK
			$languageFactory = MediaWikiServices::getInstance()->getLanguageFactory();
			$converter = $this->languageConverterFactory->getLanguageConverter(
				$languageFactory->getLanguage( $pageLangCode )
			);
			$variantLang = $languageFactory->getLanguage( $converter->getPreferredVariant() );
		}
	}

	// T371713: Temporary statistics collection code to determine the
	// feasibility of Parsoid selective update.
	$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ParsoidSelectiveUpdateSampleRate
	);
	// A falsy sample rate disables sampling without consulting the RNG.
	$isSampled = $sampleRate ? ( mt_rand( 1, $sampleRate ) === 1 ) : false;

	$priorPageConfig = null;
	$priorPageBundle = null;
	if ( $isSampled && $previousOutput !== null && $previousOutput->getCacheRevisionId() ) {
		// Allow fetching the old wikitext that corresponds to $previousOutput.
		$priorPageConfig = $this->pageConfigFactory->create(
			Title::newFromLinkTarget( $pageConfig->getLinkTarget() ),
			$options->getUserIdentity(),
			$previousOutput->getCacheRevisionId(),
			null,
			$previousOutput->getLanguage()
		);
		$priorPageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput(
			$previousOutput
		);
	}

	$parsoidOptions = [
		'pageBundle' => true,
		'wrapSections' => true,
		'logLinterData' => true,
		'body_only' => false,
		'htmlVariantLanguage' => $variantLang,
		'offsetType' => 'byte',
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
		'previousOutput' => $priorPageBundle,
		'previousInput' => $priorPageConfig,
		// The following are passed for metrics & labelling
		'sampleStats' => $isSampled,
		'renderReason' => $options->getRenderReason(),
		'userAgent' => RequestContext::getMain()->getRequest()->getHeader( 'User-Agent' ),
	];

	$parserOutput->resetParseStartTime();

	// This call can throw ClientError or ResourceLimitExceededException;
	// dealing with those is left to the callers.
	// $headers is not set before this call and not read afterwards here.
	$pageBundle = $this->parsoid->wikitext2html(
		$pageConfig,
		$parsoidOptions,
		$headers,
		$parserOutput
	);

	$parserOutput = PageBundleParserOutputConverter::parserOutputFromPageBundle( $pageBundle, $parserOutput );

	// Store the page title in dbkey form so that post-cache transforms
	// can get at the title.
	$titleDbKey = Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedDBkey();
	$parserOutput->setExtensionData( self::PARSOID_TITLE_KEY, $titleDbKey );

	// The watcher has to be registered a second time: the $parserOutput
	// passed in above and the one returned are different objects!
	$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

	$parserOutput->setFromParserOptions( $options );

	$parserOutput->recordTimeProfile();
	if (
		MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::EnableParserLimitReporting
		)
	) {
		$this->dataAccess->makeLimitReport( $pageConfig, $options, $parserOutput );
	}

	// T371713: Collect statistics on parsing time -vs- presence of
	// $previousOutput.
	$statsFactory = MediaWikiServices::getInstance()->getStatsFactory();
	$metricLabels = [
		'type' => $previousOutput === null ? 'full' : 'selective',
		'wiki' => WikiMap::getCurrentWikiId(),
		'reason' => $options->getRenderReason() ?: 'unknown',
	];
	$cpuCounter = $statsFactory
		->getCounter( 'Parsoid_parse_cpu_seconds' )
		->setLabels( $metricLabels );
	$cpuCounter->incrementBy( $parserOutput->getTimeProfile( 'cpu' ) );
	$parseCounter = $statsFactory
		->getCounter( 'Parsoid_parse_total' )
		->setLabels( $metricLabels );
	$parseCounter->increment();

	// Add Parsoid skinning module
	$parserOutput->addModuleStyles( [ 'mediawiki.skinning.content.parsoid' ] );

	// Keep the Parsoid version in extension data; together with the
	// onRejectParserCacheValue hook this lets us selectively expire
	// "bad" generated content in the event of a rollback.
	$parserOutput->setExtensionData( 'core:parsoid-version', Parsoid::version() );
	$parserOutput->setExtensionData( 'core:html-version', Parsoid::defaultHTMLVersion() );

	return $parserOutput;
}
/**
 * Convert wikitext to HTML
 * Do not call this function recursively.
 *
 * @param string|TextContent $text Text we want to parse
 * @param-taint $text escapes_htmlnoent
 * @param PageReference $page
 * @param ParserOptions $options
 * @param bool $linestart Must be true; false is not yet supported
 * @param bool $clearState Must be true; false is not yet supported
 * @param int|null $revId ID of the revision being rendered. This is used to render
 *  REVISION* magic words. 0 means that any current revision will be used. Null means
 *  that {{REVISIONID}}/{{REVISIONUSER}} will be empty and {{REVISIONTIMESTAMP}} will
 *  use the current timestamp.
 * @param ?ParserOutput $previousOutput The (optional) result of a
 *  previous parse of this page, which can be used for selective update.
 * @return ParserOutput
 * @return-taint escaped
 * @unstable since 1.41
 */
public function parse(
	$text, PageReference $page, ParserOptions $options,
	bool $linestart = true, bool $clearState = true, ?int $revId = null,
	?ParserOutput $previousOutput = null
): ParserOutput {
	Assert::invariant( $linestart, '$linestart=false is not yet supported' );
	Assert::invariant( $clearState, '$clearState=false is not yet supported' );

	$title = Title::newFromPageReference( $page );

	// Prefer the explicit target language; for interface messages fall
	// back to the user language. A null result makes the page config
	// default to the title's page language.
	$targetLang = $options->getTargetLanguage()
		?? ( $options->getInterfaceMessage() ? $options->getUserLangObj() : null );

	// Only look up a stored revision when a real (non-null, non-zero)
	// revision ID was supplied.
	if ( $revId === null || $revId === 0 ) {
		$pageConfig = null;
	} else {
		$pageConfig = $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$revId,
			null, // unused
			$targetLang // defaults to title page language if null
		);
	}

	// Normalise the input: keep the content object (if one was passed)
	// and always have the raw text available as a string.
	if ( $text instanceof TextContent ) {
		$content = $text;
		$wikitext = $content->getText();
	} else {
		$content = null;
		$wikitext = $text;
	}

	if ( !$pageConfig || $pageConfig->getPageMainContent() !== $wikitext ) {
		// This is a bit awkward! But we really need to parse the text we
		// were given, which may or may not correspond to the $revId
		// provided!
		// T332928 suggests one solution: splitting the "have revid"
		// callers from the "bare text, no associated revision" callers.
		$revisionRecord = new MutableRevisionRecord( $title );
		if ( $revId !== null ) {
			$revisionRecord->setId( $revId );
		}
		$mainSlot = SlotRecord::newUnsaved(
			SlotRecord::MAIN,
			$content ?? new WikitextContent( $wikitext )
		);
		$revisionRecord->setSlot( $mainSlot );
		$pageConfig = $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$revisionRecord,
			null, // unused
			$targetLang // defaults to title page language if null
		);
	}

	return $this->genParserOutput( $pageConfig, $options, $previousOutput );
}
/**
 * @internal
 *
 * Convert custom wikitext (stored in main slot of the $fakeRev arg) to HTML.
 * Callers are expected NOT to stuff the result into ParserCache.
 *
 * @param RevisionRecord $fakeRev Revision to parse
 * @param PageReference $page
 * @param ParserOptions $options
 * @return ParserOutput
 * @unstable since 1.41
 * @deprecated since 1.43
 */
public function parseFakeRevision(
	RevisionRecord $fakeRev, PageReference $page, ParserOptions $options
): ParserOutput {
	wfDeprecated( __METHOD__, '1.43' );

	$title = Title::newFromPageReference( $page );

	// Prefer the explicit target language; for interface messages fall
	// back to the user language. A null result makes the page config
	// default to the title's page language.
	$targetLang = $options->getTargetLanguage()
		?? ( $options->getInterfaceMessage() ? $options->getUserLangObj() : null );

	$fakePageConfig = $this->pageConfigFactory->create(
		$title,
		$options->getUserIdentity(),
		$fakeRev,
		null, // unused
		$targetLang // defaults to title page language if null
	);

	// No previous output is available for a fake revision.
	return $this->genParserOutput( $fakePageConfig, $options, null );
}