Merge "mediawiki.content.json: Remove file and author annotations"
[mediawiki.git] / includes / parser / Parsoid / LanguageVariantConverter.php
blob8a555f17f1eaef8531acddecbf2e6d5cf4b00192
1 <?php
3 namespace MediaWiki\Parser\Parsoid;
5 use MediaWiki\Language\LanguageCode;
6 use MediaWiki\Languages\LanguageConverterFactory;
7 use MediaWiki\Languages\LanguageFactory;
8 use MediaWiki\Page\PageIdentity;
9 use MediaWiki\Parser\ParserOutput;
10 use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
11 use MediaWiki\Rest\HttpException;
12 use MediaWiki\Rest\LocalizedHttpException;
13 use MediaWiki\Revision\RevisionAccessException;
14 use MediaWiki\Title\Title;
15 use MediaWiki\Title\TitleFactory;
16 use Wikimedia\Bcp47Code\Bcp47Code;
17 use Wikimedia\Bcp47Code\Bcp47CodeValue;
18 use Wikimedia\Message\MessageValue;
19 use Wikimedia\Parsoid\Config\PageConfig;
20 use Wikimedia\Parsoid\Config\SiteConfig;
21 use Wikimedia\Parsoid\Core\PageBundle;
22 use Wikimedia\Parsoid\DOM\Element;
23 use Wikimedia\Parsoid\Parsoid;
24 use Wikimedia\Parsoid\Utils\DOMCompat;
25 use Wikimedia\Parsoid\Utils\DOMUtils;
27 /**
28 * @since 1.40
29 * @unstable should be marked stable before 1.40 release
31 class LanguageVariantConverter {
32 private PageConfigFactory $pageConfigFactory;
33 private ?PageConfig $pageConfig = null;
34 private PageIdentity $pageIdentity;
35 private Title $pageTitle;
36 private Parsoid $parsoid;
37 private SiteConfig $siteConfig;
38 private LanguageConverterFactory $languageConverterFactory;
39 private LanguageFactory $languageFactory;
40 /**
41 * Page language override from the Content-Language header.
43 private ?Bcp47Code $pageLanguageOverride = null;
44 private bool $isFallbackLanguageConverterEnabled = true;
/**
 * @param PageIdentity $pageIdentity Page whose content is being converted.
 * @param PageConfigFactory $pageConfigFactory Used to build a PageConfig
 *   when none has been supplied through setPageConfig().
 * @param Parsoid $parsoid Performs the variant conversion when it
 *   implements conversion for the requested target.
 * @param SiteConfig $siteConfig Consulted to find out whether language
 *   conversion is enabled for the page language.
 * @param TitleFactory $titleFactory Used once, to derive a Title from
 *   $pageIdentity.
 * @param LanguageConverterFactory $languageConverterFactory Source of core
 *   LanguageConverter instances.
 * @param LanguageFactory $languageFactory Used to resolve parent languages
 *   and internal language codes.
 */
public function __construct(
	PageIdentity $pageIdentity,
	PageConfigFactory $pageConfigFactory,
	Parsoid $parsoid,
	SiteConfig $siteConfig,
	TitleFactory $titleFactory,
	LanguageConverterFactory $languageConverterFactory,
	LanguageFactory $languageFactory
) {
	// The page being converted, plus the Title derived from it.
	$this->pageIdentity = $pageIdentity;
	$this->pageTitle = $titleFactory->newFromPageIdentity( $pageIdentity );

	// Parsoid-side collaborators.
	$this->pageConfigFactory = $pageConfigFactory;
	$this->parsoid = $parsoid;
	$this->siteConfig = $siteConfig;

	// Core language services.
	$this->languageConverterFactory = $languageConverterFactory;
	$this->languageFactory = $languageFactory;
}
/**
 * Supply the PageConfig to use for language variant conversion.
 *
 * When no PageConfig has been supplied, one is created on demand the
 * first time a conversion needs it.
 *
 * @param PageConfig $pageConfig
 * @return void
 */
public function setPageConfig( PageConfig $pageConfig ) {
	$this->pageConfig = $pageConfig;
}
/**
 * Override the page content language.
 *
 * Once set, this value takes precedence over every other source when
 * the page language is determined.
 *
 * @param Bcp47Code $language
 * @return void
 */
public function setPageLanguageOverride( Bcp47Code $language ) {
	$this->pageLanguageOverride = $language;
}
/**
 * Perform variant conversion on a PageBundle object.
 *
 * Parsoid does the conversion when it implements language conversion for
 * the target variant. Otherwise the core LanguageConverter is used, unless
 * that fallback has been turned off with disableFallbackLanguageConverter().
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return PageBundle The converted PageBundle, or the object passed in as
 *   $pageBundle if the conversion is not supported.
 * @throws HttpException
 */
public function convertPageBundleVariant(
	PageBundle $pageBundle,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): PageBundle {
	[ $pageLanguage, $sourceVariant ] =
		$this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

	// A language without variant support: hand the content back untouched.
	$variantsSupported = $this->siteConfig->langConverterEnabledBcp47( $pageLanguage );
	if ( !$variantsSupported ) {
		return $pageBundle;
	}

	$pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

	// Preferred path: let Parsoid convert the bundle.
	if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
		$variantOptions = [
			'source' => $sourceVariant,
			'target' => $targetVariant,
		];
		return $this->parsoid->pb2pb(
			$pageConfig, 'variant', $pageBundle,
			[ 'variant' => $variantOptions ]
		);
	}

	// Fallback conversion has been disabled: return the page bundle as is.
	if ( !$this->isFallbackLanguageConverterEnabled ) {
		return $pageBundle;
	}

	// LanguageConverter::hasVariant and LanguageConverter::convertTo
	// could take a string|Bcp47Code in the future, which would make the
	// translation to an internal code below unnecessary.
	$parentLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
	$coreConverter = $this->languageConverterFactory->getLanguageConverter( $parentLanguage );
	$internalTargetCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();

	if ( $coreConverter->hasVariant( $internalTargetCode ) ) {
		// NOTE: convertTo() rather than convert(): the exact variant is
		// already known, so no preferred variant has to be computed from
		// a base language. See T267067 for why convert() should be avoided.
		$html = $coreConverter->convertTo( $pageBundle->html, $internalTargetCode );
		$resultLanguage = $targetVariant;
	} else {
		// No conversion possible: original HTML, original language.
		$html = $pageBundle->html;
		$resultLanguage = $pageConfig->getPageLanguageBcp47();
	}

	// Leave a marker identifying what performed the variant conversion.
	$html = '<!-- Variant conversion performed using the core LanguageConverter -->' . $html;

	// NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo.
	// Add the meta information that Parsoid normally adds.
	$headers = [
		'content-language' => $resultLanguage->toBcp47Code(),
		'vary' => [ 'Accept', 'Accept-Language' ]
	];
	$doc = DOMUtils::parseHTML( '' );
	$headElement = $doc->createElement( 'head' );
	$doc->appendChild( $headElement );
	DOMUtils::addHttpEquivHeaders( $doc, $headers );
	$docElt = $doc->documentElement;
	'@phan-var Element $docElt';
	$serializedDoc = DOMCompat::getOuterHTML( $docElt );

	// A closing </body> tag is appended to the HTML and then substituted
	// by the serialized document element carrying the headers.
	$html = preg_replace( "#</body>#", $serializedDoc, $html . '</body>' );

	return new PageBundle(
		$html, [], [], $pageBundle->version, $headers
	);
}
/**
 * Perform variant conversion on a ParserOutput object.
 *
 * The ParserOutput is turned into a PageBundle, converted with
 * convertPageBundleVariant(), and turned back into a ParserOutput.
 *
 * @param ParserOutput $parserOutput
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return ParserOutput
 */
public function convertParserOutputVariant(
	ParserOutput $parserOutput,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): ParserOutput {
	return PageBundleParserOutputConverter::parserOutputFromPageBundle(
		$this->convertPageBundleVariant(
			PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput ),
			$targetVariant,
			$sourceVariant
		),
		$parserOutput
	);
}
/**
 * Turn off the fallback to the core LanguageConverter.
 *
 * After this call, content that Parsoid cannot convert to the requested
 * variant is returned unchanged by convertPageBundleVariant().
 */
public function disableFallbackLanguageConverter(): void {
	$this->isFallbackLanguageConverterEnabled = false;
}
/**
 * Return the PageConfig to convert with, creating it on first use.
 *
 * A PageConfig that is already present (supplied through setPageConfig()
 * or created by an earlier call) is returned as is; $pageLanguage and
 * $sourceVariant are only applied when a new PageConfig is created.
 *
 * @param Bcp47Code $pageLanguage Page language for a newly created PageConfig.
 * @param ?Bcp47Code $sourceVariant Variant to set on a newly created
 *   PageConfig, if any.
 *
 * @return PageConfig
 * @throws LocalizedHttpException with status 404 if creating the PageConfig
 *   raises a RevisionAccessException.
 */
private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
	if ( !$this->pageConfig ) {
		try {
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity,
				null,
				null,
				null,
				$pageLanguage
			);

			if ( $sourceVariant !== null ) {
				$this->pageConfig->setVariantBcp47( $sourceVariant );
			}
		} catch ( RevisionAccessException $e ) {
			// TODO: Throw a different exception, this class should not know
			// about HTTP status codes.
			throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
		}
	}

	return $this->pageConfig;
}
/**
 * Try to determine the page's language code, in this order of precedence:
 *
 * 1. The value set by calling ::setPageLanguageOverride(); this would
 *    have come from a Content-Language header.
 * 2. The content-language header in $pageBundle, which should be
 *    equivalent. These first two are used when the title/article doesn't
 *    (yet) exist.
 * 3. $default, if given; this allows additional parameters of the request
 *    to be used as fallbacks.
 * 4. $this->pageConfig->getPageLanguageBcp47(), if a PageConfig is set.
 * 5. $this->pageTitle->getPageLanguage().
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code|null $default A default language, used after
 *   Content-Language but before PageConfig/Title lookup.
 *
 * @return Bcp47Code the page language; may be a variant.
 */
private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
	// An explicit override always wins.
	if ( $this->pageLanguageOverride !== null ) {
		return $this->pageLanguageOverride;
	}

	// Next, a language code carried by the page bundle itself. The HTTP
	// header holds a BCP-47 language code, not a mediawiki-internal one.
	$headerLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
	if ( $headerLanguage ) {
		return new Bcp47CodeValue( $headerLanguage );
	}

	// NOTE: The explicit default has to be tried *before* the PageConfig,
	// because the PageConfig falls back to Title::getPageLanguage(). If the
	// PageConfig were asked first, $default would never be used.
	if ( $default !== null ) {
		return $default;
	}

	// Last resorts: the PageConfig if there is one, otherwise the code
	// associated with the title, which may come from the database or be
	// determined from the title itself.
	return $this->pageConfig !== null
		? $this->pageConfig->getPageLanguageBcp47()
		: $this->pageTitle->getPageLanguage();
}
/**
 * Determine the codes of the base language and the source variant.
 *
 * The base language will be used to find the appropriate LanguageConverter.
 * It should never be a variant.
 *
 * The source variant will be used to instruct the LanguageConverter.
 * It should always be a variant (or null to trigger auto-detection of
 * the source variant).
 *
 * If no parent language can be determined for the page language, the
 * source variant cannot be validated and null (auto-detection) is returned
 * for it.
 *
 * @param PageBundle $pageBundle
 * @param ?Bcp47Code $sourceLanguage
 *
 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
 */
private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
	// Try to determine the language code associated with the content of the page.
	// The result may be a variant code.
	$baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

	// To find out if $baseLanguage is actually a variant, get the parent language and compare.
	// NOTE: the parent language may be null, so every use below is guarded.
	$parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

	// If $parentLang is not the same language as $baseLanguage, this means that
	// $baseLanguage is a variant. In that case, set $sourceLanguage to that
	// variant (unless $sourceLanguage is already set), and set $baseLanguage
	// to the $parentLang
	if ( $parentLang && strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
		if ( !$sourceLanguage ) {
			$sourceLanguage = $baseLanguage;
		}
		$baseLanguage = $parentLang;
	}

	if ( $sourceLanguage !== null ) {
		// Without a parent language there is nothing to validate the source
		// variant against; treat it as "not a variant" instead of
		// dereferencing null.
		$sourceIsVariant = false;

		if ( $parentLang ) {
			$parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
			$sourceIsVariant = (
				strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
				$parentConverter->hasVariant(
					LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
				)
			);
		}

		// If the source variant isn't actually a variant, trigger auto-detection
		if ( !$sourceIsVariant ) {
			$sourceLanguage = null;
		}
	}

	return [ $baseLanguage, $sourceLanguage ];
}