3 namespace MediaWiki\Parser\Parsoid
;
5 use MediaWiki\Language\LanguageCode
;
6 use MediaWiki\Languages\LanguageConverterFactory
;
7 use MediaWiki\Languages\LanguageFactory
;
8 use MediaWiki\Page\PageIdentity
;
9 use MediaWiki\Parser\ParserOutput
;
10 use MediaWiki\Parser\Parsoid\Config\PageConfigFactory
;
11 use MediaWiki\Rest\HttpException
;
12 use MediaWiki\Rest\LocalizedHttpException
;
13 use MediaWiki\Revision\RevisionAccessException
;
14 use MediaWiki\Title\Title
;
15 use MediaWiki\Title\TitleFactory
;
16 use Wikimedia\Bcp47Code\Bcp47Code
;
17 use Wikimedia\Bcp47Code\Bcp47CodeValue
;
18 use Wikimedia\Message\MessageValue
;
19 use Wikimedia\Parsoid\Config\PageConfig
;
20 use Wikimedia\Parsoid\Config\SiteConfig
;
21 use Wikimedia\Parsoid\Core\PageBundle
;
22 use Wikimedia\Parsoid\DOM\Element
;
23 use Wikimedia\Parsoid\Parsoid
;
24 use Wikimedia\Parsoid\Utils\DOMCompat
;
25 use Wikimedia\Parsoid\Utils\DOMUtils
;
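/**
 * Performs language variant conversion on Parsoid page bundles and parser output,
 * using Parsoid when it implements conversion for the target variant and the core
 * LanguageConverter as a fallback otherwise.
 *
 * @unstable should be marked stable before 1.40 release
 */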
31 class LanguageVariantConverter
{
/** Factory used to build a PageConfig when none was supplied via setPageConfig(). */
private PageConfigFactory $pageConfigFactory;

/** PageConfig used for conversion; null until it is set or lazily created. */
private ?PageConfig $pageConfig = null;

/** Identity of the page this converter was constructed for. */
private PageIdentity $pageIdentity;

/** Title object derived from $pageIdentity in the constructor. */
private Title $pageTitle;

/** Parsoid instance that is asked first to perform the variant conversion. */
private Parsoid $parsoid;

/** Site configuration, consulted to check whether a language has variant conversion enabled. */
private SiteConfig $siteConfig;

/** Source of core LanguageConverter instances. */
private LanguageConverterFactory $languageConverterFactory;

/** Used to look up Language objects and their parent languages. */
private LanguageFactory $languageFactory;

/**
 * Page language override from the Content-Language header.
 */
private ?Bcp47Code $pageLanguageOverride = null;

/** Whether the core LanguageConverter may be used when Parsoid cannot do the conversion. */
private bool $isFallbackLanguageConverterEnabled = true;
/**
 * @param PageIdentity $pageIdentity Page whose content will be converted.
 * @param PageConfigFactory $pageConfigFactory Creates a PageConfig when none is injected.
 * @param Parsoid $parsoid Parsoid instance asked to perform the conversion.
 * @param SiteConfig $siteConfig Tells whether variant conversion is enabled for a language.
 * @param TitleFactory $titleFactory Used once, to derive the Title of $pageIdentity.
 * @param LanguageConverterFactory $languageConverterFactory Source of core LanguageConverters.
 * @param LanguageFactory $languageFactory Resolves languages and their parent languages.
 */
public function __construct(
	PageIdentity $pageIdentity,
	PageConfigFactory $pageConfigFactory,
	Parsoid $parsoid,
	SiteConfig $siteConfig,
	TitleFactory $titleFactory,
	LanguageConverterFactory $languageConverterFactory,
	LanguageFactory $languageFactory
) {
	$this->pageIdentity = $pageIdentity;
	$this->pageConfigFactory = $pageConfigFactory;
	$this->parsoid = $parsoid;
	$this->siteConfig = $siteConfig;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->languageFactory = $languageFactory;

	// The Title is resolved eagerly so later page-language lookups do not need the factory.
	$this->pageTitle = $titleFactory->newFromPageIdentity( $pageIdentity );
}
/**
 * Inject the PageConfig to use for language variant conversion.
 *
 * When this is never called, a PageConfig is created on first use.
 *
 * @param PageConfig $pageConfig
 *
 * @return void
 */
public function setPageConfig( PageConfig $pageConfig ) {
	$this->pageConfig = $pageConfig;
}
/**
 * Override the page content language.
 *
 * Once set, this value takes precedence over every other way of
 * determining the page language.
 *
 * @param Bcp47Code $language
 *
 * @return void
 */
public function setPageLanguageOverride( Bcp47Code $language ) {
	$this->pageLanguageOverride = $language;
}
/**
 * Perform variant conversion on a PageBundle object.
 *
 * Parsoid does the conversion when it implements language conversion for the
 * target variant. Otherwise the core LanguageConverter is used, unless that
 * fallback has been turned off with disableFallbackLanguageConverter().
 *
 * @param PageBundle $pageBundle Bundle whose HTML should be converted.
 * @param Bcp47Code $targetVariant Variant to convert to.
 * @param ?Bcp47Code $sourceVariant Variant of the input, or null for auto-detection.
 *
 * @return PageBundle The converted PageBundle, or the object passed in as
 * $pageBundle if the conversion is not supported.
 * @throws HttpException
 */
public function convertPageBundleVariant(
	PageBundle $pageBundle,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): PageBundle {
	[ $pageLanguage, $sourceVariant ] =
		$this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

	// If the language doesn't support variants, just return the content unmodified.
	if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
		return $pageBundle;
	}

	$pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

	// Preferred path: let Parsoid do the conversion when it supports the target variant.
	if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
		$variantOptions = [
			'variant' => [
				'source' => $sourceVariant,
				'target' => $targetVariant,
			]
		];
		return $this->parsoid->pb2pb( $pageConfig, 'variant', $pageBundle, $variantOptions );
	}

	// Fallback variant conversion is not enabled, return the page bundle as is.
	if ( !$this->isFallbackLanguageConverterEnabled ) {
		return $pageBundle;
	}

	// LanguageConverter::hasVariant and LanguageConverter::convertTo
	// could take a string|Bcp47Code in the future, which would
	// allow us to avoid the $targetVariantCode conversion here.
	$baseLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
	$languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
	$targetVariantCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();

	if ( $languageConverter->hasVariant( $targetVariantCode ) ) {
		// NOTE: This is not a convert() because we have the exact desired variant
		// and don't need to compute a preferred variant based on a base language.
		// Also see T267067 for why convert() should be avoided.
		$convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );
		$pageVariant = $targetVariant;
	} else {
		// No conversion possible - pass through original HTML in original language
		$convertedHtml = $pageBundle->html;
		$pageVariant = $pageConfig->getPageLanguageBcp47();
	}

	// Add a note so that we can identify what was used to perform the variant conversion
	$msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
	$convertedHtml = $msg . $convertedHtml;

	// NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo
	// Add meta information that Parsoid normally adds
	$headers = [
		'content-language' => $pageVariant->toBcp47Code(),
		'vary' => [ 'Accept', 'Accept-Language' ]
	];
	$doc = DOMUtils::parseHTML( '' );
	$doc->appendChild( $doc->createElement( 'head' ) );
	DOMUtils::addHttpEquivHeaders( $doc, $headers );
	$docElt = $doc->documentElement;
	'@phan-var Element $docElt';
	$docHtml = DOMCompat::getOuterHTML( $docElt );

	// Appending "</body>" and then replacing it places $docHtml after the converted HTML.
	$convertedHtml = preg_replace( "#</body>#", $docHtml, "$convertedHtml</body>" );

	return new PageBundle(
		$convertedHtml, [], [], $pageBundle->version, $headers
	);
}
/**
 * Perform variant conversion on a ParserOutput object.
 *
 * The ParserOutput is turned into a PageBundle, converted with
 * convertPageBundleVariant(), and the result is turned back into a
 * ParserOutput with the original object passed along to the converter.
 *
 * @param ParserOutput $parserOutput
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return ParserOutput
 */
public function convertParserOutputVariant(
	ParserOutput $parserOutput,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): ParserOutput {
	return PageBundleParserOutputConverter::parserOutputFromPageBundle(
		$this->convertPageBundleVariant(
			PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput ),
			$targetVariant,
			$sourceVariant
		),
		$parserOutput
	);
}
/**
 * Turn off the fallback to the core LanguageConverter.
 *
 * After this call, convertPageBundleVariant() returns its input unchanged
 * whenever Parsoid does not implement conversion for the target variant.
 */
public function disableFallbackLanguageConverter(): void {
	$this->isFallbackLanguageConverterEnabled = false;
}
/**
 * Return the PageConfig to use for conversion, creating and caching it on first use.
 *
 * A PageConfig injected through setPageConfig() (or created by an earlier call)
 * is returned as is; the arguments are only used when a new one has to be created.
 *
 * @param Bcp47Code $pageLanguage Page language for a newly created PageConfig.
 * @param ?Bcp47Code $sourceVariant Variant to set on a newly created PageConfig, if any.
 *
 * @return PageConfig
 * @throws LocalizedHttpException With status 404 if the revision cannot be accessed.
 */
private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
	if ( $this->pageConfig === null ) {
		try {
			// NOTE(review): the argument list was not visible in the reviewed excerpt.
			// It is assumed to be ( page, user, revision, unused, page language override );
			// confirm against PageConfigFactory::create().
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity,
				null,
				null,
				null,
				$pageLanguage
			);

			if ( $sourceVariant ) {
				$this->pageConfig->setVariantBcp47( $sourceVariant );
			}
		} catch ( RevisionAccessException $exception ) {
			// TODO: Throw a different exception, this class should not know
			//       about HTTP status codes.
			throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
		}
	}

	return $this->pageConfig;
}
/**
 * Try to determine the page's language code as follows:
 *
 * 1. Use the value set by calling ::setPageLanguageOverride(), if any;
 *    this would have come from a Content-Language header.
 * 2. Otherwise use the content-language header in $pageBundle, if present.
 * 3. Otherwise use $default, if given.
 * 4. Otherwise, if a PageConfig is available in $this->pageConfig, use
 *    $this->pageConfig->getPageLanguageBcp47().
 * 5. Finally, fall back to $this->pageTitle->getPageLanguage().
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code|null $default A default language, used after
 * Content-Language but before PageConfig/Title lookup.
 *
 * @return Bcp47Code the page language; may be a variant.
 */
private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
	// If a language was set by calling setPageLanguageOverride(), always use it!
	if ( $this->pageLanguageOverride ) {
		return $this->pageLanguageOverride;
	}

	// If the page bundle contains a language code, use that.
	$headerLanguage = $pageBundle->headers['content-language'] ?? null;
	if ( $headerLanguage ) {
		// The HTTP header will contain a BCP-47 language code, not a
		// mediawiki-internal one.
		return new Bcp47CodeValue( $headerLanguage );
	}

	// NOTE: Use the explicit default *before* we try PageConfig, because
	// PageConfig::getPageLanguage() falls back to Title::getPageLanguage().
	// If we did that first, $default would never be used.
	if ( $default ) {
		return $default;
	}

	// If we have a PageConfig, ask it for the page's language. This falls back to
	// Title::getPageLanguage(), so it has to come after everything above.
	if ( $this->pageConfig ) {
		return $this->pageConfig->getPageLanguageBcp47();
	}

	// Finally, just go by the code associated with the title. This may come from
	// the database or it may be determined based on the title itself.
	return $this->pageTitle->getPageLanguage();
}
/**
 * Determine the codes of the base language and the source variant.
 *
 * The base language will be used to find the appropriate LanguageConverter.
 * It should never be a variant.
 *
 * The source variant will be used to instruct the LanguageConverter.
 * It should always be a variant (or null to trigger auto-detection of
 * the source variant).
 *
 * When the page language has no parent language, the source language cannot be
 * one of its variants, so null is returned for it.
 *
 * @param PageBundle $pageBundle
 * @param ?Bcp47Code $sourceLanguage
 *
 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
 */
private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
	// Try to determine the language code associated with the content of the page.
	// The result may be a variant code.
	$baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

	// To find out if $baseLanguage is actually a variant, get the parent language and compare.
	$parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

	// If $parentLang is not the same language as $baseLanguage, this means that
	// $baseLanguage is a variant. In that case, set $sourceLanguage to that
	// variant (unless $sourceLanguage is already set), and set $baseLanguage
	// to the $parentLang
	if ( $parentLang && strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
		if ( !$sourceLanguage ) {
			$sourceLanguage = $baseLanguage;
		}
		$baseLanguage = $parentLang;
	}

	if ( $sourceLanguage !== null ) {
		// Without a parent language there is nothing $sourceLanguage could be a
		// variant of; this also avoids dereferencing a null $parentLang below.
		$sourceIsVariant = false;

		if ( $parentLang ) {
			$parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
			$sourceIsVariant = (
				strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
				$parentConverter->hasVariant(
					LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
				)
			);
		}

		// If the source variant isn't actually a variant, trigger auto-detection
		if ( !$sourceIsVariant ) {
			$sourceLanguage = null;
		}
	}

	return [ $baseLanguage, $sourceLanguage ];
}