Localisation updates from https://translatewiki.net.
[mediawiki.git] / includes / tidy / RemexCompatFormatter.php
blob54f6d2890b83c282ba97c948b1c04cffbc92d9bf
1 <?php
3 namespace MediaWiki\Tidy;
5 use MediaWiki\Parser\Sanitizer;
6 use Wikimedia\RemexHtml\HTMLData;
7 use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
8 use Wikimedia\RemexHtml\Serializer\SerializerNode;
/**
 * Serializer formatter that extends RemexHtml's HtmlFormatter with
 * MediaWiki-specific output rules: it unwraps or emits p-wrappers, marks
 * certain empty elements with the "mw-empty-elt" class, escapes non-breaking
 * spaces and normalizes character references in text and attribute values.
 *
 * @internal
 *
 * WATCH OUT! Unlike normal HtmlFormatter, this class requires the 'ignoreCharRefs' option
 * in Tokenizer to be used. If that option is not used, it will produce wrong results (T354361).
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Element names that get class="mw-empty-elt" when they are serialized
	 * without attributes and with whitespace-only contents.
	 */
	private const MARKED_EMPTY_ELEMENTS = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/** @var ?callable */
	private $textProcessor;

	/**
	 * @param array $options Passed through to the parent constructor. The
	 *   optional 'textProcessor' entry is a callable that receives a text
	 *   string and returns the replacement string; it is applied by
	 *   characters() to text outside of raw text elements.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );
		// Escape non-breaking space
		$this->attributeEscapes["\u{00A0}"] = '&#160;';
		$this->textEscapes["\u{00A0}"] = '&#160;';
		// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
		unset( $this->attributeEscapes["&"] );
		unset( $this->textEscapes["&"] );
		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Returns an empty string, so nothing is emitted at the start of the document.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains
	 * unexpanded character references (entities), as a result of using the 'ignoreCharRefs' option
	 * in Tokenizer. If that option is not used, this method will produce wrong results (T354361).
	 *
	 * @inheritDoc
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = parent::characters( $parent, $text, $start, $length );

		// The text processor is skipped only for raw text elements in the
		// HTML namespace; everything else is passed through it.
		if ( $parent->namespace !== HTMLData::NS_HTML
			|| !isset( $this->rawTextElements[$parent->name] )
		) {
			if ( $this->textProcessor !== null ) {
				$text = ( $this->textProcessor )( $text );
			}
		}

		// Ensure a consistent representation for all entities
		$text = Sanitizer::normalizeCharReferences( $text );
		return $text;
	}

	/**
	 * Serialize a single element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent Parent of the node being serialized (unused here)
	 * @param SerializerNode $node The node being serialized
	 * @param string $contents Serialized contents of the node
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$data = $node->snData;
		if ( $data && $data->isPWrapper ) {
			// A p-wrapper only becomes a real <p> when it holds something
			// non-blank; otherwise its contents are emitted bare.
			if ( $data->nonblankNodeCount ) {
				return "<p>$contents</p>";
			} else {
				return $contents;
			}
		}

		$name = $node->name;
		$attrs = $node->attrs;
		// Attribute-less elements from MARKED_EMPTY_ELEMENTS whose contents
		// consist solely of whitespace are tagged with a marker class.
		if ( isset( self::MARKED_EMPTY_ELEMENTS[$name] ) && $attrs->count() === 0
			&& strspn( $contents, "\t\n\f\r " ) === strlen( $contents )
		) {
			return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
		}

		$s = "<$name";
		foreach ( $attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			$encValue = Sanitizer::normalizeCharReferences( $encValue );
			$s .= " $attrName=\"$encValue\"";
		}
		// HTML void elements are written in self-closing form and have no end tag.
		if ( $node->namespace === HTMLData::NS_HTML && isset( $this->voidElements[$name] ) ) {
			$s .= ' />';
			return $s;
		}

		$s .= '>';
		// For elements listed in prefixLfElements, a leading newline in the
		// contents is doubled. NOTE(review): presumably because an HTML parser
		// drops the first newline after such a start tag - confirm against
		// the parent HtmlFormatter.
		if ( $node->namespace === HTMLData::NS_HTML
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$name] )
		) {
			$s .= "\n$contents</$name>";
		} else {
			$s .= "$contents</$name>";
		}
		return $s;
	}
}