3 namespace MediaWiki\Tidy
;
5 use MediaWiki\Parser\Sanitizer
;
6 use Wikimedia\RemexHtml\HTMLData
;
7 use Wikimedia\RemexHtml\Serializer\HtmlFormatter
;
8 use Wikimedia\RemexHtml\Serializer\SerializerNode
;
/**
 * WATCH OUT! Unlike the normal HtmlFormatter, this class requires the Tokenizer to be run
 * with the 'ignoreCharRefs' option. Without that option it produces wrong results (T354361).
 */
16 class RemexCompatFormatter
extends HtmlFormatter
{
17 private const MARKED_EMPTY_ELEMENTS
= [
24 private $textProcessor;
/**
 * Set up the escape tables for serialization of input tokenized with 'ignoreCharRefs'.
 *
 * @param array $options Options forwarded unchanged to the parent formatter.
 *   The optional 'textProcessor' entry is stored on the instance and applied
 *   to text by characters(); it defaults to null when absent.
 */
public function __construct( $options = [] ) {
	parent::__construct( $options );

	// Escape non-breaking space as a numeric character reference. The
	// replacement has to be the reference itself: substituting a literal
	// whitespace character would either be a no-op or silently turn NBSP
	// into an ordinary space.
	$this->attributeEscapes["\u{00A0}"] = '&#160;';
	$this->textEscapes["\u{00A0}"] = '&#160;';

	// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
	unset( $this->attributeEscapes['&'], $this->textEscapes['&'] );

	$this->textProcessor = $options['textProcessor'] ?? null;
}
37 public function startDocument( $fragmentNamespace, $fragmentName ) {
/**
 * Serialize a run of character data.
 *
 * WATCH OUT! Unlike the normal HtmlFormatter, this method expects the $text argument to
 * contain unexpanded character references (entities), as produced by the Tokenizer's
 * 'ignoreCharRefs' option. Without that option it produces wrong results (T354361).
 *
 * @param SerializerNode $parent Node that contains the text
 * @param string $text Forwarded to the parent implementation
 * @param int $start Forwarded to the parent implementation
 * @param int $length Forwarded to the parent implementation
 * @return string Serialized text with character references normalized
 */
public function characters( SerializerNode $parent, $text, $start, $length ) {
	$text = parent::characters( $parent, $text, $start, $length );

	// The text processor is skipped for raw-text elements in the HTML namespace
	$isRawText = $parent->namespace === HTMLData::NS_HTML
		&& isset( $this->rawTextElements[$parent->name] );
	if ( !$isRawText && $this->textProcessor !== null ) {
		$text = ( $this->textProcessor )( $text );
	}

	// Ensure a consistent representation for all entities
	return Sanitizer::normalizeCharReferences( $text );
}
64 public function element( SerializerNode
$parent, SerializerNode
$node, $contents ) {
65 $data = $node->snData
;
66 if ( $data && $data->isPWrapper
) {
67 if ( $data->nonblankNodeCount
) {
68 return "<p>$contents</p>";
75 $attrs = $node->attrs
;
76 if ( isset( self
::MARKED_EMPTY_ELEMENTS
[$name] ) && $attrs->count() === 0
77 && strspn( $contents, "\t\n\f\r " ) === strlen( $contents )
79 return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
83 foreach ( $attrs->getValues() as $attrName => $attrValue ) {
84 $encValue = strtr( $attrValue, $this->attributeEscapes
);
85 $encValue = Sanitizer
::normalizeCharReferences( $encValue );
86 $s .= " $attrName=\"$encValue\"";
88 if ( $node->namespace === HTMLData
::NS_HTML
&& isset( $this->voidElements
[$name] ) ) {
94 if ( $node->namespace === HTMLData
::NS_HTML
95 && isset( $contents[0] ) && $contents[0] === "\n"
96 && isset( $this->prefixLfElements
[$name] )
98 $s .= "\n$contents</$name>";
100 $s .= "$contents</$name>";