Localisation updates from https://translatewiki.net.
[mediawiki.git] / includes / tidy / RemexCompatMunger.php
blobe100072b354788549bdb6b04983ed4e06641e926
1 <?php
3 namespace MediaWiki\Tidy;
5 use InvalidArgumentException;
6 use Wikimedia\RemexHtml\HTMLData;
7 use Wikimedia\RemexHtml\Serializer\Serializer;
8 use Wikimedia\RemexHtml\Serializer\SerializerNode;
9 use Wikimedia\RemexHtml\Tokenizer\Attributes;
10 use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
11 use Wikimedia\RemexHtml\TreeBuilder\Element;
12 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
13 use Wikimedia\RemexHtml\TreeBuilder\TreeHandler;
15 /**
16 * @internal
18 class RemexCompatMunger implements TreeHandler {
/**
 * Map of element names that insertElement() treats as inline.
 * Only the keys matter; lookups are done with isset().
 */
private const ONLY_INLINE_ELEMENTS = [
	'a' => true,
	'abbr' => true,
	'acronym' => true,
	'applet' => true,
	'b' => true,
	'basefont' => true,
	'bdo' => true,
	'big' => true,
	'br' => true,
	'button' => true,
	'cite' => true,
	'code' => true,
	'del' => true,
	'dfn' => true,
	'em' => true,
	'font' => true,
	'i' => true,
	'iframe' => true,
	'img' => true,
	'input' => true,
	'ins' => true,
	'kbd' => true,
	'label' => true,
	'legend' => true,
	'map' => true,
	'object' => true,
	'param' => true,
	'q' => true,
	'rb' => true,
	'rbc' => true,
	'rp' => true,
	'rt' => true,
	'rtc' => true,
	'ruby' => true,
	's' => true,
	'samp' => true,
	'select' => true,
	'small' => true,
	'span' => true,
	'strike' => true,
	'strong' => true,
	'sub' => true,
	'sup' => true,
	'textarea' => true,
	'tt' => true,
	'u' => true,
	'var' => true,
	// Those defined in tidy.conf
	'video' => true,
	'audio' => true,
	'bdi' => true,
	'data' => true,
	'time' => true,
	'mark' => true,
];
/**
 * "Metadata" elements, for the purposes of this class, are elements that
 * neither start a p-wrapper nor end an enclosing one. They are typically
 * invisible in a browser's rendering. The list is not exhaustive; it only
 * covers the tags likely to be encountered in practice.
 */
private const METADATA_ELEMENTS = [
	'style' => true,
	'script' => true,
	'link' => true,
	// The TableOfContentsMarker is the exception among <meta> elements:
	// it should break a paragraph (see ::isTableOfContentsMarker() and
	// Parser::TOC_PLACEHOLDER).
	'meta' => true,
];
/**
 * Element names that insertElement() marks as splittable when they are
 * inserted under a p-wrapper or under another splittable element.
 */
private const FORMATTING_ELEMENTS = [
	'a' => true,
	'b' => true,
	'big' => true,
	'code' => true,
	'em' => true,
	'font' => true,
	'i' => true,
	'nobr' => true,
	's' => true,
	'small' => true,
	'strike' => true,
	'strong' => true,
	'tt' => true,
	'u' => true,
];
/** @var Serializer Downstream handler that receives every (possibly rewritten) event */
private $serializer;

/** @var bool Whether trace() forwards its messages to wfDebug() */
private $trace;

/**
 * @param Serializer $serializer Serializer to which all tree events are forwarded
 * @param bool $trace Pass true to emit debug messages for each insertion decision
 */
public function __construct( Serializer $serializer, $trace = false ) {
	$this->trace = $trace;
	$this->serializer = $serializer;
}
/**
 * Start the document downstream and flag the serializer's root node as
 * a container whose inline content needs p-wrapping.
 *
 * @param string|null $fragmentNamespace
 * @param string|null $fragmentName
 */
public function startDocument( $fragmentNamespace, $fragmentName ) {
	$this->serializer->startDocument( $fragmentNamespace, $fragmentName );

	$rootData = new RemexMungerData;
	$rootData->needsPWrapping = true;
	$this->serializer->getRootNode()->snData = $rootData;
}
/**
 * Forward the end-of-document event to the serializer unchanged.
 *
 * @param int $pos
 */
public function endDocument( $pos ) {
	$this->serializer->endDocument( $pos );
}
/**
 * Work out where an insertion should really go.
 *
 * For TreeBuilder::ROOT the parent is the serializer's root node and there
 * is no reference node. For TreeBuilder::BEFORE the parent is the parent of
 * the reference node. Otherwise the reference element itself is the
 * parent, unless it has been cloned (follow the clone chain to its end) or
 * it has an open p-wrapper child (insert under that instead).
 *
 * @param int $preposition
 * @param Element|SerializerNode|null $refElement
 * @return array A two-element list: [ parent node, reference node or null ]
 */
private function getParentForInsert( $preposition, $refElement ) {
	if ( $preposition === TreeBuilder::ROOT ) {
		return [ $this->serializer->getRootNode(), null ];
	}

	$node = $refElement->userData;
	if ( $preposition === TreeBuilder::BEFORE ) {
		return [ $this->serializer->getParentNode( $node ), $node ];
	}

	$data = $node->snData;
	if ( $data->currentCloneElement ) {
		// Walk the clone links until we reach an element that has not
		// itself been cloned
		$requestedData = $data;
		do {
			$tail = $data->currentCloneElement;
			$node = $tail->userData;
			$data = $node->snData;
		} while ( $data->currentCloneElement );
		// Remember the end of the chain on the element we were asked about,
		// so the next lookup is a single hop
		$requestedData->currentCloneElement = $tail;
	} elseif ( $data->childPElement ) {
		$node = $data->childPElement->userData;
	}

	return [ $node, $node ];
}
/**
 * Create an mw:p-wrap element under the given node and register it as
 * that node's current p-wrapper.
 *
 * @param SerializerNode $parent The node that becomes the wrap base
 * @param int $sourceStart
 * @return SerializerNode The serializer node of the new p-wrapper
 */
private function insertPWrapper( SerializerNode $parent, $sourceStart ) {
	$wrapperData = new RemexMungerData;
	$wrapperData->isPWrapper = true;
	$wrapperData->wrapBaseNode = $parent;

	$wrapper = new Element( HTMLData::NS_HTML, 'mw:p-wrap', new PlainAttributes );
	// Inserting downstream is what gives the element its userData node
	$this->serializer->insertElement(
		TreeBuilder::UNDER, $parent, $wrapper, false, $sourceStart, 0
	);
	$wrapperNode = $wrapper->userData;
	$wrapperNode->snData = $wrapperData;

	// Divert subsequent insertions under $parent into the wrapper
	$parent->snData->childPElement = $wrapper;
	return $wrapperNode;
}
/**
 * Insert text, opening a p-wrapper or splitting the tag stack first when
 * the text is being appended under a node that requires it.
 *
 * @param int $preposition
 * @param Element|SerializerNode|null $refElement
 * @param string $text
 * @param int $start
 * @param int $length
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function characters( $preposition, $refElement, $text, $start, $length,
	$sourceStart, $sourceLength
) {
	// The text has content if the run of whitespace starting at $start does
	// not cover the whole [ $start, $start + $length ) window
	$whitespaceLength = strspn( $text, "\t\n\f\r ", $start, $length );
	$hasContent = $whitespaceLength !== $length;

	[ $parent, $target ] = $this->getParentForInsert( $preposition, $refElement );
	$targetData = $parent->snData;

	if ( $preposition === TreeBuilder::UNDER ) {
		if ( $hasContent && $targetData->needsPWrapping ) {
			// Bare text directly under body/blockquote gets a p-wrapper
			$target = $this->insertPWrapper( $target, $sourceStart );
			$targetData = $target->snData;
		} elseif ( $targetData->isSplittable && !$targetData->ancestorPNode ) {
			// Splittable parent in block mode: split the tag stack
			$target = $this->splitTagStack( $target, true, $sourceStart );
			$targetData = $target->snData;
		}
	}

	if ( $hasContent ) {
		// Non-whitespace text makes the receiving node non-blank
		$targetData->nonblankNodeCount++;
	}

	$this->serializer->characters(
		$preposition, $target, $text, $start, $length, $sourceStart, $sourceLength
	);
}
/**
 * Send a message to wfDebug() with an "[RCM]" prefix, if tracing was
 * enabled in the constructor.
 *
 * @param string $msg
 */
private function trace( $msg ) {
	if ( !$this->trace ) {
		return;
	}
	wfDebug( "[RCM] $msg" );
}
/**
 * Insert or reparent an element, creating p-wrappers or splitting the tag
 * stack where necessary.
 *
 * The parent at the insertion location may be:
 *
 *   - A: A body or blockquote (!!needsPWrapping)
 *   - B: A p-wrapper (!!isPWrapper)
 *   - C: A descendant of a p-wrapper (!!ancestorPNode)
 *     - CS: With splittable formatting elements in the stack region up to
 *       the p-wrapper
 *     - CU: With one or more unsplittable elements in the stack region up
 *       to the p-wrapper
 *   - D: Not a descendant of a p-wrapper (!ancestorPNode)
 *     - DS: With splittable formatting elements in the stack region up to
 *       the body or blockquote
 *     - DU: With one or more unsplittable elements in the stack region up
 *       to the body or blockquote
 *
 * The element being inserted may be:
 *   - b: block
 *   - i: inline
 *
 * The combinations are handled as follows:
 *
 *   - A/i: Create a p-wrapper, insert under it
 *   - A/b: Insert as normal
 *   - B/i: Insert as normal
 *   - B/b: Close the p-wrapper, insert under the body/blockquote (wrap
 *     base) instead
 *   - C/i: Insert as normal
 *   - CS/b: Split the tag stack, insert the block under cloned formatting
 *     elements which have the wrap base (the parent of the p-wrap) as
 *     their ultimate parent.
 *   - CU/b: Disable the p-wrap, by reparenting the currently open child
 *     of the p-wrap under the p-wrap's parent. Then insert the block as
 *     normal.
 *   - D/b: Insert as normal
 *   - DS/i: Split the tag stack, creating a new p-wrapper as the ultimate
 *     parent of the formatting elements thus cloned. The parent of the
 *     p-wrapper is the body or blockquote.
 *   - DU/i: Insert as normal
 *
 * FIXME: fostering ($preposition == BEFORE) is mostly done by inserting as
 * normal, the full algorithm is not followed.
 *
 * @param int $preposition
 * @param Element|SerializerNode|null $refElement
 * @param Element $element
 * @param bool $void
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function insertElement( $preposition, $refElement, Element $element, $void,
	$sourceStart, $sourceLength
) {
	[ $target, $insertRef ] = $this->getParentForInsert( $preposition, $refElement );
	$targetData = $target->snData;
	$tagName = $element->htmlName;

	$isInline = isset( self::ONLY_INLINE_ELEMENTS[$tagName] );
	$isUnder = ( $preposition === TreeBuilder::UNDER );

	if ( isset( self::METADATA_ELEMENTS[$tagName] )
		&& !self::isTableOfContentsMarker( $element )
	) {
		// Metadata elements are allowed in both inline and block contexts,
		// so the insertion point is left alone.
		$this->trace( 'insert metadata' );
	} elseif ( $isUnder && $targetData->isPWrapper && !$isInline ) {
		// [B/b] A block arriving under a p-wrapper: close the wrapper and
		// insert into the wrapper's parent instead.
		$this->trace( 'insert B/b' );
		$target = $this->serializer->getParentNode( $target );
		$targetData = $target->snData;
		$targetData->childPElement = null;
		$insertRef = $refElement->userData;
	} elseif ( $isUnder && $targetData->isSplittable
		&& (bool)$targetData->ancestorPNode !== $isInline
	) {
		// [CS/b, DS/i] The parent is splittable and either a block is
		// arriving under a p-wrapper or an inline is arriving in block
		// context: split the tag stack.
		$this->trace( $isInline ? 'insert DS/i' : 'insert CS/b' );
		$insertRef = $this->splitTagStack( $insertRef, $isInline, $sourceStart );
		$target = $insertRef;
		$targetData = $target->snData;
	} elseif ( $isUnder && $targetData->needsPWrapping && $isInline ) {
		// [A/i] An inline arriving directly in body/blockquote needs a
		// p-wrapper to live in.
		$this->trace( 'insert A/i' );
		$insertRef = $this->insertPWrapper( $insertRef, $sourceStart );
		$target = $insertRef;
		$targetData = $target->snData;
	} elseif ( $targetData->ancestorPNode && !$isInline ) {
		// [CU/b] A block arriving while a p-wrap ancestor is still present
		// (the stack could not be split above): disable that p-wrap.
		$this->trace( 'insert CU/b' );
		$this->disablePWrapper( $target, $sourceStart );
	} else {
		// [A/b, B/i, C/i, D/b, DU/i]
		$this->trace( 'insert normal' );
	}

	// A node that has an element child counts as non-blank
	$targetData->nonblankNodeCount++;

	// The downstream insertion is what initialises $element->userData
	$this->serializer->insertElement(
		$preposition, $insertRef, $element, $void, $sourceStart, $sourceLength
	);

	// Make sure the new node carries munger data
	$elementNode = $element->userData;
	if ( !$elementNode->snData ) {
		$elementNode->snData = new RemexMungerData;
	}
	$elementData = $elementNode->snData;

	$parentCanSplit = $targetData->isPWrapper || $targetData->isSplittable;
	if ( $parentCanSplit && isset( self::FORMATTING_ELEMENTS[$tagName] ) ) {
		$elementData->isSplittable = true;
	}

	// Propagate the nearest p-wrapper, if any
	if ( $targetData->isPWrapper ) {
		$elementData->ancestorPNode = $target;
	} elseif ( $targetData->ancestorPNode ) {
		$elementData->ancestorPNode = $targetData->ancestorPNode;
	}

	// Propagate the wrap base, if any
	if ( $targetData->wrapBaseNode ) {
		$elementData->wrapBaseNode = $targetData->wrapBaseNode;
	} elseif ( $targetData->needsPWrapping ) {
		$elementData->wrapBaseNode = $target;
	}

	if ( in_array( $tagName, [ 'body', 'blockquote', 'html' ], true ) ) {
		$elementData->needsPWrapping = true;
	}
}
/**
 * Clone the nodes in a range of the stack and return the new parent.
 *
 * The range runs from $parentNode up to (but not including) its p-wrapper
 * ancestor if it has one, otherwise up to its wrap base. The clones are
 * attached, outermost first, under a fresh p-wrapper ($inline true) or
 * directly under the wrap base ($inline false). Originals that had no
 * non-blank content are removed afterwards.
 *
 * @param SerializerNode $parentNode
 * @param bool $inline
 * @param int $pos The source position
 * @return SerializerNode The innermost clone, or the attachment point
 *   itself if the range was empty
 * @throws InvalidArgumentException If the serializer's root node is
 *   reached before the end of the clone range
 */
private function splitTagStack( SerializerNode $parentNode, $inline, $pos ) {
	$startData = $parentNode->snData;
	$wrapBase = $startData->wrapBaseNode;
	$pWrap = $startData->ancestorPNode;
	// Stop at the open p-wrapper when there is one, else at the wrap base
	$stopAt = $pWrap ?: $wrapBase;

	$root = $this->serializer->getRootNode();
	$chain = [];
	$emptyNodes = [];
	for ( $cursor = $parentNode; $cursor !== $stopAt; $cursor = $above ) {
		$above = $this->serializer->getParentNode( $cursor );
		if ( $above === $root ) {
			throw new InvalidArgumentException( 'Did not find end of clone range' );
		}
		$chain[] = $cursor;
		if ( $cursor->snData->nonblankNodeCount === 0 ) {
			// This node will be removed, so its parent loses a non-blank child
			$emptyNodes[] = $cursor;
			$above->snData->nonblankNodeCount--;
		}
	}

	if ( $inline ) {
		$pWrap = $this->insertPWrapper( $wrapBase, $pos );
		$attachTo = $pWrap;
	} else {
		if ( $pWrap ) {
			// End the p-wrap which was open, cancel the diversion
			$wrapBase->snData->childPElement = null;
		}
		$pWrap = null;
		$attachTo = $wrapBase;
	}

	// $chain is innermost-first; recreate it outermost-first
	foreach ( array_reverse( $chain ) as $original ) {
		$originalData = $original->snData;
		$clone = new Element( $original->namespace, $original->name, $original->attrs );
		$this->serializer->insertElement(
			TreeBuilder::UNDER, $attachTo, $clone, false, $pos, 0
		);
		$originalData->currentCloneElement = $clone;

		$cloneNode = $clone->userData;
		$cloneData = new RemexMungerData;
		$cloneNode->snData = $cloneData;
		if ( $pWrap ) {
			$cloneData->ancestorPNode = $pWrap;
		}
		$cloneData->isSplittable = true;
		$cloneData->wrapBaseNode = $wrapBase;
		$cloneData->isPWrapper = $originalData->isPWrapper;

		$attachTo->snData->nonblankNodeCount++;
		$attachTo = $cloneNode;
	}

	foreach ( $emptyNodes as $emptyNode ) {
		// removeNode() takes an Element, so wrap the node in a stand-in
		$standIn = new Element( $emptyNode->namespace, $emptyNode->name, $emptyNode->attrs );
		$standIn->userData = $emptyNode;
		$this->serializer->removeNode( $standIn, $pos );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $attachTo;
}
/**
 * Locate the ancestor of $node that is a direct child of the p-wrapper,
 * and reparent it under the p-wrapper's parent so that it ends up after
 * the p-wrapper.
 *
 * Nothing is done if the p-wrapper is not the last child of its parent.
 *
 * @param SerializerNode $node
 * @param int $sourceStart
 */
private function disablePWrapper( SerializerNode $node, $sourceStart ) {
	$pWrapNode = $node->snData->ancestorPNode;
	$wrapParent = $this->serializer->getParentNode( $pWrapNode );
	if ( $this->serializer->getLastChild( $wrapParent ) !== $pWrapNode ) {
		// Fostering or something? Abort!
		return;
	}

	// Climb from $node to the child of the p-wrapper, detaching every
	// node on the way from the p-wrapper
	$victim = $node;
	while ( true ) {
		$victim->snData->ancestorPNode = null;
		$above = $this->serializer->getParentNode( $victim );
		if ( $above === $pWrapNode ) {
			break;
		}
		$victim = $above;
	}

	// insertElement() takes an Element, so wrap the node in a stand-in
	$standIn = new Element( $victim->namespace, $victim->name, $victim->attrs );
	$standIn->userData = $victim;

	// Reparent under the p-wrapper's parent
	$this->serializer->insertElement(
		TreeBuilder::UNDER, $wrapParent, $standIn, false, $sourceStart, 0
	);

	// The p-wrapper has lost a non-blank child
	$pWrapNode->snData->nonblankNodeCount--;

	// Cancel the diversion so that no more elements are inserted under this p-wrap
	$wrapParent->snData->childPElement = null;
}
/**
 * End an element. If it still has an open p-wrapper child, that wrapper
 * is ended first. The element's munger data and node link are then
 * cleared.
 *
 * @param Element $element
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function endTag( Element $element, $sourceStart, $sourceLength ) {
	$openPWrap = $element->userData->snData->childPElement;
	if ( $openPWrap ) {
		$this->endTag( $openPWrap, $sourceStart, 0 );
	}

	$this->serializer->endTag( $element, $sourceStart, $sourceLength );

	// Drop the references held through this element
	$element->userData->snData = null;
	$element->userData = null;
}
/**
 * Forward a doctype event to the serializer unchanged.
 *
 * @param string $name
 * @param string $public
 * @param string $system
 * @param int $quirks
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
	$this->serializer->doctype(
		$name, $public, $system, $quirks, $sourceStart, $sourceLength
	);
}
/**
 * Insert a comment at the location chosen by getParentForInsert(), so
 * that clone chains and open p-wrappers are respected.
 *
 * @param int $preposition
 * @param Element|SerializerNode|null $refElement
 * @param string $text
 * @param int $sourceStart
 * @param int $sourceLength
 */
public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
	$location = $this->getParentForInsert( $preposition, $refElement );
	// Only the reference node (second entry) is needed here
	$target = $location[1];
	$this->serializer->comment( $preposition, $target, $text, $sourceStart, $sourceLength );
}
/**
 * Forward a parse error to the serializer unchanged.
 *
 * @param string $text
 * @param int $pos
 */
public function error( $text, $pos ) {
	$this->serializer->error( $text, $pos );
}
/**
 * Forward an attribute merge to the serializer unchanged.
 *
 * @param Element $element
 * @param Attributes $attrs
 * @param int $sourceStart
 */
public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
	$this->serializer->mergeAttributes( $element, $attrs, $sourceStart );
}
/**
 * Forward a node removal to the serializer unchanged.
 *
 * @param Element $element
 * @param int $sourceStart
 */
public function removeNode( Element $element, $sourceStart ) {
	$this->serializer->removeNode( $element, $sourceStart );
}
/**
 * Move all children of $element under $newParent, which is itself
 * inserted as the only child of $element (or of $element's open
 * p-wrapper, if it has one).
 *
 * @param Element $element
 * @param Element $newParent
 * @param int $sourceStart
 */
public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
	$node = $element->userData;
	$openPWrap = $node->snData->childPElement;
	if ( $openPWrap ) {
		// Reparent under the p-wrapper instead, so that e.g.
		//   <blockquote><mw:p-wrap>...</mw:p-wrap></blockquote>
		// becomes
		//   <blockquote><mw:p-wrap><i>...</i></mw:p-wrap></blockquote>
		//
		// The formatting element should not be the parent of the p-wrap.
		// Without this special case, the insertElement() of the <i> below
		// would be diverted into the p-wrapper, causing infinite recursion
		// (T178632)
		$this->reparentChildren( $openPWrap, $newParent, $sourceStart );
		return;
	}

	// Detach the children before inserting the new parent, then hand them over
	$movedChildren = $node->children;
	$node->children = [];
	$this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );

	$adopter = $newParent->userData;
	$adopterId = $adopter->id;
	foreach ( $movedChildren as $child ) {
		if ( !is_object( $child ) ) {
			// Non-object children have no parent link to update
			continue;
		}
		$this->trace( "reparent <{$child->name}>" );
		$child->parentId = $adopterId;
	}
	$adopter->children = $movedChildren;
}
/**
 * Helper function to match the Parser::TOC_PLACEHOLDER.
 * Note that Parsoid's version of this placeholder might
 * include additional attributes.
 *
 * Declared static because it does not use any instance state and is
 * invoked as self::isTableOfContentsMarker() from insertElement().
 *
 * @param Element $element
 * @return bool If the given element is a Parser::TOC_PLACEHOLDER
 */
private static function isTableOfContentsMarker( Element $element ): bool {
	// Keep this in sync with Parser::TOC_PLACEHOLDER
	return (
		$element->htmlName === 'meta' &&
		isset( $element->attrs['property'] ) &&
		$element->attrs['property'] === 'mw:PageProp/toc'
	);
}