3 * Performs transformations of HTML by wrapping around libxml2 and working
4 * around its countless bugs.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
// Selectors queued for removal: remove() appends to this list and parseItemsToRemove() iterates over it.
30 private $itemsToRemove = array();
// Tag-name patterns queued by flatten(); getText() joins them with '|' into one regex and strips matching tags.
31 private $elementsToFlatten = array();
// When truthy, parseItemsToRemove() also schedules img, audio and video tags for removal.
32 protected $removeMedia = false;
37 * @param string $html Text to process
39 public function __construct( $html ) {
44 * Turns a chunk of HTML into a proper document
48 public static function wrapHTML( $html ) {
// The fragment is concatenated verbatim between fixed doctype/html/head/body tags; nothing is escaped or validated here.
49 return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
53 * Override this in descendant class to modify HTML after it has been converted from DOM tree
54 * @param string $html HTML to process
55 * @return string Processed HTML
57 protected function onHtmlReady( $html ) {
62 * @return DOMDocument DOM to manipulate
64 public function getDoc() {
// Encodes non-ASCII characters as entities, then parses the HTML into $this->doc.
// NOTE(review): the statements that guard, close and return from this method are not
// visible in this listing — confirm the overall control flow against the full file.
66 // DOMDocument::loadHTML apparently isn't very good with encodings, so
67 // convert input to ASCII by encoding everything above 128 as entities.
68 if ( function_exists( 'mb_convert_encoding' ) ) {
69 $html = mb_convert_encoding( $this->html
, 'HTML-ENTITIES', 'UTF-8' );
// Presumably the fallback used when mbstring is unavailable (the branch keyword is not visible here):
// every character from U+0080 upwards becomes a decimal numeric entity.
71 $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
72 return '&#' . UtfNormal\Utils
::utf8ToCodepoint( $m[0] ) . ';';
76 // Workaround for bug that caused spaces before references
77 // to disappear during processing: https://phabricator.wikimedia.org/T55086
78 // TODO: Please replace with a better fix if one can be found.
// NOTE(review): as written, the search and replacement strings appear identical (' <'), so this
// call changes nothing. The replacement was probably an entity-encoded space (e.g. '&#32;<')
// that was decoded somewhere along the way — confirm against upstream before relying on it.
79 $html = str_replace( ' <', ' <', $html );
// Collect libxml parse errors internally instead of emitting PHP warnings while loading.
81 libxml_use_internal_errors( true );
// Called without arguments this disables external entity loading and returns the previous
// setting, which is restored after loadHTML() below.
// NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 — confirm the target PHP version.
82 $loader = libxml_disable_entity_loader();
83 $this->doc
= new DOMDocument();
84 $this->doc
->strictErrorChecking
= false;
85 $this->doc
->loadHTML( $html );
86 libxml_disable_entity_loader( $loader );
// NOTE(review): internal error collection is switched off unconditionally rather than
// restored to whatever value it had before this method ran.
87 libxml_use_internal_errors( false );
88 $this->doc
->encoding
= 'UTF-8';
94 * Sets whether images/videos/sounds should be removed from output
97 public function setRemoveMedia( $flag = true ) {
// Only records the flag; parseItemsToRemove() consults it when building the removal lists.
98 $this->removeMedia
= $flag;
102 * Adds one or more selectors of content to remove. A subset of CSS selector
103 * syntax is supported:
110 * @param array|string $selectors Selector(s) of stuff to remove
112 public function remove( $selectors ) {
// The (array) cast lets callers pass a single selector string; new entries are appended to the existing queue.
113 $this->itemsToRemove
= array_merge( $this->itemsToRemove
, (array)$selectors );
117 * Adds one or more element names to the list to flatten (remove tag, but not its content)
118 * Can accept undelimited regexes
120 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
121 * so should not be relied on for HTML markup security measures.
123 * @param array|string $elements Name(s) of tag(s) to flatten
125 public function flatten( $elements ) {
// The (array) cast lets callers pass a single name/pattern string; entries are appended to the existing list.
// getText() later inserts these entries unescaped into a regular expression.
126 $this->elementsToFlatten
= array_merge( $this->elementsToFlatten
, (array)$elements );
130 * Instructs the formatter to flatten all tags
132 public function flattenAllTags() {
// Registers one pattern matching any alphanumeric tag name, optionally prefixed with '?' or '!'.
// getText() applies the pattern with the "i" flag, so the lowercase range also covers uppercase tag names.
133 $this->flatten( '[?!]?[a-z0-9]+' );
137 * Removes content we've chosen to remove. The text of the removed elements can be
138 * extracted with the getText method.
139 * @return array Array of removed DOMElements
141 public function filterContent() {
// Removes every element matched by the queued selectors, handling each selector category
// (TAG, ID, CLASS, TAG_CLASS) in turn and accumulating the removed nodes in $removed.
// NOTE(review): several closing braces and the return statement are not visible in this listing.
142 $removals = $this->parseItemsToRemove();
144 // Bail out early if nothing to do
// The reducer stays truthy only while every selector list seen so far is empty.
145 if ( array_reduce( $removals,
146 function ( $carry, $item ) {
147 return $carry && !$item;
154 $doc = $this->getDoc();
158 // You can't remove DOMNodes from a DOMNodeList as you're iterating
159 // over them in a foreach loop. It will seemingly leave the internal
160 // iterator on the foreach out of whack and results will be quite
161 // strange. Though, making a queue of items to remove seems to work.
162 $domElemsToRemove = array();
163 foreach ( $removals['TAG'] as $tagToRemove ) {
164 $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
165 foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
166 if ( $tagToRemoveNode ) {
167 $domElemsToRemove[] = $tagToRemoveNode;
171 $removed = $this->removeElements( $domElemsToRemove );
173 // Elements with named IDs
174 $domElemsToRemove = array();
175 foreach ( $removals['ID'] as $itemToRemove ) {
176 $itemToRemoveNode = $doc->getElementById( $itemToRemove );
177 if ( $itemToRemoveNode ) {
178 $domElemsToRemove[] = $itemToRemoveNode;
181 $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// Elements selected by a bare class name.
184 $domElemsToRemove = array();
185 $xpath = new DOMXPath( $doc );
186 foreach ( $removals['CLASS'] as $classToRemove ) {
// contains() is only a substring pre-filter; the preg_match() below narrows it to word-boundary matches.
// NOTE(review): $classToRemove is interpolated unescaped into the XPath string here and into the
// regex below (no preg_quote). Because \b also matches next to '-', a selector "foo" would
// match an element whose class is "foo-bar" — confirm this is acceptable.
187 $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
189 /** @var $element DOMElement */
190 foreach ( $elements as $element ) {
191 $classes = $element->getAttribute( 'class' );
192 if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode
) {
193 $domElemsToRemove[] = $element;
197 $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
199 // Tags with CSS Classes
200 foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
// Only the first two dot-separated parts are used: tag name, then class.
201 $parts = explode( '.', $classToRemove );
// NOTE(review): @class="..." is an exact string comparison, so an element that carries any
// additional class is not matched, unlike the bare-class handling above.
203 $elements = $xpath->query(
204 '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
206 $removed = array_merge( $removed, $this->removeElements( $elements ) );
213 * Removes a list of elements from DOMDocument
214 * @param array|DOMNodeList $elements
215 * @return array Array of removed elements
217 private function removeElements( $elements ) {
// Detaches the given nodes from their parents.
// NOTE(review): the statements that build $list and return it are not visible in this listing.
// A DOMNodeList argument is walked separately first, presumably to copy it into a plain array
// before any node is removed — confirm against the full file.
219 if ( $elements instanceof DOMNodeList
) {
221 foreach ( $elements as $element ) {
225 /** @var $element DOMElement */
226 foreach ( $list as $element ) {
// Skip nodes that are already detached (no parent).
227 if ( $element->parentNode
) {
228 $element->parentNode
->removeChild( $element );
235 * libxml in its usual pointlessness converts many chars to entities - this function
236 * performs a reverse conversion
237 * @param string $html
240 private function fixLibXML( $html ) {
// Applies a fixed replacement table to the serialized HTML, then decodes HTML entities back to UTF-8.
// The table is held in a static variable, so it is built only on the first call.
241 static $replacements;
242 if ( !$replacements ) {
243 // We don't include rules like '"' => '&quot;' because entities had already been
244 // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
// NOTE(review): the comment above says a '"' => '&quot;' rule is deliberately absent, yet the first
// entry below is exactly that rule. The keys and values look as if one level of entity encoding
// was lost (e.g. '&quot;' => '&amp;quot;'); as written, the entities produced here would be decoded
// again by the conversion further down. Confirm the table against upstream.
245 $replacements = new ReplacementArray( array(
246 '"' => '&quot;',
247 '&' => '&amp;',
248 '<' => '&lt;',
249 '>' => '&gt;',
252 $html = $replacements->replace( $html );
254 if ( function_exists( 'mb_convert_encoding' ) ) {
255 // Just in case the conversion in getDoc() above used named
256 // entities that aren't known to html_entity_decode().
257 $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
// Presumably the alternative used when mbstring is unavailable (the branch keyword is not visible here).
259 $html = html_entity_decode( $html, ENT_COMPAT
, 'utf-8' );
265 * Performs final transformations and returns resulting HTML. Note that if you want to call this
266 * both without an element and with an element you should call it without an element first. If you
267 * specify the $element in the method it'll change the underlying dom and you won't be able to get
270 * @param DOMElement|string|null $element ID of element to get HTML from or
271 * false to get it from the whole tree
272 * @return string Processed HTML
274 public function getText( $element = null ) {
// Serializes the document (optionally reduced to a single element) and post-processes the markup.
// NOTE(review): several lines of this method (closing braces, guards, the return) are not visible in this listing.
// A non-null argument that is not a DOMElement is treated as an element ID and looked up.
277 if ( $element !== null && !( $element instanceof DOMElement
) ) {
278 $element = $this->doc
->getElementById( $element );
281 $body = $this->doc
->getElementsByTagName( 'body' )->item( 0 );
// Snapshot the children into a plain array first, then remove them from that copy,
// so the live childNodes list is not modified while it is being iterated.
282 $nodesArray = array();
283 foreach ( $body->childNodes
as $node ) {
284 $nodesArray[] = $node;
286 foreach ( $nodesArray as $nodeArray ) {
287 $body->removeChild( $nodeArray );
// Re-attach only the requested element so it becomes the sole content of <body>.
289 $body->appendChild( $element );
291 $html = $this->doc
->saveHTML();
// Spelled fixLibXml here but declared as fixLibXML; PHP method names are case-insensitive, so the call resolves.
293 $html = $this->fixLibXml( $html );
294 if ( wfIsWindows() ) {
295 // Cleanup for CRLF misprocessing of unknown origin on Windows.
296 // If this error continues in the future, please track it down in the
297 // XML code paths if possible and fix there.
// NOTE(review): as written this deletes every space character from the output on Windows, which
// does not match the CRLF cleanup described above. The search string was probably an
// entity-encoded carriage return (e.g. '&#13;') that got decoded — confirm against upstream.
298 $html = str_replace( ' ', '', $html );
303 // Remove stuff added by wrapHTML()
// Besides the wrapper (everything up to <body> and from </body> on), this also strips every HTML comment.
304 $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
305 $html = $this->onHtmlReady( $html );
307 if ( $this->elementsToFlatten
) {
// The registered patterns are joined as alternatives and inserted unescaped into the regex;
// opening and closing tags that match are deleted while their content is kept.
308 $elements = implode( '|', $this->elementsToFlatten
);
309 $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
316 * Helper function for parseItemsToRemove(). This function extracts the selector type
317 * and the raw name of a selector from a CSS-style selector string and assigns those
318 * values to parameters passed by reference. For example, if given '#toc' as the
319 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
320 * @param string $selector CSS selector to parse
321 * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
322 * @param string $rawName The raw name of the selector
323 * @return bool Whether the selector was successfully recognised
324 * @throws MWException
326 protected function parseSelector( $selector, &$type, &$rawName ) {
// Classifies the selector by its shape and writes the bare name into $rawName.
// NOTE(review): the assignments to $type and the return statement are not visible in this listing;
// the branch meanings noted below are inferred from the conditions — confirm against the full file.
// Leading '.': presumably a class selector; the dot is dropped from the raw name.
327 if ( strpos( $selector, '.' ) === 0 ) {
329 $rawName = substr( $selector, 1 );
// Leading '#': presumably an ID selector; the hash is dropped from the raw name.
330 } elseif ( strpos( $selector, '#' ) === 0 ) {
332 $rawName = substr( $selector, 1 );
// A dot somewhere after the first character: presumably tag.class; the selector is kept whole.
// The "!== 0" test is redundant here because the first branch already handled a leading dot.
333 } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
335 $rawName = $selector;
// No square brackets at all: presumably a plain tag name.
336 } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
338 $rawName = $selector;
// Anything else (i.e. a selector containing '[' or ']') is rejected.
340 throw new MWException( __METHOD__
. "(): unrecognized selector '$selector'" );
347 * Transforms CSS-style selectors into an internal representation suitable for
348 * processing by filterContent()
351 protected function parseItemsToRemove() {
356 'TAG_CLASS' => array(),
359 foreach ( $this->itemsToRemove
as $itemToRemove ) {
362 if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
363 $removals[$type][] = $rawName;
367 if ( $this->removeMedia
) {
368 $removals['TAG'][] = 'img';
369 $removals['TAG'][] = 'audio';
370 $removals['TAG'][] = 'video';