From 5dd13dd812318592e27352de69a47f9c2416afeb Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 4 Sep 2007 12:47:27 +0000 Subject: [PATCH] Refactor FilterManager into smaller chunks, entity files in XIncluded documents are now properly registered. git-svn-id: http://htmlpurifier.org/svnroot@1412 48356398-32a2-884e-a903-53898d9a118a --- TODO.txt | 3 - XHTMLCompiler/FilterManager.php | 191 +++++++++++++++++++++++----------------- 2 files changed, 111 insertions(+), 83 deletions(-) diff --git a/TODO.txt b/TODO.txt index 5abe7b3..eb69b8e 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,5 @@ ===== TODO ============================================= XHTML Compiler == -Bugs -- Dependency finder doesn't check XIncluded XML file's entity files - Architectural - Genericize Page code so that source can be one of many extensions (the extension dictates a TextFilter to use?) diff --git a/XHTMLCompiler/FilterManager.php b/XHTMLCompiler/FilterManager.php index e8e0200..c6d0ecd 100644 --- a/XHTMLCompiler/FilterManager.php +++ b/XHTMLCompiler/FilterManager.php @@ -100,7 +100,7 @@ class XHTMLCompiler_FilterManager } /** - * Accepts a page's text and processes it. + * Accepts a page's text (usually XHTML) and processes it. * @param $text String text to be processed * @param $page XHTMLCompiler_Page representing currently processed page */ @@ -111,61 +111,19 @@ class XHTMLCompiler_FilterManager $text = $filter->process($text, $page, $this); } - // setup XML catalog to improve speed - $catalog = str_replace(array(' ', '\\'), array('%20', '/'), - dirname(__FILE__)) . '/../catalog/catalog.xml'; - if ($catalog[1] == ':') $catalog = substr($catalog, 2); // remove drive - putenv('XML_CATALOG_FILES=' . 
$catalog); - - // configure DOMDocument - $dom = new DOMDocument(); - $dom->preserveWhiteSpace = false; - $dom->formatOutput = true; - $dom->resolveExternals = true; + // generate the DOM + $this->setupXMLCatalog(); + $dom = $this->createDOM($text); - // todo: somehow, collect information on which entity files - // are being added to the document, and add to xc-deps. - $dom->substituteEntities = true; // allows for custom entities too! - - $dom->loadXML($text); - - $internal_subset = $dom->doctype->internalSubset; - if ($internal_subset) { - // there are some entities that need to be registered to - // the dependency list. Match ones that declare SYSTEM - // '' - preg_match_all( - '//s', - $internal_subset, - $matches - ); - foreach ($matches[1] as $filename) { - // $filename will always be relative to web root, so - // no munging necessary - $this->addDependency($filename); - } - } + $this->analyzeInternalSubset($dom); + // validate the document to force the entities to be resolved, + // we don't actually care about the errors set_error_handler(array($this, 'muteErrorHandler')); $dom->validate(); restore_error_handler(); - $dom->encoding = 'UTF-8'; // override document encoding - - // XInclude - // todo: - // * factor into a DOMFilter - // * add xincludes to the dependency list - $xpath = new DOMXPath($dom); - $xpath->registerNamespace('xi', $ns = 'http://www.w3.org/2001/XInclude'); - $nodes = $xpath->query('//xi:include'); - foreach ($nodes as $node) { - if (! $node instanceof DOMElement) continue; - if (! 
$filename = $node->getAttribute('href')) continue; - // doesn't handle second-level includes - $this->addDependency($filename); - } - // perform includes (we might need to loop to handle nested includes) + $this->analyzeXIncludes($dom); $dom->xinclude(); // run DOM filters @@ -177,50 +135,26 @@ class XHTMLCompiler_FilterManager // translate back to text $text = $dom->saveXML(); - // remove all non-default namespace declarations + // remove all non-default namespace declarations, may change, + // but for now embedded XML namespaces are not cross-browser friendly $text = preg_replace('/ xmlns:.+?=".+?"/', '', $text); + // scrub out custom DTD additions + $text = preg_replace('/(]*?) ?\[[^\]]+\]/', '\1', $text); foreach ($this->postTextFilters as $filter) { $text = $filter->process($text, $page, $this); } // okay, now finally do validation, and let the errors get - // spit out if there are some - // collect parse errors + // spit out if there are some collect parse errors set_error_handler(array($this, 'validationErrorHandler')); $dom->loadXML($text); $status = $dom->validate(); restore_error_handler(); - if (!$status || !empty($this->errors)) { - $body = $dom->getElementsByTagName('body')->item(0); - if (!$body) { - $dom->appendChild($html = $dom->createElement('html')); - $html->appendChild($body = $dom->createElement('body')); - } - $warning = $dom->createElement('div'); - $warning->setAttribute('class', 'warning'); - $warning->appendChild($dom->createElement('h2', 'Warning: Errors')); - $warning->appendChild($dom->createElement('p', 'This document has validation errors:')); - $list = $dom->createElement('ul'); - foreach ($this->errors as $error) { - // strip-tags removes HTML tags to make the plaintext output - // more friendly, IS NOT for security reasons - $list->appendChild($dom->createElement('li', strip_tags($error))); - } - $warning->appendChild($list); - $body->insertBefore($warning, $body->childNodes->item(0)); + $this->buildErrors($dom); $text = 
$dom->saveXML(); } - // scrub out XML declaration for Internet Explorer - // disabled, we're going to try serving application/xhtml+xml - // to browsers that support it - /*$text = str_replace(''."\n", '', $text);*/ - - // scrub out custom DTD additions, they should have been - // resolved already - $text = preg_replace('/(]*?) ?\[[^\]]+\]/', '\1', $text); - return $text; } @@ -236,6 +170,103 @@ class XHTMLCompiler_FilterManager */ public function muteErrorHandler($n, $t) {} + /** + * Sets up an XML catalog to speed up entity resolution + */ + public function setupXMLCatalog() { + $catalog = str_replace(array(' ', '\\'), array('%20', '/'), + dirname(__FILE__)) . '/../catalog/catalog.xml'; + if ($catalog[1] == ':') $catalog = substr($catalog, 2); // remove drive + putenv('XML_CATALOG_FILES=' . $catalog); + } + + /** + * Creates a DOM with a reasonable default configuration + * @param string $text XML to load DOM with + */ + public function createDOM($text = false) { + $dom = new DOMDocument(); + $dom->preserveWhiteSpace = false; + $dom->formatOutput = true; + $dom->resolveExternals = true; + + // todo: somehow, collect information on which entity files + // are being added to the document, and add to xc-deps. + $dom->substituteEntities = true; // allows for custom entities too! + + if ($text !== false) $dom->loadXML($text); + + return $dom; + } + + /** + * Analyzes the internal subset of a DOM, registering any file + * entity definitions as dependencies + */ + public function analyzeInternalSubset($dom) { + if (empty($dom->doctype)) return; + $internal_subset = $dom->doctype->internalSubset; + if ($internal_subset) { + // there are some entities that need to be registered to + // the dependency list.
Match ones that declare SYSTEM + // '' + preg_match_all( + '//s', + $internal_subset, + $matches + ); + foreach ($matches[1] as $filename) { + // $filename will always be relative to web root, so + // no munging necessary + $this->addDependency($filename); + } + } + } + + /** + * Analyzes a document's XIncludes and registers necessary dependencies. + * Make sure you call this before calling $dom->xinclude + * @param DOMDocument $dom to process + * @todo Factor into a DOMFilter + * @todo Handle arbitrary nestings of includes + */ + public function analyzeXIncludes($dom) { + $xpath = new DOMXPath($dom); + $xpath->registerNamespace('xi', $ns = 'http://www.w3.org/2001/XInclude'); + $nodes = $xpath->query('//xi:include'); + foreach ($nodes as $node) { + if (! $node instanceof DOMElement) continue; + if (! $filename = $node->getAttribute('href')) continue; + $this->addDependency($filename); + $sub_dom = new DOMDocument(); + $sub_dom->load($filename); + $this->analyzeInternalSubset($sub_dom); + } + } + + /** + * Adds validation errors to the output document as a message + */ + public function buildErrors($dom) { + $body = $dom->getElementsByTagName('body')->item(0); + if (!$body) { + $dom->appendChild($html = $dom->createElement('html')); + $html->appendChild($body = $dom->createElement('body')); + } + $warning = $dom->createElement('div'); + $warning->setAttribute('class', 'warning'); + $warning->appendChild($dom->createElement('h2', 'Warning: Errors')); + $warning->appendChild($dom->createElement('p', 'This document has validation errors:')); + $list = $dom->createElement('ul'); + foreach ($this->errors as $error) { + // strip-tags removes HTML tags to make the plaintext output + // more friendly, IS NOT for security reasons + $list->appendChild($dom->createElement('li', strip_tags($error))); + } + $warning->appendChild($list); + $body->insertBefore($warning, $body->childNodes->item(0)); + } + } ?> -- 2.11.4.GIT